diff --git a/CMakeLists.txt b/CMakeLists.txt index 9a758008f640..c026d34466d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt
@@ -474,7 +474,6 @@ set(ZIG_STAGE2_SOURCES
 lib/std/os/linux.zig
 lib/std/os/linux.zig
 lib/std/os/linux/IoUring.zig
- lib/std/os/linux/io_uring_sqe.zig
 lib/std/os/linux/x86_64.zig
 lib/std/os/linux/x86_64.zig
 lib/std/os/windows.zig
diff --git a/lib/std/Io/Threaded.zig b/lib/std/Io/Threaded.zig index 11f9b149caf5..129ff15f309b 100644 --- a/lib/std/Io/Threaded.zig +++ b/lib/std/Io/Threaded.zig
@@ -1266,8 +1266,7 @@ fn dirStatPathLinux(
 var path_buffer: [posix.PATH_MAX]u8 = undefined;
 const sub_path_posix = try pathToPosix(sub_path, &path_buffer);
- const flags: u32 = linux.AT.NO_AUTOMOUNT |
- @as(u32, if (!options.follow_symlinks) linux.AT.SYMLINK_NOFOLLOW else 0);
+ const flags: linux.At = .{ .no_automount = true, .symlink_nofollow = !options.follow_symlinks };
 while (true) {
 try t.checkCancel();
@@ -1276,7 +1275,13 @@ fn dirStatPathLinux(
 dir.handle,
 sub_path_posix,
 flags,
- linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME,
+ .{
+ .type = true,
+ .mode = true,
+ .atime = true,
+ .mtime = true,
+ .ctime = true,
+ },
 &statx,
 );
 switch (linux.E.init(rc)) {
@@ -1422,8 +1427,14 @@ fn fileStatLinux(userdata: ?*anyopaque, file: Io.File) Io.File.StatError!Io.File
 const rc = linux.statx(
 file.handle,
 "",
- linux.AT.EMPTY_PATH,
- linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME,
+ .{ .empty_path = true },
+ .{
+ .type = true,
+ .mode = true,
+ .atime = true,
+ .mtime = true,
+ .ctime = true,
+ },
 &statx,
 );
 switch (linux.E.init(rc)) {
@@ -5838,7 +5849,7 @@ pub fn futexWake(ptr: *const std.atomic.Value(u32), max_waiters: u32) void {
 .linux => {
 const linux = std.os.linux;
 switch (linux.E.init(linux.futex_3arg(
- &ptr.raw,
+ ptr,
 .{ .cmd = .WAKE, .private = true },
 @min(max_waiters, std.math.maxInt(i32)),
 ))) {
diff --git a/lib/std/Thread.zig b/lib/std/Thread.zig index 59c6d78166c6..5f85d74b3307 100644 --- a/lib/std/Thread.zig +++ b/lib/std/Thread.zig
@@ -1216,8 +1216,8 @@ const LinuxThreadImpl = struct {
 thread: *ThreadCompletion,
 const ThreadCompletion = struct {
- completion: Completion = Completion.init(.running),
- child_tid: std.atomic.Value(i32) = std.atomic.Value(i32).init(1),
+ completion: Completion = .init(.running),
+ child_tid: std.atomic.Value(i32) = .init(1),
 parent_tid: i32 = undefined,
 mapped: []align(std.heap.page_size_min) u8,
@@ -1662,7 +1662,7 @@ const LinuxThreadImpl = struct {
 if (tid == 0) break;
 switch (linux.E.init(linux.futex_4arg(
- &self.thread.child_tid.raw,
+ @ptrCast(&self.thread.child_tid),
 .{ .cmd = .WAIT, .private = false },
 @bitCast(tid),
 null,
diff --git a/lib/std/Thread/Futex.zig b/lib/std/Thread/Futex.zig index 6c7b58a54093..2e84ebfdfa7e 100644 --- a/lib/std/Thread/Futex.zig +++ b/lib/std/Thread/Futex.zig
@@ -263,7 +263,7 @@ const LinuxImpl = struct {
 }
 const rc = linux.futex_4arg(
- &ptr.raw,
+ ptr,
 .{ .cmd = .WAIT, .private = true },
 expect,
 if (timeout != null) &ts else null,
@@ -285,7 +285,7 @@ const LinuxImpl = struct {
 fn wake(ptr: *const atomic.Value(u32), max_waiters: u32) void {
 const rc = linux.futex_3arg(
- &ptr.raw,
+ ptr,
 .{ .cmd = .WAKE, .private = true },
 @min(max_waiters, std.math.maxInt(i32)),
 );
diff --git a/lib/std/fs.zig b/lib/std/fs.zig index 6db63b6e2b6f..c2784e825084 100644 --- a/lib/std/fs.zig +++ b/lib/std/fs.zig
@@ -187,7 +187,7 @@ pub fn cwd() Dir {
 } else if (native_os == .wasi) {
 return .{ .fd =
std.options.wasiCwd() };
 } else {
- return .{ .fd = posix.AT.FDCWD };
+ return .{ .fd = posix.AT.fdcwd };
 }
 }
diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index 882de1f4584f..6a046f217979 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig
@@ -4,6 +4,7 @@
 //! * Implement all the syscalls in the same way that libc functions will
 //! provide `rename` when only the `renameat` syscall exists.
 const std = @import("../std.zig");
+const atomic = std.atomic;
 const builtin = @import("builtin");
 const assert = std.debug.assert;
 const maxInt = std.math.maxInt;
@@ -22,6 +23,7 @@
 const iovec = std.posix.iovec;
 const iovec_const = std.posix.iovec_const;
 const winsize = std.posix.winsize;
 const ACCMODE = std.posix.ACCMODE;
+pub const IoUring = @import("linux/IoUring.zig");
 test {
 if (builtin.os.tag == .linux) {
@@ -493,6 +495,72 @@ pub const O = switch (native_arch) {
 else => @compileError("missing std.os.linux.O constants for this architecture"),
 };
+/// flags for `pipe2` and `IoUring.pipe`
+/// matches flags in `O` but specific to `pipe2` syscall
+pub const Pipe2 = switch (native_arch) {
+ .x86_64, .x86, .riscv32, .riscv64, .loongarch64, .hexagon, .or1k, .s390x => packed struct(u32) {
+ _: u7 = 0,
+ /// Parameter to `pipe2` selecting notification pipe
+ notification_pipe: bool = false,
+ _9: u3 = 0,
+ nonblock: bool = false,
+ _13: u2 = 0,
+ direct: bool = false,
+ _16: u4 = 0,
+ cloexec: bool = false,
+ _21: u12 = 0,
+ },
+ .aarch64, .aarch64_be, .arm, .armeb, .thumb, .thumbeb, .m68k => packed struct(u32) {
+ _: u7 = 0,
+ /// Parameter to `pipe2` selecting notification pipe
+ notification_pipe: bool = false,
+ _9: u3 = 0,
+ nonblock: bool = false,
+ _13: u4 = 0,
+ direct: bool = false,
+ _18: u2 = 0,
+ cloexec: bool = false,
+ _21: u12 = 0,
+ },
+ .sparc64 => packed struct(u32) {
+ _: u11 = 0,
+ /// Parameter to `pipe2` selecting notification pipe
+ notification_pipe: bool = false,
+ _13: u2 = 0,
+ nonblock: bool = false,
+ _16: u5 = 0,
+ direct: bool = false,
+ _22: u1 = 0,
+ cloexec: bool = false,
+ _24: u9 = 0,
+ },
+ .mips, .mipsel, .mips64, .mips64el => packed struct(u32) {
+ _: u7 = 0,
+ nonblock: bool = false,
+ _9: u2 = 0,
+ /// Parameter to `pipe2` selecting notification pipe
+ notification_pipe: bool = false,
+ _12: u4 = 0,
+ direct: bool = false,
+ _17: u3 = 0,
+ cloexec: bool = false,
+ _21: u12 = 0,
+ },
+ .powerpc, .powerpcle, .powerpc64, .powerpc64le => packed struct(u32) {
+ _: u7 = 0,
+ /// Parameter to `pipe2` selecting notification pipe
+ notification_pipe: bool = false,
+ _9: u3 = 0,
+ nonblock: bool = false,
+ _13: u5 = 0,
+ direct: bool = false,
+ _19: u1 = 0,
+ cloexec: bool = false,
+ _21: u12 = 0,
+ },
+ else => @compileError("missing std.os.linux.Pipe2 flags for this architecture"),
+};
/// Set by startup code, used by `getauxval`.
pub var elf_aux_maybe: ?[*]std.elf.Auxv = null;
@@ -687,7 +755,14 @@ pub const futex_param4 = extern union {
 ///
 /// The futex_op parameter is a sub-command and flags. The sub-command
 /// defines which of the subsequent parameters are relevant.
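[Editorial sketch, not part of this patch: how a caller might drive the retyped v1 wrappers below, assuming the post-patch signatures that take `*const std.atomic.Value(u32)` directly.]

```zig
// Sketch only: a minimal wait/wake pair over the v1 futex wrappers.
const std = @import("std");
const linux = std.os.linux;

fn waitUntilSet(state: *const std.atomic.Value(u32)) void {
    while (state.load(.acquire) == 0) {
        // Sleep only while `state` still holds 0; EAGAIN means the value
        // changed between the load and the syscall, so loop and re-check.
        const rc = linux.futex_4arg(state, .{ .cmd = .WAIT, .private = true }, 0, null);
        switch (linux.E.init(rc)) {
            .SUCCESS, .INTR, .AGAIN => {},
            else => unreachable,
        }
    }
}

fn setAndWake(state: *std.atomic.Value(u32)) void {
    state.store(1, .release);
    // Wake every waiter parked on `state`; the typed pointer is passed
    // directly, with no `&state.raw` indirection.
    _ = linux.futex_3arg(state, .{ .cmd = .WAKE, .private = true }, std.math.maxInt(i32));
}
```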
-pub fn futex(uaddr: *const anyopaque, futex_op: FUTEX_OP, val: u32, val2timeout: futex_param4, uaddr2: ?*const anyopaque, val3: u32) usize {
+pub fn futex(
+ uaddr: *const atomic.Value(u32),
+ futex_op: FUTEX_OP,
+ val: u32,
+ val2timeout: futex_param4,
+ uaddr2: ?*const anyopaque,
+ val3: u32,
+) usize {
 return syscall6(
 if (@hasField(SYS, "futex") and native_arch != .hexagon) .futex else .futex_time64,
 @intFromPtr(uaddr),
@@ -701,7 +776,7 @@ pub fn futex(uaddr: *const anyopaque, futex_op: FUTEX_OP, val: u32, val2timeout:
 /// Three-argument variation of the v1 futex call. Only suitable for a
 /// futex_op that ignores the remaining arguments (e.g., FUTEX_OP.WAKE).
-pub fn futex_3arg(uaddr: *const anyopaque, futex_op: FUTEX_OP, val: u32) usize {
+pub fn futex_3arg(uaddr: *const atomic.Value(u32), futex_op: FUTEX_OP, val: u32) usize {
 return syscall3(
 if (@hasField(SYS, "futex") and native_arch != .hexagon) .futex else .futex_time64,
 @intFromPtr(uaddr),
@@ -712,7 +787,7 @@ pub fn futex_3arg(uaddr: *const anyopaque, futex_op: FUTEX_OP, val: u32) usize {
 /// Four-argument variation of the v1 futex call. Only suitable for a
 /// futex_op that ignores the remaining arguments (e.g., FUTEX_OP.WAIT).
-pub fn futex_4arg(uaddr: *const anyopaque, futex_op: FUTEX_OP, val: u32, timeout: ?*const timespec) usize {
+pub fn futex_4arg(uaddr: *const atomic.Value(u32), futex_op: FUTEX_OP, val: u32, timeout: ?*const timespec) usize {
 return syscall4(
 if (@hasField(SYS, "futex") and native_arch != .hexagon) .futex else .futex_time64,
 @intFromPtr(uaddr),
@@ -722,13 +797,13 @@ pub fn futex_4arg(uaddr: *const anyopaque, futex_op: FUTEX_OP, val: u32, timeout
 );
 }
-/// Given an array of `futex2_waitone`, wait on each uaddr.
+/// Given an array of `Futex2.WaitOne`, wait on each uaddr.
 /// The thread wakes if a futex_wake() is performed at any uaddr.
 /// The syscall returns immediately if any futex has *uaddr != val.
 /// timeout is an optional, absolute timeout value for the operation.
 /// The `flags` argument is for future use and currently should be `.{}`.
 /// Flags for private futexes, sizes, etc. should be set on the
-/// individual flags of each `futex2_waitone`.
+/// individual flags of each `Futex2.WaitOne`.
 ///
 /// Returns the array index of one of the woken futexes.
 /// No further information is provided: any number of other futexes may also
@@ -739,19 +814,19 @@ pub fn futex_4arg(uaddr: *const anyopaque, futex_op: FUTEX_OP, val: u32, timeout
 ///
 /// Requires at least kernel v5.16.
 pub fn futex2_waitv(
- futexes: [*]const futex2_waitone,
- /// Length of `futexes`. Max of FUTEX2_WAITONE_MAX.
- nr_futexes: u32,
- flags: FUTEX2_FLAGS_WAITV,
+ /// The length of the `futexes` slice must not exceed `Futex2.waitone_max`.
+ futexes: []const Futex2.WaitOne,
+ flags: Futex2.Waitv,
 /// Optional absolute timeout. Always 64-bit, even on 32-bit platforms.
 timeout: ?*const kernel_timespec,
 /// Clock to be used for the timeout, realtime or monotonic.
 clockid: clockid_t,
 ) usize {
+ assert(futexes.len <= Futex2.waitone_max);
 return syscall5(
 .futex_waitv,
- @intFromPtr(futexes),
- nr_futexes,
+ @intFromPtr(futexes.ptr),
+ @intCast(futexes.len),
 @as(u32, @bitCast(flags)),
 @intFromPtr(timeout),
 @intFromEnum(clockid),
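[Editorial sketch, not part of this patch: waiting on two futex words at once through the new slice-based `futex2_waitv` (kernel v5.16+); `a` and `b` are hypothetical futex words.]

```zig
// Sketch only: vectorized wait over two words via Futex2.WaitOne.
const std = @import("std");
const linux = std.os.linux;

fn waitOnEither(a: *const std.atomic.Value(u32), b: *const std.atomic.Value(u32)) usize {
    const waiters = [2]linux.Futex2.WaitOne{
        .{ .val = 0, .uaddr = @intFromPtr(a), .flags = .{ .size = .U32, .private = true } },
        .{ .val = 0, .uaddr = @intFromPtr(b), .flags = .{ .size = .U32, .private = true } },
    };
    // Returns the index of a woken futex; fails fast if either word != val.
    return linux.futex2_waitv(&waiters, .{}, null, .MONOTONIC);
}
```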
@@ -765,12 +840,12 @@ pub fn futex2_waitv(
 /// Requires at least kernel v6.7.
 pub fn futex2_wait(
 /// Address of the futex to wait on.
- uaddr: *const anyopaque,
+ uaddr: *const atomic.Value(u32),
 /// Value of `uaddr`.
 val: usize,
 /// Bitmask to match against incoming wakeup masks. Must not be zero.
- mask: usize,
- flags: FUTEX2_FLAGS,
+ mask: Futex2.Bitset,
+ flags: Futex2.Wait,
 /// Optional absolute timeout. Always 64-bit, even on 32-bit platforms.
 timeout: ?*const kernel_timespec,
 /// Clock to be used for the timeout, realtime or monotonic.
@@ -780,7 +855,7 @@ pub fn futex2_wait(
 .futex_wait,
 @intFromPtr(uaddr),
 val,
- mask,
+ @intCast(mask.toInt()),
 @as(u32, @bitCast(flags)),
 @intFromPtr(timeout),
 @intFromEnum(clockid),
@@ -794,18 +869,18 @@ pub fn futex2_wait(
 /// Requires at least kernel v6.7.
 pub fn futex2_wake(
 /// Futex to wake
- uaddr: *const anyopaque,
+ uaddr: *const atomic.Value(u32),
 /// Bitmask to match against waiters.
- mask: usize,
+ mask: Futex2.Bitset,
 /// Maximum number of waiters on the futex to wake.
 nr_wake: i32,
- flags: FUTEX2_FLAGS,
+ flags: Futex2.Wake,
 ) usize {
 return syscall4(
 .futex_wake,
 @intFromPtr(uaddr),
- mask,
- @as(u32, @bitCast(nr_wake)),
+ @intCast(mask.toInt()),
+ @intCast(nr_wake),
 @as(u32, @bitCast(flags)),
 );
 }
@@ -816,9 +891,9 @@ pub fn futex2_wake(
 /// Requires at least kernel v6.7.
 pub fn futex2_requeue(
 /// The source and destination futexes. Must be a 2-element array.
- waiters: [*]const futex2_waitone,
+ waiters: *const [2]Futex2.WaitOne,
 /// Currently unused.
- flags: FUTEX2_FLAGS_REQUEUE,
+ flags: Futex2.Requeue,
 /// Maximum number of waiters to wake on the source futex.
 nr_wake: i32,
 /// Maximum number of waiters to transfer to the destination futex.
@@ -828,8 +903,8 @@
 .futex_requeue,
 @intFromPtr(waiters),
 @as(u32, @bitCast(flags)),
- @as(u32, @bitCast(nr_wake)),
- @as(u32, @bitCast(nr_requeue)),
+ @intCast(nr_wake),
+ @intCast(nr_requeue),
 );
 }
@@ -922,7 +997,7 @@ pub fn readlink(noalias path: [*:0]const u8, noalias buf_ptr: [*]u8, buf_len: us
 if (@hasField(SYS, "readlink")) {
 return syscall3(.readlink, @intFromPtr(path), @intFromPtr(buf_ptr), buf_len);
 } else {
- return syscall4(.readlinkat, @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(path), @intFromPtr(buf_ptr), buf_len);
+ return syscall4(.readlinkat, @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(path), @intFromPtr(buf_ptr), buf_len);
 }
 }
@@ -934,7 +1009,7 @@ pub fn mkdir(path: [*:0]const u8, mode: mode_t) usize {
 if (@hasField(SYS, "mkdir")) {
 return syscall2(.mkdir, @intFromPtr(path), mode);
 } else {
- return syscall3(.mkdirat, @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(path), mode);
+ return syscall3(.mkdirat, @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(path), mode);
 }
 }
@@ -946,7 +1021,7 @@ pub fn mknod(path: [*:0]const u8, mode: u32, dev: u32) usize {
 if (@hasField(SYS, "mknod")) {
 return syscall3(.mknod, @intFromPtr(path), mode, dev);
 } else {
- return mknodat(AT.FDCWD, path, mode, dev);
+ return mknodat(At.fdcwd, path, mode, dev);
 }
 }
@@ -1179,7 +1254,7 @@ pub fn rmdir(path: [*:0]const u8) usize {
 if (@hasField(SYS, "rmdir")) {
 return syscall1(.rmdir, @intFromPtr(path));
 } else {
- return syscall3(.unlinkat, @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(path), AT.REMOVEDIR);
+ return syscall3(.unlinkat, @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(path), @as(u32, @bitCast(At{ .removedir = true })));
 }
 }
@@ -1187,7 +1262,7 @@ pub fn symlink(existing: [*:0]const u8, new: [*:0]const u8) usize {
 if (@hasField(SYS, "symlink")) {
 return syscall2(.symlink, @intFromPtr(existing), @intFromPtr(new));
 } else {
- return syscall3(.symlinkat, @intFromPtr(existing), @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(new));
+ return syscall3(.symlinkat,
@intFromPtr(existing), @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(new)); } } @@ -1238,7 +1313,7 @@ pub fn access(path: [*:0]const u8, mode: u32) usize { if (@hasField(SYS, "access")) { return syscall2(.access, @intFromPtr(path), mode); } else { - return faccessat(AT.FDCWD, path, mode, 0); + return faccessat(At.fdcwd, path, mode, 0); } } @@ -1339,9 +1414,9 @@ pub fn rename(old: [*:0]const u8, new: [*:0]const u8) usize { if (@hasField(SYS, "rename")) { return syscall2(.rename, @intFromPtr(old), @intFromPtr(new)); } else if (@hasField(SYS, "renameat")) { - return syscall4(.renameat, @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(old), @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(new)); + return syscall4(.renameat, @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(old), @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(new)); } else { - return syscall5(.renameat2, @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(old), @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(new), 0); + return syscall5(.renameat2, @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(old), @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(new), 0); } } @@ -1383,7 +1458,7 @@ pub fn open(path: [*:0]const u8, flags: O, perm: mode_t) usize { } else { return syscall4( .openat, - @bitCast(@as(isize, AT.FDCWD)), + @bitCast(@as(isize, At.fdcwd)), @intFromPtr(path), @as(u32, @bitCast(flags)), perm, @@ -1396,7 +1471,7 @@ pub fn create(path: [*:0]const u8, perm: mode_t) usize { } pub fn openat(dirfd: i32, path: [*:0]const u8, flags: O, mode: mode_t) usize { - // dirfd could be negative, for example AT.FDCWD is -100 + // dirfd could be negative, for example At.fdcwd is -100 return syscall4(.openat, @bitCast(@as(isize, dirfd)), @intFromPtr(path), @as(u32, @bitCast(flags)), mode); } @@ -1422,7 +1497,7 @@ pub fn chmod(path: [*:0]const u8, mode: mode_t) usize { if (@hasField(SYS, "chmod")) { return syscall2(.chmod, @intFromPtr(path), mode); } else { - return fchmodat(AT.FDCWD, path, mode, 0); + return fchmodat(At.fdcwd, path, mode, 0); } } @@ -1554,9 +1629,9 @@ pub fn link(oldpath: [*:0]const u8, newpath: [*:0]const u8) usize { } else { return syscall5( .linkat, - @as(usize, @bitCast(@as(isize, AT.FDCWD))), + @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(oldpath), - @as(usize, @bitCast(@as(isize, AT.FDCWD))), + @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(newpath), 0, ); @@ -1578,7 +1653,7 @@ pub fn unlink(path: [*:0]const u8) usize { if (@hasField(SYS, "unlink")) { return syscall1(.unlink, @intFromPtr(path)); } else { - return syscall3(.unlinkat, @as(usize, @bitCast(@as(isize, AT.FDCWD))), @intFromPtr(path), 0); + return syscall3(.unlinkat, @as(usize, @bitCast(@as(isize, At.fdcwd))), @intFromPtr(path), 0); } } @@ -2238,25 +2313,37 @@ pub fn lstat(pathname: [*:0]const u8, statbuf: *Stat) usize { } } -pub fn fstatat(dirfd: i32, path: [*:0]const u8, stat_buf: *Stat, flags: u32) usize { +pub fn fstatat(dirfd: i32, path: [*:0]const u8, stat_buf: *Stat, flags: At) usize { if (native_arch == .riscv32 or native_arch.isLoongArch()) { // riscv32 and loongarch have made the interesting decision to not implement some of // the older stat syscalls, including this one. 
@compileError("No fstatat syscall on this architecture."); } else if (@hasField(SYS, "fstatat64")) { - return syscall4(.fstatat64, @as(usize, @bitCast(@as(isize, dirfd))), @intFromPtr(path), @intFromPtr(stat_buf), flags); + return syscall4( + .fstatat64, + @as(usize, @bitCast(@as(isize, dirfd))), + @intFromPtr(path), + @intFromPtr(stat_buf), + @intCast(@as(u32, @bitCast(flags))), + ); } else { - return syscall4(.fstatat, @as(usize, @bitCast(@as(isize, dirfd))), @intFromPtr(path), @intFromPtr(stat_buf), flags); + return syscall4( + .fstatat, + @as(usize, @bitCast(@as(isize, dirfd))), + @intFromPtr(path), + @intFromPtr(stat_buf), + @bitCast(flags), + ); } } -pub fn statx(dirfd: i32, path: [*:0]const u8, flags: u32, mask: u32, statx_buf: *Statx) usize { +pub fn statx(dirfd: i32, path: [*:0]const u8, flags: At, mask: Statx.Mask, statx_buf: *Statx) usize { return syscall5( .statx, @as(usize, @bitCast(@as(isize, dirfd))), @intFromPtr(path), - flags, - mask, + @intCast(@as(u32, @bitCast(flags))), + @intCast(@as(u32, @bitCast(mask))), @intFromPtr(statx_buf), ); } @@ -2416,8 +2503,14 @@ pub fn epoll_create1(flags: usize) usize { return syscall1(.epoll_create1, flags); } -pub fn epoll_ctl(epoll_fd: i32, op: u32, fd: i32, ev: ?*epoll_event) usize { - return syscall4(.epoll_ctl, @as(usize, @bitCast(@as(isize, epoll_fd))), @as(usize, @intCast(op)), @as(usize, @bitCast(@as(isize, fd))), @intFromPtr(ev)); +pub fn epoll_ctl(epoll_fd: i32, op: EpollOp, fd: i32, ev: ?*epoll_event) usize { + return syscall4( + .epoll_ctl, + @as(usize, @bitCast(@as(isize, epoll_fd))), + @as(usize, @intFromEnum(op)), + @as(usize, @bitCast(@as(isize, fd))), + @intFromPtr(ev), + ); } pub fn epoll_wait(epoll_fd: i32, events: [*]epoll_event, maxevents: u32, timeout: i32) usize { @@ -2510,15 +2603,15 @@ pub fn uname(uts: *utsname) usize { return syscall1(.uname, @intFromPtr(uts)); } -pub fn io_uring_setup(entries: u32, p: *io_uring_params) usize { +pub fn io_uring_setup(entries: u32, p: *IoUring.Params) usize { return syscall2(.io_uring_setup, entries, @intFromPtr(p)); } -pub fn io_uring_enter(fd: i32, to_submit: u32, min_complete: u32, flags: u32, sig: ?*sigset_t) usize { - return syscall6(.io_uring_enter, @as(usize, @bitCast(@as(isize, fd))), to_submit, min_complete, flags, @intFromPtr(sig), NSIG / 8); +pub fn io_uring_enter(fd: i32, to_submit: u32, min_complete: u32, flags: IoUring.uflags.Enter, sig: ?*sigset_t) usize { + return syscall6(.io_uring_enter, @as(usize, @bitCast(@as(isize, fd))), to_submit, min_complete, @intCast(@as(u32, @bitCast(flags))), @intFromPtr(sig), NSIG / 8); } -pub fn io_uring_register(fd: i32, opcode: IORING_REGISTER, arg: ?*const anyopaque, nr_args: u32) usize { +pub fn io_uring_register(fd: i32, opcode: IoUring.RegisterOp, arg: ?*const anyopaque, nr_args: u32) usize { return syscall4(.io_uring_register, @as(usize, @bitCast(@as(isize, fd))), @intFromEnum(opcode), @intFromPtr(arg), nr_args); } @@ -3483,41 +3576,72 @@ pub const STDIN_FILENO = 0; pub const STDOUT_FILENO = 1; pub const STDERR_FILENO = 2; -pub const AT = struct { - /// Special value used to indicate openat should use the current working directory - pub const FDCWD = -100; - +/// Deprecated alias to At +pub const AT = At; +/// matches AT_* and AT_STATX_* +pub const At = packed struct(u32) { + _u1: u8 = 0, /// Do not follow symbolic links - pub const SYMLINK_NOFOLLOW = 0x100; - + symlink_nofollow: bool = false, /// Remove directory instead of unlinking file - pub const REMOVEDIR = 0x200; - + removedir: bool = false, /// Follow symbolic links. 
- pub const SYMLINK_FOLLOW = 0x400;
-
+ symlink_follow: bool = false,
 /// Suppress terminal automount traversal
- pub const NO_AUTOMOUNT = 0x800;
-
+ no_automount: bool = false,
 /// Allow empty relative pathname
- pub const EMPTY_PATH = 0x1000;
+ empty_path: bool = false,
+ /// Force the attributes to be sync'd with the server
+ statx_force_sync: bool = false,
+ /// Don't sync attributes with the server
+ statx_dont_sync: bool = false,
+ /// Apply to the entire subtree
+ recursive: bool = false,
+ _17: u16 = 0,
+
+ /// File handle is needed to compare object identity and may not be usable
+ /// with open_by_handle_at(2)
+ pub const handle_fid: At = .{ .removedir = true };
+
+ /// Special value used to indicate openat should use the current working directory
+ pub const fdcwd = -100;
+ // https://github.com/torvalds/linux/blob/d3479214c05dbd07bc56f8823e7bd8719fcd39a9/tools/perf/trace/beauty/fs_at_flags.sh#L15
+ /// AT_STATX_SYNC_TYPE is not a bit, it's a mask of
+ /// AT_STATX_SYNC_AS_STAT, AT_STATX_FORCE_SYNC and AT_STATX_DONT_SYNC
 /// Type of synchronisation required from statx()
- pub const STATX_SYNC_TYPE = 0x6000;
+ pub const statx_sync_type = 0x6000;
- /// - Do whatever stat() does
- pub const STATX_SYNC_AS_STAT = 0x0000;
+ /// Do whatever stat() does
+ /// This is the default and is very much filesystem-specific
+ pub const statx_sync_as_stat: At = .{};
+ // DEPRECATED ALIASES
+
+ /// Special value used to indicate openat should use the current working directory
+ pub const FDCWD = fdcwd;
+ /// Do not follow symbolic links
+ pub const SYMLINK_NOFOLLOW: u32 = @bitCast(At{ .symlink_nofollow = true });
+ /// Remove directory instead of unlinking file
+ pub const REMOVEDIR: u32 = @bitCast(At{ .removedir = true });
+ pub const HANDLE_FID: u32 = @bitCast(handle_fid);
+ /// Follow symbolic links.
+ pub const SYMLINK_FOLLOW: u32 = @bitCast(At{ .symlink_follow = true });
+ /// Suppress terminal automount traversal
+ pub const NO_AUTOMOUNT: u32 = @bitCast(At{ .no_automount = true });
+ /// Allow empty relative pathname
+ pub const EMPTY_PATH: u32 = @bitCast(At{ .empty_path = true });
+ /// Type of synchronisation required from statx()
+ pub const STATX_SYNC_TYPE: u32 = statx_sync_type;
+ /// - Do whatever stat() does
+ pub const STATX_SYNC_AS_STAT: u32 = @bitCast(statx_sync_as_stat);
 /// - Force the attributes to be sync'd with the server
- pub const STATX_FORCE_SYNC = 0x2000;
-
+ pub const STATX_FORCE_SYNC: u32 = @bitCast(At{ .statx_force_sync = true });
 /// - Don't sync attributes with the server
- pub const STATX_DONT_SYNC = 0x4000;
-
+ pub const STATX_DONT_SYNC: u32 = @bitCast(At{ .statx_dont_sync = true });
 /// Apply to the entire subtree
- pub const RECURSIVE = 0x8000;
-
- pub const HANDLE_FID = REMOVEDIR;
+ pub const RECURSIVE: u32 = @bitCast(At{ .recursive = true });
};
pub const FALLOC = struct {
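[Editorial sketch, not part of this patch: the packed `At` flags next to their deprecated integer aliases, plus a `statx` call in the new style; the mask field names are the ones used in lib/std/Io/Threaded.zig above.]

```zig
// Sketch only: At flags and old AT_* bits describe the same layout.
const std = @import("std");
const linux = std.os.linux;

test "At layout matches the old AT_* bits" {
    try std.testing.expectEqual(@as(u32, 0x100), @as(u32, @bitCast(linux.At{ .symlink_nofollow = true })));
    try std.testing.expectEqual(linux.AT.EMPTY_PATH, @as(u32, @bitCast(linux.At{ .empty_path = true })));
}

fn statNoFollow(path: [*:0]const u8, out: *linux.Statx) usize {
    // Ask only for type and mode, without following a trailing symlink.
    return linux.statx(linux.At.fdcwd, path, .{ .symlink_nofollow = true }, .{ .type = true, .mode = true }, out);
}
```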
@@ -3603,39 +3727,146 @@ pub const FUTEX_WAKE_OP_CMP = enum(u4) {
 GE = 5,
 };
-/// Max numbers of elements in a `futex2_waitone` array.
-pub const FUTEX2_WAITONE_MAX = 128;
+pub const Futex2 = struct {
+ /// Max number of elements in a `futex_waitv`, i.e. `WaitOne`, array
+ /// matches FUTEX_WAITV_MAX
+ pub const waitone_max = 128;
-/// For futex v2 API, the size of the futex at the uaddr. v1 futex are
-/// always implicitly U32. As of kernel v6.14, only U32 is implemented
+ /// for v2 futexes.
+ pub const Size = enum(u2) {
+ U8 = 0,
+ U16 = 1,
+ U32 = 2,
+ U64 = 3,
+ };
-/// As of kernel 6.14 there are no defined flags to futex2_waitv.
-pub const FUTEX2_FLAGS_WAITV = packed struct(u32) {
- _reserved: u32 = 0,
-};
+ /// flags for `futex2_requeue` syscall
+ /// As of kernel 6.14 there are no defined flags to futex2_requeue.
+ pub const Requeue = packed struct(u32) {
+ _: u32 = 0,
+ };
-/// As of kernel 6.14 there are no defined flags to futex2_requeue.
-pub const FUTEX2_FLAGS_REQUEUE = packed struct(u32) {
- _reserved: u32 = 0,
-};
+ /// flags for `futex2_waitv` syscall
+ /// As of kernel 6.14 there are no defined flags to futex2_waitv.
+ pub const Waitv = packed struct(u32) {
+ _: u32 = 0,
+ };
-/// Flags for futex v2 APIs (futex2_wait, futex2_wake, futex2_requeue, but
-/// not the futex2_waitv syscall, but also used in the futex2_waitone struct).
-pub const FUTEX2_FLAGS = packed struct(u32) {
- size: FUTEX2_SIZE,
- numa: bool = false,
- _reserved: u4 = 0,
- private: bool,
- _undefined: u24 = 0,
+ /// flags for `futex2_wait` syscall
+ pub const Wait = packed struct(u32) {
+ size: Size,
+ numa: bool = false,
+ mpol: bool = false,
+ _5: u3 = 0,
+ private: bool,
+ _9: u24 = 0,
+ };
+
+ /// flags for `futex2_wake` syscall
+ pub const Wake = Wait;
+
+ /// A waiter for vectorized wait
+ /// For `futex2_waitv` and `futex2_requeue`. Arrays of `WaitOne`
+ /// allow waiting on multiple futexes in one call.
+ /// matches `futex_waitv` in kernel
+ pub const WaitOne = extern struct {
+ /// Expected value at uaddr, should match size of futex.
+ val: u64,
+ /// User address to wait on. Top-bits must be 0 on 32-bit.
+ uaddr: u64,
+ /// Flags for this waiter.
+ flags: Wait,
+ /// Reserved member to preserve data alignment.
+ __reserved: u32 = 0,
+ };
+
+ /// `Bitset` for `futex2_wait`, `futex2_wake`, `IoUring.futex_wait` and
+ /// `IoUring.futex_wake` operations
+ /// At least one bit must be set before performing supported operations
+ /// The bitset is stored in the kernel-internal state of a waiter. During a
+ /// wake operation, the same mask previously set during the wait call can
+ /// be used to select which waiters to wake up
+ /// See https://man7.org/linux/man-pages/man2/futex_wake_bitset.2const.html
+ /// `IoUring` supports a u64 `Bitset` while the raw syscalls use only the
+ /// low 32 bits of `Bitset`
+ pub const Bitset = packed struct(u64) {
+ waiter1: bool = false,
+ waiter2: bool = false,
+ waiter3: bool = false,
+ waiter4: bool = false,
+ waiter5: bool = false,
+ waiter6: bool = false,
+ waiter7: bool = false,
+ waiter8: bool = false,
+ waiter9: bool = false,
+ waiter10: bool = false,
+ waiter11: bool = false,
+ waiter12: bool = false,
+ waiter13: bool = false,
+ waiter14: bool = false,
+ waiter15: bool = false,
+ waiter16: bool = false,
+ waiter17: bool = false,
+ waiter18: bool = false,
+ waiter19: bool = false,
+ waiter20: bool = false,
+ waiter21: bool = false,
+ waiter22: bool = false,
+ waiter23: bool = false,
+ waiter24: bool = false,
+ waiter25: bool = false,
+ waiter26: bool = false,
+ waiter27: bool = false,
+ waiter28: bool = false,
+ waiter29: bool = false,
+ waiter30: bool = false,
+ waiter31: bool = false,
+ waiter32: bool = false,
+ io_uring_extra: u32 = 0,
+
+ /// `Bitset` with all bits set for the FUTEX_xxx_BITSET OPs to request a
+ /// match of any bit.
matches FUTEX_BITSET_MATCH_ANY
+ pub const match_any: Bitset = @bitCast(@as(u64, 0x0000_0000_ffff_ffff));
+ /// An empty `Bitset` will not wake any threads because the kernel
+ /// requires at least one bit to be set in the bitmask to identify
+ /// which waiters should be woken up. Therefore, no action will be
+ /// taken if the bitset is zero; this is only useful in tests.
+ pub const empty: Bitset = .{};
+
+ /// Create from raw u64 value
+ pub fn fromInt(value: u64) Bitset {
+ assert(value != 0);
+ return @bitCast(value);
+ }
+
+ /// Convert to raw u64 for syscall
+ pub fn toInt(self: Bitset) u64 {
+ return @bitCast(self);
+ }
+ };
};
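[Editorial sketch, not part of this patch: tagged wakeups with `Futex2.Bitset` (kernel v6.7+); `word` is a hypothetical futex word shared by several waiter groups.]

```zig
// Sketch only: waking a subset of waiters, or all of them, by mask.
const std = @import("std");
const linux = std.os.linux;

fn wakeGroupOne(word: *const std.atomic.Value(u32)) usize {
    // Wake at most one waiter that registered waiter1 in its wait mask.
    const mask: linux.Futex2.Bitset = .{ .waiter1 = true };
    return linux.futex2_wake(word, mask, 1, .{ .size = .U32, .private = true });
}

fn wakeAll(word: *const std.atomic.Value(u32)) usize {
    // `match_any` selects all 32 syscall-visible mask bits.
    return linux.futex2_wake(word, .match_any, std.math.maxInt(i32), .{ .size = .U32, .private = true });
}
```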
+/// DEPRECATED use `Futex2.WaitOne`
+pub const futex2_waitone = Futex2.WaitOne;
+
+/// DEPRECATED use constant in `Futex2`
+pub const FUTEX2_WAITONE_MAX = Futex2.waitone_max;
+
+/// DEPRECATED use `Size` type in `Futex2`
+pub const FUTEX2_SIZE = Futex2.Size;
+
+/// DEPRECATED use `Waitv` in `Futex2`
+pub const FUTEX2_FLAGS_WAITV = Futex2.Waitv;
+
+/// DEPRECATED use `Requeue` in `Futex2`
+pub const FUTEX2_FLAGS_REQUEUE = Futex2.Requeue;
+
+/// DEPRECATED use `Wait` in `Futex2`
+pub const FUTEX2_FLAGS = Futex2.Wait;
+
pub const PROT = struct {
 /// page can not be accessed
 pub const NONE = 0x0;
@@ -3663,31 +3894,87 @@
pub const X_OK = 1;
pub const W_OK = 2;
pub const R_OK = 4;
-pub const W = struct {
- pub const NOHANG = 1;
- pub const UNTRACED = 2;
- pub const STOPPED = 2;
- pub const EXITED = 4;
- pub const CONTINUED = 8;
- pub const NOWAIT = 0x1000000;
+pub const W = packed struct(u32) {
+ nohang: bool = false,
+ stopped: bool = false,
+ exited: bool = false,
+ continued: bool = false,
+ _5: u20 = 0,
+ nowait: bool = false,
+ _26: u7 = 0,
+ /// alias to stopped
+ pub const untraced: W = .{ .stopped = true };
+
+ fn toInt(s: W) u32 {
+ return @bitCast(s);
+ }
+
+ /// matches EXITSTATUS in C
+ pub fn exitStatus(s: W) u8 {
+ return @intCast((s.toInt() & 0xff00) >> 8);
+ }
+
+ /// matches TERMSIG in C
+ pub fn termSig(s: W) u32 {
+ return s.toInt() & 0x7f;
+ }
+
+ /// matches STOPSIG in C
+ pub fn stopSig(s: W) u32 {
+ return exitStatus(s);
+ }
+
+ /// matches IFEXITED in C
+ pub fn ifExited(s: W) bool {
+ return termSig(s) == 0;
+ }
+
+ /// matches IFSTOPPED in C
+ pub fn ifStopped(s: W) bool {
+ return @as(u16, @truncate(((s.toInt() & 0xffff) *% 0x10001) >> 8)) > 0x7f00;
+ }
+ /// matches IFSIGNALED in C
+ pub fn ifSignaled(s: W) bool {
+ return (s.toInt() & 0xffff) -% 1 < 0xff;
+ }
+
+ // Deprecated constants
+ pub const NOHANG: u32 = @bitCast(W{ .nohang = true });
+ pub const STOPPED: u32 = @bitCast(W{ .stopped = true });
+ pub const UNTRACED: u32 = @bitCast(untraced);
+ pub const EXITED: u32 = @bitCast(W{ .exited = true });
+ pub const CONTINUED: u32 = @bitCast(W{ .continued = true });
+ pub const NOWAIT: u32 = @bitCast(W{ .nowait = true });
+
+ /// DEPRECATED alias to exitStatus
 pub fn EXITSTATUS(s: u32) u8 {
- return @as(u8, @intCast((s & 0xff00) >> 8));
+ return exitStatus(@bitCast(s));
 }
+
+ /// DEPRECATED alias to termSig
 pub fn TERMSIG(s: u32) u32 {
- return s & 0x7f;
+ return termSig(@bitCast(s));
 }
+
+ /// DEPRECATED alias to stopSig
 pub fn STOPSIG(s: u32) u32 {
- return EXITSTATUS(s);
+ return stopSig(@bitCast(s));
 }
+
+ /// DEPRECATED alias to ifExited
 pub fn IFEXITED(s: u32) bool {
- return TERMSIG(s) == 0;
+ return ifExited(@bitCast(s));
 }
+
+ /// DEPRECATED alias to ifStopped
 pub fn IFSTOPPED(s: u32) bool {
- return @as(u16, @truncate(((s & 0xffff) *% 0x10001) >> 8)) > 0x7f00;
+ return ifStopped(@bitCast(s));
 }
+
+ /// DEPRECATED alias to ifSignaled
 pub fn IFSIGNALED(s: u32) bool {
- return (s & 0xffff) -% 1 < 0xff;
+ return ifSignaled(@bitCast(s));
 }
};
@@ -3886,22 +4173,93 @@ pub const SEEK = struct {
 pub const END = 2;
};
-pub const SHUT = struct {
- pub const RD = 0;
- pub const WR = 1;
- pub const RDWR = 2;
+/// Deprecated alias to Shut
+pub const SHUT = Shut;
+/// enum sock_shutdown_cmd - Shutdown types
+/// matches SHUT_* in kernel
+pub const Shut = enum(u32) {
+ /// SHUT_RD: shutdown receptions
+ rd = 0,
+ /// SHUT_WR: shutdown transmissions
+ wr = 1,
+ /// SHUT_RDWR: shutdown receptions/transmissions
+ rdwr = 2,
+
+ _,
+
+ // deprecated constants of the fields
+ pub const RD: u32 = @intFromEnum(Shut.rd);
+ pub const WR: u32 = @intFromEnum(Shut.wr);
+ pub const RDWR: u32 = @intFromEnum(Shut.rdwr);
};
-pub const SOCK = struct {
- pub const STREAM = if (is_mips) 2 else 1;
- pub const DGRAM = if (is_mips) 1 else 2;
- pub const RAW = 3;
- pub const RDM = 4;
- pub const SEQPACKET = 5;
- pub const DCCP = 6;
- pub const PACKET = 10;
- pub const CLOEXEC = if (is_sparc) 0o20000000 else 0o2000000;
- pub const NONBLOCK = if (is_mips) 0o200 else if (is_sparc) 0o40000 else 0o4000;
+/// flags for `sync_file_range(2)` syscall
+/// matches SYNC_FILE_RANGE_* in kernel
+pub const SyncFileRange = packed struct(u32) {
+ wait_before: bool = false,
+ write: bool = false,
+ wait_after: bool = false,
+ _: u29 = 0,
+
+ pub const write_and_wait: SyncFileRange = .{
+ .wait_before = true,
+ .write = true,
+ .wait_after = true,
+ };
+};
+
+/// Deprecated alias to Sock
+pub const SOCK = Sock;
+/// SOCK_* Socket type and flags
+pub const Sock = packed struct(u32) {
+ type: Type = .default,
+ flags: Flags = .{},
+
+ /// matches sock_type in kernel
+ pub const Type = enum(u7) {
+ default = 0,
+ stream = if (is_mips) 2 else 1,
+ dgram = if (is_mips) 1 else 2,
+ raw = 3,
+ rdm = 4,
+ seqpacket = 5,
+ dccp = 6,
+ packet = 10,
+
+ _,
+ };
+
+ // Flags occupy bits 7-31 of the u32
+ /// Flags for socket, socketpair, accept4
+ pub const Flags = if (is_sparc) packed struct(u25) {
+ _8: u7 = 0, // start from u7 since Type comes before Flags
+ nonblock: bool = false,
+ _16: u7 = 0,
+ cloexec: bool = false,
+ _24: u9 = 0,
+ } else if (is_mips) packed struct(u25) {
+ nonblock: bool = false,
+ _9: u11 = 0,
+ cloexec: bool = false,
+ _21: u12 = 0,
+ } else packed struct(u25) {
+ _8: u4 = 0,
+ nonblock: bool = false,
+ _13: u7 = 0,
+ cloexec: bool = false,
+ _21: u12 = 0,
+ };
+
+ // Deprecated aliases for SOCK
+ pub const STREAM: u32 = @intFromEnum(Type.stream);
+ pub const DGRAM: u32 = @intFromEnum(Type.dgram);
+ pub const RAW: u32 = @intFromEnum(Type.raw);
+ pub const RDM: u32 = @intFromEnum(Type.rdm);
+ pub const SEQPACKET: u32 = @intFromEnum(Type.seqpacket);
+ pub const DCCP: u32 = @intFromEnum(Type.dccp);
+ pub const PACKET: u32 = @intFromEnum(Type.packet);
+ pub const CLOEXEC: u32 = (@as(u25, @bitCast(Flags{ .cloexec = true })) << 7);
+ pub const NONBLOCK: u32 = (@as(u25, @bitCast(Flags{ .nonblock = true })) << 7);
};
pub const TCP = struct {
@@ -4004,386 +4362,479 @@ pub const UDP_ENCAP = struct {
 pub const RXRPC = 6;
};
-pub const PF = struct {
- pub const UNSPEC = 0;
- pub const LOCAL = 1;
- pub const UNIX = LOCAL;
- pub const FILE = LOCAL;
- pub const INET = 2;
- pub const AX25 = 3;
- pub const IPX = 4;
- pub const APPLETALK = 5;
- pub const NETROM = 6;
- pub const BRIDGE = 7;
- pub const ATMPVC = 8;
- pub const X25 = 9;
- pub const INET6 = 10;
- pub const ROSE = 11;
- pub const DECnet = 12;
- pub
const NETBEUI = 13; - pub const SECURITY = 14; - pub const KEY = 15; - pub const NETLINK = 16; - pub const ROUTE = PF.NETLINK; - pub const PACKET = 17; - pub const ASH = 18; - pub const ECONET = 19; - pub const ATMSVC = 20; - pub const RDS = 21; - pub const SNA = 22; - pub const IRDA = 23; - pub const PPPOX = 24; - pub const WANPIPE = 25; - pub const LLC = 26; - pub const IB = 27; - pub const MPLS = 28; - pub const CAN = 29; - pub const TIPC = 30; - pub const BLUETOOTH = 31; - pub const IUCV = 32; - pub const RXRPC = 33; - pub const ISDN = 34; - pub const PHONET = 35; - pub const IEEE802154 = 36; - pub const CAIF = 37; - pub const ALG = 38; - pub const NFC = 39; - pub const VSOCK = 40; - pub const KCM = 41; - pub const QIPCRTR = 42; - pub const SMC = 43; - pub const XDP = 44; - pub const MAX = 45; -}; - -pub const AF = struct { - pub const UNSPEC = PF.UNSPEC; - pub const LOCAL = PF.LOCAL; - pub const UNIX = AF.LOCAL; - pub const FILE = AF.LOCAL; - pub const INET = PF.INET; - pub const AX25 = PF.AX25; - pub const IPX = PF.IPX; - pub const APPLETALK = PF.APPLETALK; - pub const NETROM = PF.NETROM; - pub const BRIDGE = PF.BRIDGE; - pub const ATMPVC = PF.ATMPVC; - pub const X25 = PF.X25; - pub const INET6 = PF.INET6; - pub const ROSE = PF.ROSE; - pub const DECnet = PF.DECnet; - pub const NETBEUI = PF.NETBEUI; - pub const SECURITY = PF.SECURITY; - pub const KEY = PF.KEY; - pub const NETLINK = PF.NETLINK; - pub const ROUTE = PF.ROUTE; - pub const PACKET = PF.PACKET; - pub const ASH = PF.ASH; - pub const ECONET = PF.ECONET; - pub const ATMSVC = PF.ATMSVC; - pub const RDS = PF.RDS; - pub const SNA = PF.SNA; - pub const IRDA = PF.IRDA; - pub const PPPOX = PF.PPPOX; - pub const WANPIPE = PF.WANPIPE; - pub const LLC = PF.LLC; - pub const IB = PF.IB; - pub const MPLS = PF.MPLS; - pub const CAN = PF.CAN; - pub const TIPC = PF.TIPC; - pub const BLUETOOTH = PF.BLUETOOTH; - pub const IUCV = PF.IUCV; - pub const RXRPC = PF.RXRPC; - pub const ISDN = PF.ISDN; - pub const PHONET = PF.PHONET; - pub const IEEE802154 = PF.IEEE802154; - pub const CAIF = PF.CAIF; - pub const ALG = PF.ALG; - pub const NFC = PF.NFC; - pub const VSOCK = PF.VSOCK; - pub const KCM = PF.KCM; - pub const QIPCRTR = PF.QIPCRTR; - pub const SMC = PF.SMC; - pub const XDP = PF.XDP; - pub const MAX = PF.MAX; -}; - -pub const SO = if (is_mips) struct { - pub const DEBUG = 1; - pub const REUSEADDR = 0x0004; - pub const KEEPALIVE = 0x0008; - pub const DONTROUTE = 0x0010; - pub const BROADCAST = 0x0020; - pub const LINGER = 0x0080; - pub const OOBINLINE = 0x0100; - pub const REUSEPORT = 0x0200; - pub const SNDBUF = 0x1001; - pub const RCVBUF = 0x1002; - pub const SNDLOWAT = 0x1003; - pub const RCVLOWAT = 0x1004; - pub const RCVTIMEO = 0x1006; - pub const SNDTIMEO = 0x1005; - pub const ERROR = 0x1007; - pub const TYPE = 0x1008; - pub const ACCEPTCONN = 0x1009; - pub const PROTOCOL = 0x1028; - pub const DOMAIN = 0x1029; - pub const NO_CHECK = 11; - pub const PRIORITY = 12; - pub const BSDCOMPAT = 14; - pub const PASSCRED = 17; - pub const PEERCRED = 18; - pub const PEERSEC = 30; - pub const SNDBUFFORCE = 31; - pub const RCVBUFFORCE = 33; - pub const SECURITY_AUTHENTICATION = 22; - pub const SECURITY_ENCRYPTION_TRANSPORT = 23; - pub const SECURITY_ENCRYPTION_NETWORK = 24; - pub const BINDTODEVICE = 25; - pub const ATTACH_FILTER = 26; - pub const DETACH_FILTER = 27; - pub const GET_FILTER = ATTACH_FILTER; - pub const PEERNAME = 28; - pub const TIMESTAMP_OLD = 29; - pub const PASSSEC = 34; - pub const TIMESTAMPNS_OLD = 35; - pub const MARK = 36; - pub 
const TIMESTAMPING_OLD = 37; - pub const RXQ_OVFL = 40; - pub const WIFI_STATUS = 41; - pub const PEEK_OFF = 42; - pub const NOFCS = 43; - pub const LOCK_FILTER = 44; - pub const SELECT_ERR_QUEUE = 45; - pub const BUSY_POLL = 46; - pub const MAX_PACING_RATE = 47; - pub const BPF_EXTENSIONS = 48; - pub const INCOMING_CPU = 49; - pub const ATTACH_BPF = 50; - pub const DETACH_BPF = DETACH_FILTER; - pub const ATTACH_REUSEPORT_CBPF = 51; - pub const ATTACH_REUSEPORT_EBPF = 52; - pub const CNX_ADVICE = 53; - pub const MEMINFO = 55; - pub const INCOMING_NAPI_ID = 56; - pub const COOKIE = 57; - pub const PEERGROUPS = 59; - pub const ZEROCOPY = 60; - pub const TXTIME = 61; - pub const BINDTOIFINDEX = 62; - pub const TIMESTAMP_NEW = 63; - pub const TIMESTAMPNS_NEW = 64; - pub const TIMESTAMPING_NEW = 65; - pub const RCVTIMEO_NEW = 66; - pub const SNDTIMEO_NEW = 67; - pub const DETACH_REUSEPORT_BPF = 68; -} else if (is_ppc) struct { - pub const DEBUG = 1; - pub const REUSEADDR = 2; - pub const TYPE = 3; - pub const ERROR = 4; - pub const DONTROUTE = 5; - pub const BROADCAST = 6; - pub const SNDBUF = 7; - pub const RCVBUF = 8; - pub const KEEPALIVE = 9; - pub const OOBINLINE = 10; - pub const NO_CHECK = 11; - pub const PRIORITY = 12; - pub const LINGER = 13; - pub const BSDCOMPAT = 14; - pub const REUSEPORT = 15; - pub const RCVLOWAT = 16; - pub const SNDLOWAT = 17; - pub const RCVTIMEO = 18; - pub const SNDTIMEO = 19; - pub const PASSCRED = 20; - pub const PEERCRED = 21; - pub const ACCEPTCONN = 30; - pub const PEERSEC = 31; - pub const SNDBUFFORCE = 32; - pub const RCVBUFFORCE = 33; - pub const PROTOCOL = 38; - pub const DOMAIN = 39; - pub const SECURITY_AUTHENTICATION = 22; - pub const SECURITY_ENCRYPTION_TRANSPORT = 23; - pub const SECURITY_ENCRYPTION_NETWORK = 24; - pub const BINDTODEVICE = 25; - pub const ATTACH_FILTER = 26; - pub const DETACH_FILTER = 27; - pub const GET_FILTER = ATTACH_FILTER; - pub const PEERNAME = 28; - pub const TIMESTAMP_OLD = 29; - pub const PASSSEC = 34; - pub const TIMESTAMPNS_OLD = 35; - pub const MARK = 36; - pub const TIMESTAMPING_OLD = 37; - pub const RXQ_OVFL = 40; - pub const WIFI_STATUS = 41; - pub const PEEK_OFF = 42; - pub const NOFCS = 43; - pub const LOCK_FILTER = 44; - pub const SELECT_ERR_QUEUE = 45; - pub const BUSY_POLL = 46; - pub const MAX_PACING_RATE = 47; - pub const BPF_EXTENSIONS = 48; - pub const INCOMING_CPU = 49; - pub const ATTACH_BPF = 50; - pub const DETACH_BPF = DETACH_FILTER; - pub const ATTACH_REUSEPORT_CBPF = 51; - pub const ATTACH_REUSEPORT_EBPF = 52; - pub const CNX_ADVICE = 53; - pub const MEMINFO = 55; - pub const INCOMING_NAPI_ID = 56; - pub const COOKIE = 57; - pub const PEERGROUPS = 59; - pub const ZEROCOPY = 60; - pub const TXTIME = 61; - pub const BINDTOIFINDEX = 62; - pub const TIMESTAMP_NEW = 63; - pub const TIMESTAMPNS_NEW = 64; - pub const TIMESTAMPING_NEW = 65; - pub const RCVTIMEO_NEW = 66; - pub const SNDTIMEO_NEW = 67; - pub const DETACH_REUSEPORT_BPF = 68; -} else if (is_sparc) struct { - pub const DEBUG = 1; - pub const REUSEADDR = 4; - pub const TYPE = 4104; - pub const ERROR = 4103; - pub const DONTROUTE = 16; - pub const BROADCAST = 32; - pub const SNDBUF = 4097; - pub const RCVBUF = 4098; - pub const KEEPALIVE = 8; - pub const OOBINLINE = 256; - pub const NO_CHECK = 11; - pub const PRIORITY = 12; - pub const LINGER = 128; - pub const BSDCOMPAT = 1024; - pub const REUSEPORT = 512; - pub const PASSCRED = 2; - pub const PEERCRED = 64; - pub const RCVLOWAT = 2048; - pub const SNDLOWAT = 4096; - pub const RCVTIMEO = 8192; 
- pub const SNDTIMEO = 16384; - pub const ACCEPTCONN = 32768; - pub const PEERSEC = 30; - pub const SNDBUFFORCE = 4106; - pub const RCVBUFFORCE = 4107; - pub const PROTOCOL = 4136; - pub const DOMAIN = 4137; - pub const SECURITY_AUTHENTICATION = 20481; - pub const SECURITY_ENCRYPTION_TRANSPORT = 20482; - pub const SECURITY_ENCRYPTION_NETWORK = 20484; - pub const BINDTODEVICE = 13; - pub const ATTACH_FILTER = 26; - pub const DETACH_FILTER = 27; - pub const GET_FILTER = 26; - pub const PEERNAME = 28; - pub const TIMESTAMP_OLD = 29; - pub const PASSSEC = 31; - pub const TIMESTAMPNS_OLD = 33; - pub const MARK = 34; - pub const TIMESTAMPING_OLD = 35; - pub const RXQ_OVFL = 36; - pub const WIFI_STATUS = 37; - pub const PEEK_OFF = 38; - pub const NOFCS = 39; - pub const LOCK_FILTER = 40; - pub const SELECT_ERR_QUEUE = 41; - pub const BUSY_POLL = 48; - pub const MAX_PACING_RATE = 49; - pub const BPF_EXTENSIONS = 50; - pub const INCOMING_CPU = 51; - pub const ATTACH_BPF = 52; - pub const DETACH_BPF = 27; - pub const ATTACH_REUSEPORT_CBPF = 53; - pub const ATTACH_REUSEPORT_EBPF = 54; - pub const CNX_ADVICE = 55; - pub const MEMINFO = 57; - pub const INCOMING_NAPI_ID = 58; - pub const COOKIE = 59; - pub const PEERGROUPS = 61; - pub const ZEROCOPY = 62; - pub const TXTIME = 63; - pub const BINDTOIFINDEX = 65; - pub const TIMESTAMP_NEW = 70; - pub const TIMESTAMPNS_NEW = 66; - pub const TIMESTAMPING_NEW = 67; - pub const RCVTIMEO_NEW = 68; - pub const SNDTIMEO_NEW = 69; - pub const DETACH_REUSEPORT_BPF = 71; -} else struct { - pub const DEBUG = 1; - pub const REUSEADDR = 2; - pub const TYPE = 3; - pub const ERROR = 4; - pub const DONTROUTE = 5; - pub const BROADCAST = 6; - pub const SNDBUF = 7; - pub const RCVBUF = 8; - pub const KEEPALIVE = 9; - pub const OOBINLINE = 10; - pub const NO_CHECK = 11; - pub const PRIORITY = 12; - pub const LINGER = 13; - pub const BSDCOMPAT = 14; - pub const REUSEPORT = 15; - pub const PASSCRED = 16; - pub const PEERCRED = 17; - pub const RCVLOWAT = 18; - pub const SNDLOWAT = 19; - pub const RCVTIMEO = 20; - pub const SNDTIMEO = 21; - pub const ACCEPTCONN = 30; - pub const PEERSEC = 31; - pub const SNDBUFFORCE = 32; - pub const RCVBUFFORCE = 33; - pub const PROTOCOL = 38; - pub const DOMAIN = 39; - pub const SECURITY_AUTHENTICATION = 22; - pub const SECURITY_ENCRYPTION_TRANSPORT = 23; - pub const SECURITY_ENCRYPTION_NETWORK = 24; - pub const BINDTODEVICE = 25; - pub const ATTACH_FILTER = 26; - pub const DETACH_FILTER = 27; - pub const GET_FILTER = ATTACH_FILTER; - pub const PEERNAME = 28; - pub const TIMESTAMP_OLD = 29; - pub const PASSSEC = 34; - pub const TIMESTAMPNS_OLD = 35; - pub const MARK = 36; - pub const TIMESTAMPING_OLD = 37; - pub const RXQ_OVFL = 40; - pub const WIFI_STATUS = 41; - pub const PEEK_OFF = 42; - pub const NOFCS = 43; - pub const LOCK_FILTER = 44; - pub const SELECT_ERR_QUEUE = 45; - pub const BUSY_POLL = 46; - pub const MAX_PACING_RATE = 47; - pub const BPF_EXTENSIONS = 48; - pub const INCOMING_CPU = 49; - pub const ATTACH_BPF = 50; - pub const DETACH_BPF = DETACH_FILTER; - pub const ATTACH_REUSEPORT_CBPF = 51; - pub const ATTACH_REUSEPORT_EBPF = 52; - pub const CNX_ADVICE = 53; - pub const MEMINFO = 55; - pub const INCOMING_NAPI_ID = 56; - pub const COOKIE = 57; - pub const PEERGROUPS = 59; - pub const ZEROCOPY = 60; - pub const TXTIME = 61; - pub const BINDTOIFINDEX = 62; - pub const TIMESTAMP_NEW = 63; - pub const TIMESTAMPNS_NEW = 64; - pub const TIMESTAMPING_NEW = 65; - pub const RCVTIMEO_NEW = 66; - pub const SNDTIMEO_NEW = 67; - pub const 
DETACH_REUSEPORT_BPF = 68;
+// Deprecated aliases
+pub const AF = Af;
+pub const PF = Af;
+/// Protocol Family (same values as Address Family)
+pub const Pf = Af;
+/// Address Family
+pub const Af = enum(u16) {
+ unspec = 0,
+ unix = 1,
+ inet = 2,
+ ax25 = 3,
+ ipx = 4,
+ appletalk = 5,
+ netrom = 6,
+ bridge = 7,
+ atmpvc = 8,
+ x25 = 9,
+ inet6 = 10,
+ rose = 11,
+ decnet = 12,
+ netbeui = 13,
+ security = 14,
+ key = 15,
+ route = 16,
+ packet = 17,
+ ash = 18,
+ econet = 19,
+ atmsvc = 20,
+ rds = 21,
+ sna = 22,
+ irda = 23,
+ pppox = 24,
+ wanpipe = 25,
+ llc = 26,
+ ib = 27,
+ mpls = 28,
+ can = 29,
+ tipc = 30,
+ bluetooth = 31,
+ iucv = 32,
+ rxrpc = 33,
+ isdn = 34,
+ phonet = 35,
+ ieee802154 = 36,
+ caif = 37,
+ alg = 38,
+ nfc = 39,
+ vsock = 40,
+ kcm = 41,
+ qipcrtr = 42,
+ smc = 43,
+ xdp = 44,
+ max = 45,
+ _,
+
+ // Aliases
+ pub const local = Af.unix;
+ pub const file = Af.unix;
+ pub const netlink = Af.route;
+
+ // Deprecated constants for backward compatibility
+ pub const UNSPEC: u16 = @intFromEnum(Af.unspec);
+ pub const UNIX: u16 = @intFromEnum(Af.unix);
+ pub const LOCAL: u16 = @intFromEnum(local);
+ pub const FILE: u16 = @intFromEnum(file);
+ pub const INET: u16 = @intFromEnum(Af.inet);
+ pub const AX25: u16 = @intFromEnum(Af.ax25);
+ pub const IPX: u16 = @intFromEnum(Af.ipx);
+ pub const APPLETALK: u16 = @intFromEnum(Af.appletalk);
+ pub const NETROM: u16 = @intFromEnum(Af.netrom);
+ pub const BRIDGE: u16 = @intFromEnum(Af.bridge);
+ pub const ATMPVC: u16 = @intFromEnum(Af.atmpvc);
+ pub const X25: u16 = @intFromEnum(Af.x25);
+ pub const INET6: u16 = @intFromEnum(Af.inet6);
+ pub const ROSE: u16 = @intFromEnum(Af.rose);
+ pub const DECnet: u16 = @intFromEnum(Af.decnet);
+ pub const NETBEUI: u16 = @intFromEnum(Af.netbeui);
+ pub const SECURITY: u16 = @intFromEnum(Af.security);
+ pub const KEY: u16 = @intFromEnum(Af.key);
+ pub const ROUTE: u16 = @intFromEnum(Af.route);
+ pub const NETLINK: u16 = @intFromEnum(netlink);
+ pub const PACKET: u16 = @intFromEnum(Af.packet);
+ pub const ASH: u16 = @intFromEnum(Af.ash);
+ pub const ECONET: u16 = @intFromEnum(Af.econet);
+ pub const ATMSVC: u16 = @intFromEnum(Af.atmsvc);
+ pub const RDS: u16 = @intFromEnum(Af.rds);
+ pub const SNA: u16 = @intFromEnum(Af.sna);
+ pub const IRDA: u16 = @intFromEnum(Af.irda);
+ pub const PPPOX: u16 = @intFromEnum(Af.pppox);
+ pub const WANPIPE: u16 = @intFromEnum(Af.wanpipe);
+ pub const LLC: u16 = @intFromEnum(Af.llc);
+ pub const IB: u16 = @intFromEnum(Af.ib);
+ pub const MPLS: u16 = @intFromEnum(Af.mpls);
+ pub const CAN: u16 = @intFromEnum(Af.can);
+ pub const TIPC: u16 = @intFromEnum(Af.tipc);
+ pub const BLUETOOTH: u16 = @intFromEnum(Af.bluetooth);
+ pub const IUCV: u16 = @intFromEnum(Af.iucv);
+ pub const RXRPC: u16 = @intFromEnum(Af.rxrpc);
+ pub const ISDN: u16 = @intFromEnum(Af.isdn);
+ pub const PHONET: u16 = @intFromEnum(Af.phonet);
+ pub const IEEE802154: u16 = @intFromEnum(Af.ieee802154);
+ pub const CAIF: u16 = @intFromEnum(Af.caif);
+ pub const ALG: u16 = @intFromEnum(Af.alg);
+ pub const NFC: u16 = @intFromEnum(Af.nfc);
+ pub const VSOCK: u16 = @intFromEnum(Af.vsock);
+ pub const KCM: u16 = @intFromEnum(Af.kcm);
+ pub const QIPCRTR: u16 = @intFromEnum(Af.qipcrtr);
+ pub const SMC: u16 = @intFromEnum(Af.smc);
+ pub const XDP: u16 = @intFromEnum(Af.xdp);
+ pub const MAX: u16 = @intFromEnum(Af.max);
+};
+
+/// SO_* type
+pub const So = if (is_mips) enum(u16) {
+ debug = 1,
+ reuseaddr = 0x0004,
+ keepalive = 0x0008,
+ dontroute = 0x0010,
+ broadcast = 0x0020,
+ linger = 0x0080,
+ oobinline = 0x0100, + reuseport = 0x0200, + sndbuf = 0x1001, + rcvbuf = 0x1002, + sndlowat = 0x1003, + rcvlowat = 0x1004, + sndtimeo = 0x1005, + rcvtimeo = 0x1006, + @"error" = 0x1007, + type = 0x1008, + acceptconn = 0x1009, + protocol = 0x1028, + domain = 0x1029, + no_check = 11, + priority = 12, + bsdcompat = 14, + passcred = 17, + peercred = 18, + peersec = 30, + sndbufforce = 31, + rcvbufforce = 33, + security_authentication = 22, + security_encryption_transport = 23, + security_encryption_network = 24, + bindtodevice = 25, + attach_filter = 26, + detach_filter = 27, + peername = 28, + timestamp_old = 29, + passsec = 34, + timestampns_old = 35, + mark = 36, + timestamping_old = 37, + rxq_ovfl = 40, + wifi_status = 41, + peek_off = 42, + nofcs = 43, + lock_filter = 44, + select_err_queue = 45, + busy_poll = 46, + max_pacing_rate = 47, + bpf_extensions = 48, + incoming_cpu = 49, + attach_bpf = 50, + attach_reuseport_cbpf = 51, + attach_reuseport_ebpf = 52, + cnx_advice = 53, + meminfo = 55, + incoming_napi_id = 56, + cookie = 57, + peergroups = 59, + zerocopy = 60, + txtime = 61, + bindtoifindex = 62, + timestamp_new = 63, + timestampns_new = 64, + timestamping_new = 65, + rcvtimeo_new = 66, + sndtimeo_new = 67, + detach_reuseport_bpf = 68, + _, + + // aliases + pub const get_filter: So = .attach_filter; + pub const detach_bpf: So = .detach_filter; +} else if (is_ppc) enum(u16) { + debug = 1, + reuseaddr = 2, + type = 3, + @"error" = 4, + dontroute = 5, + broadcast = 6, + sndbuf = 7, + rcvbuf = 8, + keepalive = 9, + oobinline = 10, + no_check = 11, + priority = 12, + linger = 13, + bsdcompat = 14, + reuseport = 15, + rcvlowat = 16, + sndlowat = 17, + rcvtimeo = 18, + sndtimeo = 19, + passcred = 20, + peercred = 21, + acceptconn = 30, + peersec = 31, + sndbufforce = 32, + rcvbufforce = 33, + protocol = 38, + domain = 39, + security_authentication = 22, + security_encryption_transport = 23, + security_encryption_network = 24, + bindtodevice = 25, + attach_filter = 26, + detach_filter = 27, + peername = 28, + timestamp_old = 29, + passsec = 34, + timestampns_old = 35, + mark = 36, + timestamping_old = 37, + rxq_ovfl = 40, + wifi_status = 41, + peek_off = 42, + nofcs = 43, + lock_filter = 44, + select_err_queue = 45, + busy_poll = 46, + max_pacing_rate = 47, + bpf_extensions = 48, + incoming_cpu = 49, + attach_bpf = 50, + attach_reuseport_cbpf = 51, + attach_reuseport_ebpf = 52, + cnx_advice = 53, + meminfo = 55, + incoming_napi_id = 56, + cookie = 57, + peergroups = 59, + zerocopy = 60, + txtime = 61, + bindtoifindex = 62, + timestamp_new = 63, + timestampns_new = 64, + timestamping_new = 65, + rcvtimeo_new = 66, + sndtimeo_new = 67, + detach_reuseport_bpf = 68, + _, + + // aliases + pub const get_filter: So = .attach_filter; + pub const detach_bpf: So = .detach_filter; +} else if (is_sparc) enum(u16) { + debug = 1, + reuseaddr = 4, + type = 4104, + @"error" = 4103, + dontroute = 16, + broadcast = 32, + sndbuf = 4097, + rcvbuf = 4098, + keepalive = 8, + oobinline = 256, + no_check = 11, + priority = 12, + linger = 128, + bsdcompat = 1024, + reuseport = 512, + passcred = 2, + peercred = 64, + rcvlowat = 2048, + sndlowat = 4096, + rcvtimeo = 8192, + sndtimeo = 16384, + acceptconn = 32768, + peersec = 30, + sndbufforce = 4106, + rcvbufforce = 4107, + protocol = 4136, + domain = 4137, + security_authentication = 20481, + security_encryption_transport = 20482, + security_encryption_network = 20484, + bindtodevice = 13, + attach_filter = 26, + detach_filter = 27, + peername = 28, + 
timestamp_old = 29, + passsec = 31, + timestampns_old = 33, + mark = 34, + timestamping_old = 35, + rxq_ovfl = 36, + wifi_status = 37, + peek_off = 38, + nofcs = 39, + lock_filter = 40, + select_err_queue = 41, + busy_poll = 48, + max_pacing_rate = 49, + bpf_extensions = 50, + incoming_cpu = 51, + attach_bpf = 52, + attach_reuseport_cbpf = 53, + attach_reuseport_ebpf = 54, + cnx_advice = 55, + meminfo = 57, + incoming_napi_id = 58, + cookie = 59, + peergroups = 61, + zerocopy = 62, + txtime = 63, + bindtoifindex = 65, + timestamp_new = 70, + timestampns_new = 66, + timestamping_new = 67, + rcvtimeo_new = 68, + sndtimeo_new = 69, + detach_reuseport_bpf = 71, + _, + + // aliases + pub const get_filter: So = .attach_filter; + pub const detach_bpf: So = .detach_filter; +} else enum(u16) { + debug = 1, + reuseaddr = 2, + type = 3, + @"error" = 4, + dontroute = 5, + broadcast = 6, + sndbuf = 7, + rcvbuf = 8, + keepalive = 9, + oobinline = 10, + no_check = 11, + priority = 12, + linger = 13, + bsdcompat = 14, + reuseport = 15, + passcred = 16, + peercred = 17, + rcvlowat = 18, + sndlowat = 19, + rcvtimeo = 20, + sndtimeo = 21, + acceptconn = 30, + peersec = 31, + sndbufforce = 32, + rcvbufforce = 33, + passsec = 34, + timestampns_old = 35, + mark = 36, + timestamping_old = 37, + protocol = 38, + domain = 39, + rxq_ovfl = 40, + wifi_status = 41, + peek_off = 42, + nofcs = 43, + lock_filter = 44, + select_err_queue = 45, + busy_poll = 46, + max_pacing_rate = 47, + bpf_extensions = 48, + incoming_cpu = 49, + attach_bpf = 50, + attach_reuseport_cbpf = 51, + attach_reuseport_ebpf = 52, + cnx_advice = 53, + meminfo = 55, + incoming_napi_id = 56, + cookie = 57, + peergroups = 59, + zerocopy = 60, + txtime = 61, + bindtoifindex = 62, + timestamp_new = 63, + timestampns_new = 64, + timestamping_new = 65, + rcvtimeo_new = 66, + sndtimeo_new = 67, + detach_reuseport_bpf = 68, + security_authentication = 22, + security_encryption_transport = 23, + security_encryption_network = 24, + bindtodevice = 25, + attach_filter = 26, + detach_filter = 27, + peername = 28, + timestamp_old = 29, + _, + + // aliases + pub const get_filter: So = .attach_filter; + pub const detach_bpf: So = .detach_filter; +}; + +/// Backwards-compatible SO_* constants +pub const SO = struct { + pub const DEBUG: u16 = @intFromEnum(So.debug); + pub const REUSEADDR: u16 = @intFromEnum(So.reuseaddr); + pub const KEEPALIVE: u16 = @intFromEnum(So.keepalive); + pub const DONTROUTE: u16 = @intFromEnum(So.dontroute); + pub const BROADCAST: u16 = @intFromEnum(So.broadcast); + pub const LINGER: u16 = @intFromEnum(So.linger); + pub const OOBINLINE: u16 = @intFromEnum(So.oobinline); + pub const REUSEPORT: u16 = @intFromEnum(So.reuseport); + pub const SNDBUF: u16 = @intFromEnum(So.sndbuf); + pub const RCVBUF: u16 = @intFromEnum(So.rcvbuf); + pub const SNDLOWAT: u16 = @intFromEnum(So.sndlowat); + pub const RCVLOWAT: u16 = @intFromEnum(So.rcvlowat); + pub const RCVTIMEO: u16 = @intFromEnum(So.rcvtimeo); + pub const SNDTIMEO: u16 = @intFromEnum(So.sndtimeo); + pub const ERROR: u16 = @intFromEnum(So.@"error"); + pub const TYPE: u16 = @intFromEnum(So.type); + pub const ACCEPTCONN: u16 = @intFromEnum(So.acceptconn); + pub const PROTOCOL: u16 = @intFromEnum(So.protocol); + pub const DOMAIN: u16 = @intFromEnum(So.domain); + pub const NO_CHECK: u16 = @intFromEnum(So.no_check); + pub const PRIORITY: u16 = @intFromEnum(So.priority); + pub const BSDCOMPAT: u16 = @intFromEnum(So.bsdcompat); + pub const PASSCRED: u16 = @intFromEnum(So.passcred); + pub const PEERCRED: 
u16 = @intFromEnum(So.peercred); + pub const PEERSEC: u16 = @intFromEnum(So.peersec); + pub const SNDBUFFORCE: u16 = @intFromEnum(So.sndbufforce); + pub const RCVBUFFORCE: u16 = @intFromEnum(So.rcvbufforce); + pub const SECURITY_AUTHENTICATION: u16 = @intFromEnum(So.security_authentication); + pub const SECURITY_ENCRYPTION_TRANSPORT: u16 = @intFromEnum(So.security_encryption_transport); + pub const SECURITY_ENCRYPTION_NETWORK: u16 = @intFromEnum(So.security_encryption_network); + pub const BINDTODEVICE: u16 = @intFromEnum(So.bindtodevice); + pub const ATTACH_FILTER: u16 = @intFromEnum(So.attach_filter); + pub const DETACH_FILTER: u16 = @intFromEnum(So.detach_filter); + pub const GET_FILTER: u16 = ATTACH_FILTER; // alias + pub const PEERNAME: u16 = @intFromEnum(So.peername); + pub const TIMESTAMP_OLD: u16 = @intFromEnum(So.timestamp_old); + pub const PASSSEC: u16 = @intFromEnum(So.passsec); + pub const TIMESTAMPNS_OLD: u16 = @intFromEnum(So.timestampns_old); + pub const MARK: u16 = @intFromEnum(So.mark); + pub const TIMESTAMPING_OLD: u16 = @intFromEnum(So.timestamping_old); + pub const RXQ_OVFL: u16 = @intFromEnum(So.rxq_ovfl); + pub const WIFI_STATUS: u16 = @intFromEnum(So.wifi_status); + pub const PEEK_OFF: u16 = @intFromEnum(So.peek_off); + pub const NOFCS: u16 = @intFromEnum(So.nofcs); + pub const LOCK_FILTER: u16 = @intFromEnum(So.lock_filter); + pub const SELECT_ERR_QUEUE: u16 = @intFromEnum(So.select_err_queue); + pub const BUSY_POLL: u16 = @intFromEnum(So.busy_poll); + pub const MAX_PACING_RATE: u16 = @intFromEnum(So.max_pacing_rate); + pub const BPF_EXTENSIONS: u16 = @intFromEnum(So.bpf_extensions); + pub const INCOMING_CPU: u16 = @intFromEnum(So.incoming_cpu); + pub const ATTACH_BPF: u16 = @intFromEnum(So.attach_bpf); + pub const DETACH_BPF: u16 = DETACH_FILTER; // alias in original + pub const ATTACH_REUSEPORT_CBPF: u16 = @intFromEnum(So.attach_reuseport_cbpf); + pub const ATTACH_REUSEPORT_EBPF: u16 = @intFromEnum(So.attach_reuseport_ebpf); + pub const CNX_ADVICE: u16 = @intFromEnum(So.cnx_advice); + pub const MEMINFO: u16 = @intFromEnum(So.meminfo); + pub const INCOMING_NAPI_ID: u16 = @intFromEnum(So.incoming_napi_id); + pub const COOKIE: u16 = @intFromEnum(So.cookie); + pub const PEERGROUPS: u16 = @intFromEnum(So.peergroups); + pub const ZEROCOPY: u16 = @intFromEnum(So.zerocopy); + pub const TXTIME: u16 = @intFromEnum(So.txtime); + pub const BINDTOIFINDEX: u16 = @intFromEnum(So.bindtoifindex); + pub const TIMESTAMP_NEW: u16 = @intFromEnum(So.timestamp_new); + pub const TIMESTAMPNS_NEW: u16 = @intFromEnum(So.timestampns_new); + pub const TIMESTAMPING_NEW: u16 = @intFromEnum(So.timestamping_new); + pub const RCVTIMEO_NEW: u16 = @intFromEnum(So.rcvtimeo_new); + pub const SNDTIMEO_NEW: u16 = @intFromEnum(So.sndtimeo_new); + pub const DETACH_REUSEPORT_BPF: u16 = @intFromEnum(So.detach_reuseport_bpf); }; pub const SCM = struct { @@ -4399,37 +4850,100 @@ pub const SCM = struct { pub const TXTIME = SO.TXTIME; }; -pub const SOL = struct { - pub const SOCKET = if (is_mips or is_sparc) 65535 else 1; - - pub const IP = 0; - pub const IPV6 = 41; - pub const ICMPV6 = 58; - - pub const RAW = 255; - pub const DECNET = 261; - pub const X25 = 262; - pub const PACKET = 263; - pub const ATM = 264; - pub const AAL = 265; - pub const IRDA = 266; - pub const NETBEUI = 267; - pub const LLC = 268; - pub const DCCP = 269; - pub const NETLINK = 270; - pub const TIPC = 271; - pub const RXRPC = 272; - pub const PPPOL2TP = 273; - pub const BLUETOOTH = 274; - pub const PNPIPE = 275; - pub const RDS = 276; - 
pub const IUCV = 277; - pub const CAIF = 278; - pub const ALG = 279; - pub const NFC = 280; - pub const KCM = 281; - pub const TLS = 282; - pub const XDP = 283; +/// Deprecated in favor of Sol +pub const SOL = Sol; +// https://github.com/torvalds/linux/blob/0d97f2067c166eb495771fede9f7b73999c67f66/include/linux/socket.h#L347C1-L388C22 +/// Socket option level for setsockopt(2)/getsockopt(2) +pub const Sol = enum(u16) { + ip = 0, + socket = if (is_mips or is_sparc) 65535 else 1, + tcp = 6, + udp = 17, + ipv6 = 41, + icmpv6 = 58, + sctp = 132, + /// UDP-Lite (RFC 3828) + udplite = 136, + raw = 255, + ipx = 256, + ax25 = 257, + atalk = 258, + netrom = 259, + rose = 260, + decnet = 261, + x25 = 262, + packet = 263, + /// ATM layer (cell level) + atm = 264, + /// ATM Adaption Layer (packet level) + aal = 265, + irda = 266, + netbeui = 267, + llc = 268, + dccp = 269, + netlink = 270, + tipc = 271, + rxrpc = 272, + pppol2tp = 273, + bluetooth = 274, + pnpipe = 275, + rds = 276, + iucv = 277, + caif = 278, + alg = 279, + nfc = 280, + kcm = 281, + tls = 282, + xdp = 283, + mptcp = 284, + mctp = 285, + smc = 286, + vsock = 287, + _, + + /// Deprecated constants for compatibility with current Zig + pub const IP: u16 = @intFromEnum(Sol.ip); + pub const SOCKET: u16 = @intFromEnum(Sol.socket); + pub const TCP: u16 = @intFromEnum(Sol.tcp); + pub const UDP: u16 = @intFromEnum(Sol.udp); + pub const IPV6: u16 = @intFromEnum(Sol.ipv6); + pub const ICMPV6: u16 = @intFromEnum(Sol.icmpv6); + pub const SCTP: u16 = @intFromEnum(Sol.sctp); + pub const UDPLITE: u16 = @intFromEnum(Sol.udplite); + + pub const RAW: u16 = @intFromEnum(Sol.raw); + pub const IPX: u16 = @intFromEnum(Sol.ipx); + pub const AX25: u16 = @intFromEnum(Sol.ax25); + pub const ATALK: u16 = @intFromEnum(Sol.atalk); + pub const NETROM: u16 = @intFromEnum(Sol.netrom); + pub const ROSE: u16 = @intFromEnum(Sol.rose); + pub const DECNET: u16 = @intFromEnum(Sol.decnet); + pub const X25: u16 = @intFromEnum(Sol.x25); + pub const PACKET: u16 = @intFromEnum(Sol.packet); + pub const ATM: u16 = @intFromEnum(Sol.atm); + pub const AAL: u16 = @intFromEnum(Sol.aal); + pub const IRDA: u16 = @intFromEnum(Sol.irda); + pub const NETBEUI: u16 = @intFromEnum(Sol.netbeui); + pub const LLC: u16 = @intFromEnum(Sol.llc); + pub const DCCP: u16 = @intFromEnum(Sol.dccp); + pub const NETLINK: u16 = @intFromEnum(Sol.netlink); + pub const TIPC: u16 = @intFromEnum(Sol.tipc); + pub const RXRPC: u16 = @intFromEnum(Sol.rxrpc); + pub const PPPOL2TP: u16 = @intFromEnum(Sol.pppol2tp); + pub const BLUETOOTH: u16 = @intFromEnum(Sol.bluetooth); + pub const PNPIPE: u16 = @intFromEnum(Sol.pnpipe); + pub const RDS: u16 = @intFromEnum(Sol.rds); + pub const IUCV: u16 = @intFromEnum(Sol.iucv); + pub const CAIF: u16 = @intFromEnum(Sol.caif); + pub const ALG: u16 = @intFromEnum(Sol.alg); + pub const NFC: u16 = @intFromEnum(Sol.nfc); + pub const KCM: u16 = @intFromEnum(Sol.kcm); + pub const TLS: u16 = @intFromEnum(Sol.tls); + pub const XDP: u16 = @intFromEnum(Sol.xdp); + pub const MPTCP: u16 = @intFromEnum(Sol.mptcp); + pub const MCTP: u16 = @intFromEnum(Sol.mctp); + pub const SMC: u16 = @intFromEnum(Sol.smc); + pub const VSOCK: u16 = @intFromEnum(Sol.vsock); }; pub const SOMAXCONN = 128; @@ -4855,28 +5369,87 @@ pub const ETH = struct { }; }; -pub const MSG = struct { - pub const OOB = 0x0001; - pub const PEEK = 0x0002; - pub const DONTROUTE = 0x0004; - pub const CTRUNC = 0x0008; - pub const PROXY = 0x0010; - pub const TRUNC = 0x0020; - pub const DONTWAIT = 0x0040; - pub const EOR = 0x0080; - pub 
const WAITALL = 0x0100;
-    pub const FIN = 0x0200;
-    pub const SYN = 0x0400;
-    pub const CONFIRM = 0x0800;
-    pub const RST = 0x1000;
-    pub const ERRQUEUE = 0x2000;
-    pub const NOSIGNAL = 0x4000;
-    pub const MORE = 0x8000;
-    pub const WAITFORONE = 0x10000;
-    pub const BATCH = 0x40000;
-    pub const ZEROCOPY = 0x4000000;
-    pub const FASTOPEN = 0x20000000;
-    pub const CMSG_CLOEXEC = 0x40000000;
+// Deprecated alias for Msg
+pub const MSG = Msg;
+pub const Msg = packed struct(u32) {
+    /// Process out-of-band data
+    oob: bool = false,
+    /// Peek at incoming message
+    peek: bool = false,
+    /// Send without using routing tables
+    dontroute: bool = false,
+    /// Control data truncated
+    ctrunc: bool = false,
+    /// Do not send. Only probe path (e.g. for MTU)
+    probe: bool = false,
+    /// Normal data truncated
+    trunc: bool = false,
+    /// Nonblocking I/O
+    dontwait: bool = false,
+    /// End of record
+    eor: bool = false,
+    /// Wait for a full request
+    waitall: bool = false,
+    /// FIN flag
+    fin: bool = false,
+    /// SYN flag
+    syn: bool = false,
+    /// Confirm path validity
+    confirm: bool = false,
+    /// RST flag
+    rst: bool = false,
+    /// Fetch message from error queue
+    errqueue: bool = false,
+    /// Do not generate SIGPIPE
+    nosignal: bool = false,
+    /// Sender will send more
+    more: bool = false,
+    /// recvmmsg(): block until 1+ packets available
+    waitforone: bool = false,
+    _18: u1 = 0,
+    /// sendmmsg(): more messages coming
+    batch: bool = false,
+    /// sendpage() internal: page frags are not shared
+    no_shared_frags: bool = false,
+    /// sendpage() internal: page may carry plain text and require encryption
+    sendpage_decrypted: bool = false,
+    _22: u4 = 0,
+    /// Receive devmem skbs as cmsg
+    sock_devmem: bool = false,
+    /// Use user data in kernel path
+    zerocopy: bool = false,
+    /// Splice the pages from the iterator in sendmsg()
+    splice_pages: bool = false,
+    _29: u1 = 0,
+    /// Send data in TCP SYN
+    fastopen: bool = false,
+    /// Set close_on_exec for file descriptor received through SCM_RIGHTS
+    cmsg_cloexec: bool = false,
+    _: u1 = 0,
+
+    // Deprecated constants
+    pub const OOB: u32 = @bitCast(Msg{ .oob = true });
+    pub const PEEK: u32 = @bitCast(Msg{ .peek = true });
+    pub const DONTROUTE: u32 = @bitCast(Msg{ .dontroute = true });
+    pub const CTRUNC: u32 = @bitCast(Msg{ .ctrunc = true });
+    // The kernel calls this bit MSG_PROBE; the old PROXY constant was the
+    // historical glibc name for the same value.
+    pub const PROBE: u32 = @bitCast(Msg{ .probe = true });
+    pub const TRUNC: u32 = @bitCast(Msg{ .trunc = true });
+    pub const DONTWAIT: u32 = @bitCast(Msg{ .dontwait = true });
+    pub const EOR: u32 = @bitCast(Msg{ .eor = true });
+    pub const WAITALL: u32 = @bitCast(Msg{ .waitall = true });
+    pub const FIN: u32 = @bitCast(Msg{ .fin = true });
+    pub const SYN: u32 = @bitCast(Msg{ .syn = true });
+    pub const CONFIRM: u32 = @bitCast(Msg{ .confirm = true });
+    pub const RST: u32 = @bitCast(Msg{ .rst = true });
+    pub const ERRQUEUE: u32 = @bitCast(Msg{ .errqueue = true });
+    pub const NOSIGNAL: u32 = @bitCast(Msg{ .nosignal = true });
+    pub const MORE: u32 = @bitCast(Msg{ .more = true });
+    pub const WAITFORONE: u32 = @bitCast(Msg{ .waitforone = true });
+    pub const BATCH: u32 = @bitCast(Msg{ .batch = true });
+    pub const ZEROCOPY: u32 = @bitCast(Msg{ .zerocopy = true });
+    pub const FASTOPEN: u32 = @bitCast(Msg{ .fastopen = true });
+    pub const CMSG_CLOEXEC: u32 = @bitCast(Msg{ .cmsg_cloexec = true });
 };
 
 pub const DT = struct {
@@ -5340,28 +5913,180 @@ pub const SER = struct {
     };
 };
 
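The hunk above swaps integer bitmask constants for a typed `packed struct(u32)`, so call sites build flags as struct literals and get compile-time checking of flag names. A minimal usage sketch, assuming the raw `sendto` wrapper keeps its current `u32` flags parameter (the post-change wrapper signatures are not shown in this diff); `sendNonblockNoSigpipe` is a hypothetical helper:

const std = @import("std");
const linux = std.os.linux;

fn sendNonblockNoSigpipe(fd: linux.fd_t, bytes: []const u8) usize {
    // Typed flags literal; a misspelled flag name is now a compile error.
    const flags: linux.Msg = .{ .nosignal = true, .dontwait = true };
    // Convert back to the integer form the raw syscall wrapper expects.
    return linux.sendto(fd, bytes.ptr, bytes.len, @bitCast(flags), null, 0);
}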
-pub const EPOLL = struct {
+/// Valid opcodes to issue to sys_epoll_ctl()
+pub const EpollOp = enum(u32) {
+    ctl_add = 1,
+    ctl_del = 2,
+    ctl_mod = 3,
+    _,
+
+    // Deprecated constants
+    pub const CTL_ADD: u32 = @intFromEnum(EpollOp.ctl_add);
+    pub const CTL_DEL: u32 = @intFromEnum(EpollOp.ctl_del);
+    pub const CTL_MOD: u32 = @intFromEnum(EpollOp.ctl_mod);
+};
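The event mask that follows gets the same struct-literal treatment. A sketch of registering a socket for edge-triggered reads, assuming `epoll_ctl` and `epoll_event` keep their current integer-based shapes (`watchReadable` is a hypothetical helper):

const std = @import("std");
const linux = std.os.linux;

fn watchReadable(epfd: i32, sock: i32) usize {
    const events: linux.Epoll = .{ .in = true, .rdhup = true, .et = true };
    var ev: linux.epoll_event = .{
        // Convert the typed mask to the u32 the extern struct stores.
        .events = @bitCast(events),
        .data = .{ .fd = sock },
    };
    return linux.epoll_ctl(epfd, @intFromEnum(linux.EpollOp.ctl_add), sock, &ev);
}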
+
+/// Deprecated alias for Epoll
+pub const EPOLL = Epoll;
+/// Epoll event masks
+// https://github.com/torvalds/linux/blob/18a7e218cfcdca6666e1f7356533e4c988780b57/include/uapi/linux/eventpoll.h#L30
+pub const Epoll = if (is_mips) packed struct(u32) {
+    // EPOLL event types (lower 16 bits)
+    //
+    /// The associated file is available for read(2) operations
+    in: bool = false,
+    /// There is an exceptional condition on the file descriptor
+    pri: bool = false,
+    /// The associated file is available for write(2) operations
+    out: bool = false,
+    /// Error condition happened on the associated file descriptor
+    err: bool = false,
+    /// Hang up happened on the associated file descriptor
+    hup: bool = false,
+    /// Invalid request: fd not open
+    nval: bool = false,
+    /// Normal data may be read
+    rdnorm: bool = false,
+    /// Priority data may be read
+    rdband: bool = false,
+    /// Priority data may be written
+    wrband: bool = false,
+    _10: u1 = 0,
+    /// Message available (unused on Linux)
+    msg: bool = false,
+    _12: u2 = 0,
+    /// Stream socket peer closed connection
+    rdhup: bool = false,
+    _15: u13 = 0,
+    // EPOLL input flags (higher-order bits, including internal kernel state)
+    //
+    /// Internal flag - wakeup generated by io_uring, used to detect
+    /// recursion back into the io_uring poll handler
+    uring_wake: bool = false,
+    /// Set exclusive wakeup mode for the target file descriptor
+    exclusive: bool = false,
+    /// Request the handling of system wakeup events so as to prevent system
+    /// suspends from happening while those events are being processed.
+    /// Assuming neither EPOLLET nor EPOLLONESHOT is set, system suspends will
+    /// not be re-allowed until epoll_wait is called again after consuming the
+    /// wakeup event(s).
+    /// Requires CAP_BLOCK_SUSPEND
+    wakeup: bool = false,
+    /// Set the One Shot behaviour for the target file descriptor
+    oneshot: bool = false,
+    /// Set the Edge Triggered behaviour for the target file descriptor
+    et: bool = false,
+
+    /// Alias for `out` on MIPS
+    /// Writing is now possible (normal data)
+    pub const wrnorm: Epoll = .{ .out = true };
+
+    // Deprecated named constants
+    // EPOLL event types
+    pub const IN: u32 = @bitCast(Epoll{ .in = true });
+    pub const PRI: u32 = @bitCast(Epoll{ .pri = true });
+    pub const OUT: u32 = @bitCast(Epoll{ .out = true });
+    pub const ERR: u32 = @bitCast(Epoll{ .err = true });
+    pub const HUP: u32 = @bitCast(Epoll{ .hup = true });
+    pub const NVAL: u32 = @bitCast(Epoll{ .nval = true });
+    pub const RDNORM: u32 = @bitCast(Epoll{ .rdnorm = true });
+    pub const RDBAND: u32 = @bitCast(Epoll{ .rdband = true });
+    pub const WRNORM: u32 = @bitCast(wrnorm);
+    pub const WRBAND: u32 = @bitCast(Epoll{ .wrband = true });
+    pub const MSG: u32 = @bitCast(Epoll{ .msg = true });
+    pub const RDHUP: u32 = @bitCast(Epoll{ .rdhup = true });
+
+    // EPOLL input flags
+    pub const URING_WAKE: u32 = @bitCast(Epoll{ .uring_wake = true });
+    pub const EXCLUSIVE: u32 = @bitCast(Epoll{ .exclusive = true });
+    pub const WAKEUP: u32 = @bitCast(Epoll{ .wakeup = true });
+    pub const ONESHOT: u32 = @bitCast(Epoll{ .oneshot = true });
+    pub const ET: u32 = @bitCast(Epoll{ .et = true });
+
+    /// Flags for epoll_create1
     pub const CLOEXEC = 1 << @bitOffsetOf(O, "CLOEXEC");
 
-    pub const CTL_ADD = 1;
-    pub const CTL_DEL = 2;
-    pub const CTL_MOD = 3;
+    // Deprecated op constants; use the EpollOp enum instead
+    pub const CTL_ADD: u32 = @intFromEnum(EpollOp.ctl_add);
+    pub const CTL_DEL: u32 = @intFromEnum(EpollOp.ctl_del);
+    pub const CTL_MOD: u32 = @intFromEnum(EpollOp.ctl_mod);
+} else packed struct(u32) {
+    // EPOLL event types (lower 16 bits)
+    //
+    /// The associated file is available for read(2) operations
+    in: bool = false,
+    /// There is an exceptional condition on the file descriptor
+    pri: bool = false,
+    /// The associated file is available for write(2) operations
+    out: bool = false,
+    /// Error condition happened on the associated file descriptor
+    err: bool = false,
+    /// Hang up happened on the associated file descriptor
+    hup: bool = false,
+    /// Invalid request: fd not open
+    nval: bool = false,
+    /// Normal data may be read
+    rdnorm: bool = false,
+    /// Priority data may be read
+    rdband: bool = false,
+    /// Writing is now possible (normal data)
+    wrnorm: bool = false,
+    /// Priority data may be written
+    wrband: bool = false,
+    /// Message available (unused on Linux)
+    msg: bool = false,
+    _12: u2 = 0,
+    /// Stream socket peer closed connection
+    rdhup: bool = false,
+    _15: u13 = 0,
+    // EPOLL input flags (higher-order bits, including internal kernel state)
+    //
+    /// Internal flag - wakeup generated by io_uring, used to detect
+    /// recursion back into the io_uring poll handler
+    uring_wake: bool = false,
+    /// Set exclusive wakeup mode for the target file descriptor
+    exclusive: bool = false,
+    /// Request the handling of system wakeup events so as to prevent system
+    /// suspends from happening while those events are being processed.
+    /// Assuming neither EPOLLET nor EPOLLONESHOT is set, system suspends will
+    /// not be re-allowed until epoll_wait is called again after consuming the
+    /// wakeup event(s).
+    /// Requires CAP_BLOCK_SUSPEND
+    wakeup: bool = false,
+    /// Set the One Shot behaviour for the target file descriptor
+    oneshot: bool = false,
+    /// Set the Edge Triggered behaviour for the target file descriptor
+    et: bool = false,
+
+    // Deprecated named constants
+    // EPOLL event types
+    pub const IN: u32 = @bitCast(Epoll{ .in = true });
+    pub const PRI: u32 = @bitCast(Epoll{ .pri = true });
+    pub const OUT: u32 = @bitCast(Epoll{ .out = true });
+    pub const ERR: u32 = @bitCast(Epoll{ .err = true });
+    pub const HUP: u32 = @bitCast(Epoll{ .hup = true });
+    pub const NVAL: u32 = @bitCast(Epoll{ .nval = true });
+    pub const RDNORM: u32 = @bitCast(Epoll{ .rdnorm = true });
+    pub const RDBAND: u32 = @bitCast(Epoll{ .rdband = true });
+    pub const WRNORM: u32 = @bitCast(Epoll{ .wrnorm = true });
+    pub const WRBAND: u32 = @bitCast(Epoll{ .wrband = true });
+    pub const MSG: u32 = @bitCast(Epoll{ .msg = true });
+    pub const RDHUP: u32 = @bitCast(Epoll{ .rdhup = true });
+
+    // EPOLL input flags
+    pub const URING_WAKE: u32 = @bitCast(Epoll{ .uring_wake = true });
+    pub const EXCLUSIVE: u32 = @bitCast(Epoll{ .exclusive = true });
+    pub const WAKEUP: u32 = @bitCast(Epoll{ .wakeup = true });
+    pub const ONESHOT: u32 = @bitCast(Epoll{ .oneshot = true });
+    pub const ET: u32 = @bitCast(Epoll{ .et = true });
+
+    /// Flags for epoll_create1
+    pub const CLOEXEC = 1 << @bitOffsetOf(O, "CLOEXEC");
 
-    pub const IN = 0x001;
-    pub const PRI = 0x002;
-    pub const OUT = 0x004;
-    pub const RDNORM = 0x040;
-    pub const RDBAND = 0x080;
-    pub const WRNORM = if (is_mips) 0x004 else 0x100;
-    pub const WRBAND = if (is_mips) 0x100 else 0x200;
-    pub const MSG = 0x400;
-    pub const ERR = 0x008;
-    pub const HUP = 0x010;
-    pub const RDHUP = 0x2000;
-    pub const EXCLUSIVE = (@as(u32, 1) << 28);
-    pub const WAKEUP = (@as(u32, 1) << 29);
-    pub const ONESHOT = (@as(u32, 1) << 30);
-    pub const ET = (@as(u32, 1) << 31);
+    // Deprecated op constants; use the EpollOp enum instead
+    pub const CTL_ADD: u32 = @intFromEnum(EpollOp.ctl_add);
+    pub const CTL_DEL: u32 = @intFromEnum(EpollOp.ctl_del);
+    pub const CTL_MOD: u32 = @intFromEnum(EpollOp.ctl_mod);
 };
 
 pub const CLOCK = clockid_t;
@@ -5864,6 +6589,7 @@ pub const signalfd_siginfo = extern struct {
 };
 
 pub const in_port_t = u16;
+// TODO: change to AF type
 pub const sa_family_t = u16;
 pub const socklen_t = u32;
 
@@ -5884,7 +6610,7 @@ pub const sockaddr = extern struct {
 
     /// IPv4 socket address
     pub const in = extern struct {
-        family: sa_family_t = AF.INET,
+        family: sa_family_t = Af.INET,
         port: in_port_t,
         addr: u32,
         zero: [8]u8 = [8]u8{ 0, 0, 0, 0, 0, 0, 0, 0 },
@@ -5892,7 +6618,7 @@ pub const sockaddr = extern struct {
 
     /// IPv6 socket address
     pub const in6 = extern struct {
-        family: sa_family_t = AF.INET6,
+        family: sa_family_t = Af.INET6,
         port: in_port_t,
         flowinfo: u32,
         addr: [16]u8,
@@ -5901,13 +6627,13 @@ pub const sockaddr = extern struct {
 
     /// UNIX domain socket address
     pub const un = extern struct {
-        family: sa_family_t = AF.UNIX,
+        family: sa_family_t = Af.UNIX,
         path: [108]u8,
     };
 
     /// Packet socket address
     pub const ll = extern struct {
-        family: sa_family_t = AF.PACKET,
+        family: sa_family_t = Af.PACKET,
         protocol: u16,
         ifindex: i32,
         hatype: u16,
@@ -5918,7 +6644,7 @@ pub const sockaddr = extern struct {
 
     /// Netlink socket address
     pub const nl = extern struct {
-        family: sa_family_t = AF.NETLINK,
+        family: sa_family_t = Af.NETLINK,
         __pad1: c_ushort = 0,
 
         /// port ID
@@ -5929,7 +6655,7 @@ pub const sockaddr = extern struct {
     };
 
     pub const xdp = extern struct {
-        family: u16 = AF.XDP,
+        
family: u16 = Af.XDP, flags: u16, ifindex: u32, queue_id: u32, @@ -5938,7 +6664,7 @@ pub const sockaddr = extern struct { /// Address structure for vSockets pub const vm = extern struct { - family: sa_family_t = AF.VSOCK, + family: sa_family_t = Af.VSOCK, reserved1: u16 = 0, port: u32, cid: u32, @@ -6275,667 +7001,6 @@ else fields: siginfo_fields_union, }; -// io_uring_params.flags - -/// io_context is polled -pub const IORING_SETUP_IOPOLL = 1 << 0; - -/// SQ poll thread -pub const IORING_SETUP_SQPOLL = 1 << 1; - -/// sq_thread_cpu is valid -pub const IORING_SETUP_SQ_AFF = 1 << 2; - -/// app defines CQ size -pub const IORING_SETUP_CQSIZE = 1 << 3; - -/// clamp SQ/CQ ring sizes -pub const IORING_SETUP_CLAMP = 1 << 4; - -/// attach to existing wq -pub const IORING_SETUP_ATTACH_WQ = 1 << 5; - -/// start with ring disabled -pub const IORING_SETUP_R_DISABLED = 1 << 6; - -/// continue submit on error -pub const IORING_SETUP_SUBMIT_ALL = 1 << 7; - -/// Cooperative task running. When requests complete, they often require -/// forcing the submitter to transition to the kernel to complete. If this -/// flag is set, work will be done when the task transitions anyway, rather -/// than force an inter-processor interrupt reschedule. This avoids interrupting -/// a task running in userspace, and saves an IPI. -pub const IORING_SETUP_COOP_TASKRUN = 1 << 8; - -/// If COOP_TASKRUN is set, get notified if task work is available for -/// running and a kernel transition would be needed to run it. This sets -/// IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. -pub const IORING_SETUP_TASKRUN_FLAG = 1 << 9; - -/// SQEs are 128 byte -pub const IORING_SETUP_SQE128 = 1 << 10; -/// CQEs are 32 byte -pub const IORING_SETUP_CQE32 = 1 << 11; - -/// Only one task is allowed to submit requests -pub const IORING_SETUP_SINGLE_ISSUER = 1 << 12; - -/// Defer running task work to get events. -/// Rather than running bits of task work whenever the task transitions -/// try to do it just before it is needed. -pub const IORING_SETUP_DEFER_TASKRUN = 1 << 13; - -/// Application provides ring memory -pub const IORING_SETUP_NO_MMAP = 1 << 14; - -/// Register the ring fd in itself for use with -/// IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather -/// than an fd. -pub const IORING_SETUP_REGISTERED_FD_ONLY = 1 << 15; - -/// Removes indirection through the SQ index array. -pub const IORING_SETUP_NO_SQARRAY = 1 << 16; - -/// IO submission data structure (Submission Queue Entry) -pub const io_uring_sqe = @import("linux/io_uring_sqe.zig").io_uring_sqe; - -pub const IoUring = @import("linux/IoUring.zig"); - -/// If sqe->file_index is set to this for opcodes that instantiate a new -/// direct descriptor (like openat/openat2/accept), then io_uring will allocate -/// an available direct descriptor instead of having the application pass one -/// in. The picked direct descriptor will be returned in cqe->res, or -ENFILE -/// if the space is full. 
-/// Available since Linux 5.19 -pub const IORING_FILE_INDEX_ALLOC = maxInt(u32); - -pub const IOSQE_BIT = enum(u8) { - FIXED_FILE, - IO_DRAIN, - IO_LINK, - IO_HARDLINK, - ASYNC, - BUFFER_SELECT, - CQE_SKIP_SUCCESS, - - _, -}; - -// io_uring_sqe.flags - -/// use fixed fileset -pub const IOSQE_FIXED_FILE = 1 << @intFromEnum(IOSQE_BIT.FIXED_FILE); - -/// issue after inflight IO -pub const IOSQE_IO_DRAIN = 1 << @intFromEnum(IOSQE_BIT.IO_DRAIN); - -/// links next sqe -pub const IOSQE_IO_LINK = 1 << @intFromEnum(IOSQE_BIT.IO_LINK); - -/// like LINK, but stronger -pub const IOSQE_IO_HARDLINK = 1 << @intFromEnum(IOSQE_BIT.IO_HARDLINK); - -/// always go async -pub const IOSQE_ASYNC = 1 << @intFromEnum(IOSQE_BIT.ASYNC); - -/// select buffer from buf_group -pub const IOSQE_BUFFER_SELECT = 1 << @intFromEnum(IOSQE_BIT.BUFFER_SELECT); - -/// don't post CQE if request succeeded -/// Available since Linux 5.17 -pub const IOSQE_CQE_SKIP_SUCCESS = 1 << @intFromEnum(IOSQE_BIT.CQE_SKIP_SUCCESS); - -pub const IORING_OP = enum(u8) { - NOP, - READV, - WRITEV, - FSYNC, - READ_FIXED, - WRITE_FIXED, - POLL_ADD, - POLL_REMOVE, - SYNC_FILE_RANGE, - SENDMSG, - RECVMSG, - TIMEOUT, - TIMEOUT_REMOVE, - ACCEPT, - ASYNC_CANCEL, - LINK_TIMEOUT, - CONNECT, - FALLOCATE, - OPENAT, - CLOSE, - FILES_UPDATE, - STATX, - READ, - WRITE, - FADVISE, - MADVISE, - SEND, - RECV, - OPENAT2, - EPOLL_CTL, - SPLICE, - PROVIDE_BUFFERS, - REMOVE_BUFFERS, - TEE, - SHUTDOWN, - RENAMEAT, - UNLINKAT, - MKDIRAT, - SYMLINKAT, - LINKAT, - MSG_RING, - FSETXATTR, - SETXATTR, - FGETXATTR, - GETXATTR, - SOCKET, - URING_CMD, - SEND_ZC, - SENDMSG_ZC, - READ_MULTISHOT, - WAITID, - FUTEX_WAIT, - FUTEX_WAKE, - FUTEX_WAITV, - FIXED_FD_INSTALL, - FTRUNCATE, - BIND, - LISTEN, - RECV_ZC, - - _, -}; -// io_uring_sqe.uring_cmd_flags (rw_flags in the Zig struct) - -/// use registered buffer; pass thig flag along with setting sqe->buf_index. -pub const IORING_URING_CMD_FIXED = 1 << 0; - -// io_uring_sqe.fsync_flags (rw_flags in the Zig struct) -pub const IORING_FSYNC_DATASYNC = 1 << 0; - -// io_uring_sqe.timeout_flags (rw_flags in the Zig struct) -pub const IORING_TIMEOUT_ABS = 1 << 0; -pub const IORING_TIMEOUT_UPDATE = 1 << 1; // Available since Linux 5.11 -pub const IORING_TIMEOUT_BOOTTIME = 1 << 2; // Available since Linux 5.15 -pub const IORING_TIMEOUT_REALTIME = 1 << 3; // Available since Linux 5.15 -pub const IORING_LINK_TIMEOUT_UPDATE = 1 << 4; // Available since Linux 5.15 -pub const IORING_TIMEOUT_ETIME_SUCCESS = 1 << 5; // Available since Linux 5.16 -pub const IORING_TIMEOUT_CLOCK_MASK = IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME; -pub const IORING_TIMEOUT_UPDATE_MASK = IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE; - -// io_uring_sqe.splice_flags (rw_flags in the Zig struct) -// extends splice(2) flags -pub const IORING_SPLICE_F_FD_IN_FIXED = 1 << 31; - -// POLL_ADD flags. -// Note that since sqe->poll_events (rw_flags in the Zig struct) is the flag space, the command flags for POLL_ADD are stored in sqe->len. - -/// Multishot poll. Sets IORING_CQE_F_MORE if the poll handler will continue to report CQEs on behalf of the same SQE. -pub const IORING_POLL_ADD_MULTI = 1 << 0; -/// Update existing poll request, matching sqe->addr as the old user_data field. -pub const IORING_POLL_UPDATE_EVENTS = 1 << 1; -pub const IORING_POLL_UPDATE_USER_DATA = 1 << 2; -pub const IORING_POLL_ADD_LEVEL = 1 << 3; - -// ASYNC_CANCEL flags. 
- -/// Cancel all requests that match the given key -pub const IORING_ASYNC_CANCEL_ALL = 1 << 0; -/// Key off 'fd' for cancelation rather than the request 'user_data'. -pub const IORING_ASYNC_CANCEL_FD = 1 << 1; -/// Match any request -pub const IORING_ASYNC_CANCEL_ANY = 1 << 2; -/// 'fd' passed in is a fixed descriptor. Available since Linux 6.0 -pub const IORING_ASYNC_CANCEL_FD_FIXED = 1 << 3; - -// send/sendmsg and recv/recvmsg flags (sqe->ioprio) - -/// If set, instead of first attempting to send or receive and arm poll if that yields an -EAGAIN result, -/// arm poll upfront and skip the initial transfer attempt. -pub const IORING_RECVSEND_POLL_FIRST = 1 << 0; -/// Multishot recv. Sets IORING_CQE_F_MORE if the handler will continue to report CQEs on behalf of the same SQE. -pub const IORING_RECV_MULTISHOT = 1 << 1; -/// Use registered buffers, the index is stored in the buf_index field. -pub const IORING_RECVSEND_FIXED_BUF = 1 << 2; -/// If set, SEND[MSG]_ZC should report the zerocopy usage in cqe.res for the IORING_CQE_F_NOTIF cqe. -pub const IORING_SEND_ZC_REPORT_USAGE = 1 << 3; -/// If set, send or recv will grab as many buffers from the buffer group ID given and send them all. -/// The completion result will be the number of buffers send, with the starting buffer ID in cqe as per usual. -/// The buffers be contigious from the starting buffer ID. -/// Used with IOSQE_BUFFER_SELECT. -pub const IORING_RECVSEND_BUNDLE = 1 << 4; -/// CQE.RES FOR IORING_CQE_F_NOTIF if IORING_SEND_ZC_REPORT_USAGE was requested -pub const IORING_NOTIF_USAGE_ZC_COPIED = 1 << 31; - -/// accept flags stored in sqe->iopri -pub const IORING_ACCEPT_MULTISHOT = 1 << 0; - -/// IORING_OP_MSG_RING command types, stored in sqe->addr -pub const IORING_MSG_RING_COMMAND = enum(u8) { - /// pass sqe->len as 'res' and off as user_data - DATA, - /// send a registered fd to another ring - SEND_FD, -}; - -// io_uring_sqe.msg_ring_flags (rw_flags in the Zig struct) - -/// Don't post a CQE to the target ring. Not applicable for IORING_MSG_DATA, obviously. -pub const IORING_MSG_RING_CQE_SKIP = 1 << 0; - -/// Pass through the flags from sqe->file_index (splice_fd_in in the zig struct) to cqe->flags */ -pub const IORING_MSG_RING_FLAGS_PASS = 1 << 1; - -// IO completion data structure (Completion Queue Entry) -pub const io_uring_cqe = extern struct { - /// io_uring_sqe.data submission passed back - user_data: u64, - - /// result code for this event - res: i32, - flags: u32, - - // Followed by 16 bytes of padding if initialized with IORING_SETUP_CQE32, doubling cqe size - - pub fn err(self: io_uring_cqe) E { - if (self.res > -4096 and self.res < 0) { - return @as(E, @enumFromInt(-self.res)); - } - return .SUCCESS; - } - - // On successful completion of the provided buffers IO request, the CQE flags field - // will have IORING_CQE_F_BUFFER set and the selected buffer ID will be indicated by - // the upper 16-bits of the flags field. - pub fn buffer_id(self: io_uring_cqe) !u16 { - if (self.flags & IORING_CQE_F_BUFFER != IORING_CQE_F_BUFFER) { - return error.NoBufferSelected; - } - return @as(u16, @intCast(self.flags >> IORING_CQE_BUFFER_SHIFT)); - } -}; - -// io_uring_cqe.flags - -/// If set, the upper 16 bits are the buffer ID -pub const IORING_CQE_F_BUFFER = 1 << 0; -/// If set, parent SQE will generate more CQE entries. -/// Available since Linux 5.13. -pub const IORING_CQE_F_MORE = 1 << 1; -/// If set, more data to read after socket recv -pub const IORING_CQE_F_SOCK_NONEMPTY = 1 << 2; -/// Set for notification CQEs. 
Can be used to distinct them from sends. -pub const IORING_CQE_F_NOTIF = 1 << 3; -/// If set, the buffer ID set in the completion will get more completions. -pub const IORING_CQE_F_BUF_MORE = 1 << 4; - -pub const IORING_CQE_BUFFER_SHIFT = 16; - -/// Magic offsets for the application to mmap the data it needs -pub const IORING_OFF_SQ_RING = 0; -pub const IORING_OFF_CQ_RING = 0x8000000; -pub const IORING_OFF_SQES = 0x10000000; - -/// Filled with the offset for mmap(2) -pub const io_sqring_offsets = extern struct { - /// offset of ring head - head: u32, - - /// offset of ring tail - tail: u32, - - /// ring mask value - ring_mask: u32, - - /// entries in ring - ring_entries: u32, - - /// ring flags - flags: u32, - - /// number of sqes not submitted - dropped: u32, - - /// sqe index array - array: u32, - - resv1: u32, - user_addr: u64, -}; - -// io_sqring_offsets.flags - -/// needs io_uring_enter wakeup -pub const IORING_SQ_NEED_WAKEUP = 1 << 0; -/// kernel has cqes waiting beyond the cq ring -pub const IORING_SQ_CQ_OVERFLOW = 1 << 1; -/// task should enter the kernel -pub const IORING_SQ_TASKRUN = 1 << 2; - -pub const io_cqring_offsets = extern struct { - head: u32, - tail: u32, - ring_mask: u32, - ring_entries: u32, - overflow: u32, - cqes: u32, - flags: u32, - resv: u32, - user_addr: u64, -}; - -// io_cqring_offsets.flags - -/// disable eventfd notifications -pub const IORING_CQ_EVENTFD_DISABLED = 1 << 0; - -// io_uring_enter flags -pub const IORING_ENTER_GETEVENTS = 1 << 0; -pub const IORING_ENTER_SQ_WAKEUP = 1 << 1; -pub const IORING_ENTER_SQ_WAIT = 1 << 2; -pub const IORING_ENTER_EXT_ARG = 1 << 3; -pub const IORING_ENTER_REGISTERED_RING = 1 << 4; - -pub const io_uring_params = extern struct { - sq_entries: u32, - cq_entries: u32, - flags: u32, - sq_thread_cpu: u32, - sq_thread_idle: u32, - features: u32, - wq_fd: u32, - resv: [3]u32, - sq_off: io_sqring_offsets, - cq_off: io_cqring_offsets, -}; - -// io_uring_params.features flags - -pub const IORING_FEAT_SINGLE_MMAP = 1 << 0; -pub const IORING_FEAT_NODROP = 1 << 1; -pub const IORING_FEAT_SUBMIT_STABLE = 1 << 2; -pub const IORING_FEAT_RW_CUR_POS = 1 << 3; -pub const IORING_FEAT_CUR_PERSONALITY = 1 << 4; -pub const IORING_FEAT_FAST_POLL = 1 << 5; -pub const IORING_FEAT_POLL_32BITS = 1 << 6; -pub const IORING_FEAT_SQPOLL_NONFIXED = 1 << 7; -pub const IORING_FEAT_EXT_ARG = 1 << 8; -pub const IORING_FEAT_NATIVE_WORKERS = 1 << 9; -pub const IORING_FEAT_RSRC_TAGS = 1 << 10; -pub const IORING_FEAT_CQE_SKIP = 1 << 11; -pub const IORING_FEAT_LINKED_FILE = 1 << 12; - -// io_uring_register opcodes and arguments -pub const IORING_REGISTER = enum(u32) { - REGISTER_BUFFERS, - UNREGISTER_BUFFERS, - REGISTER_FILES, - UNREGISTER_FILES, - REGISTER_EVENTFD, - UNREGISTER_EVENTFD, - REGISTER_FILES_UPDATE, - REGISTER_EVENTFD_ASYNC, - REGISTER_PROBE, - REGISTER_PERSONALITY, - UNREGISTER_PERSONALITY, - REGISTER_RESTRICTIONS, - REGISTER_ENABLE_RINGS, - - // extended with tagging - REGISTER_FILES2, - REGISTER_FILES_UPDATE2, - REGISTER_BUFFERS2, - REGISTER_BUFFERS_UPDATE, - - // set/clear io-wq thread affinities - REGISTER_IOWQ_AFF, - UNREGISTER_IOWQ_AFF, - - // set/get max number of io-wq workers - REGISTER_IOWQ_MAX_WORKERS, - - // register/unregister io_uring fd with the ring - REGISTER_RING_FDS, - UNREGISTER_RING_FDS, - - // register ring based provide buffer group - REGISTER_PBUF_RING, - UNREGISTER_PBUF_RING, - - // sync cancelation API - REGISTER_SYNC_CANCEL, - - // register a range of fixed file slots for automatic slot allocation - REGISTER_FILE_ALLOC_RANGE, 
- - // return status information for a buffer group - REGISTER_PBUF_STATUS, - - // set/clear busy poll settings - REGISTER_NAPI, - UNREGISTER_NAPI, - - REGISTER_CLOCK, - - // clone registered buffers from source ring to current ring - REGISTER_CLONE_BUFFERS, - - // send MSG_RING without having a ring - REGISTER_SEND_MSG_RING, - - // register a netdev hw rx queue for zerocopy - REGISTER_ZCRX_IFQ, - - // resize CQ ring - REGISTER_RESIZE_RINGS, - - REGISTER_MEM_REGION, - - // flag added to the opcode to use a registered ring fd - REGISTER_USE_REGISTERED_RING = 1 << 31, - - _, -}; - -/// io_uring_restriction->opcode values -pub const IOWQ_CATEGORIES = enum(u8) { - BOUND, - UNBOUND, -}; - -/// deprecated, see struct io_uring_rsrc_update -pub const io_uring_files_update = extern struct { - offset: u32, - resv: u32, - fds: u64, -}; - -/// Register a fully sparse file space, rather than pass in an array of all -1 file descriptors. -pub const IORING_RSRC_REGISTER_SPARSE = 1 << 0; - -pub const io_uring_rsrc_register = extern struct { - nr: u32, - flags: u32, - resv2: u64, - data: u64, - tags: u64, -}; - -pub const io_uring_rsrc_update = extern struct { - offset: u32, - resv: u32, - data: u64, -}; - -pub const io_uring_rsrc_update2 = extern struct { - offset: u32, - resv: u32, - data: u64, - tags: u64, - nr: u32, - resv2: u32, -}; - -pub const io_uring_notification_slot = extern struct { - tag: u64, - resv: [3]u64, -}; - -pub const io_uring_notification_register = extern struct { - nr_slots: u32, - resv: u32, - resv2: u64, - data: u64, - resv3: u64, -}; - -pub const io_uring_napi = extern struct { - busy_poll_to: u32, - prefer_busy_poll: u8, - _pad: [3]u8, - resv: u64, -}; - -/// Skip updating fd indexes set to this value in the fd table */ -pub const IORING_REGISTER_FILES_SKIP = -2; - -pub const IO_URING_OP_SUPPORTED = 1 << 0; - -pub const io_uring_probe_op = extern struct { - op: IORING_OP, - resv: u8, - /// IO_URING_OP_* flags - flags: u16, - resv2: u32, - - pub fn is_supported(self: @This()) bool { - return self.flags & IO_URING_OP_SUPPORTED != 0; - } -}; - -pub const io_uring_probe = extern struct { - /// Last opcode supported - last_op: IORING_OP, - /// Length of ops[] array below - ops_len: u8, - resv: u16, - resv2: [3]u32, - ops: [256]io_uring_probe_op, - - /// Is the operation supported on the running kernel. 
-    pub fn is_supported(self: @This(), op: IORING_OP) bool {
-        const i = @intFromEnum(op);
-        if (i > @intFromEnum(self.last_op) or i >= self.ops_len)
-            return false;
-        return self.ops[i].is_supported();
-    }
-};
-
-pub const io_uring_restriction = extern struct {
-    opcode: IORING_RESTRICTION,
-    arg: extern union {
-        /// IORING_RESTRICTION_REGISTER_OP
-        register_op: IORING_REGISTER,
-
-        /// IORING_RESTRICTION_SQE_OP
-        sqe_op: IORING_OP,
-
-        /// IORING_RESTRICTION_SQE_FLAGS_*
-        sqe_flags: u8,
-    },
-    resv: u8,
-    resv2: [3]u32,
-};
-
-/// io_uring_restriction->opcode values
-pub const IORING_RESTRICTION = enum(u16) {
-    /// Allow an io_uring_register(2) opcode
-    REGISTER_OP = 0,
-
-    /// Allow an sqe opcode
-    SQE_OP = 1,
-
-    /// Allow sqe flags
-    SQE_FLAGS_ALLOWED = 2,
-
-    /// Require sqe flags (these flags must be set on each submission)
-    SQE_FLAGS_REQUIRED = 3,
-
-    _,
-};
-
-pub const IO_URING_SOCKET_OP = enum(u16) {
-    SIOCIN = 0,
-    SIOCOUTQ = 1,
-    GETSOCKOPT = 2,
-    SETSOCKOPT = 3,
-};
-
-pub const io_uring_buf = extern struct {
-    addr: u64,
-    len: u32,
-    bid: u16,
-    resv: u16,
-};
-
-pub const io_uring_buf_ring = extern struct {
-    resv1: u64,
-    resv2: u32,
-    resv3: u16,
-    tail: u16,
-};
-
-/// argument for IORING_(UN)REGISTER_PBUF_RING
-pub const io_uring_buf_reg = extern struct {
-    ring_addr: u64,
-    ring_entries: u32,
-    bgid: u16,
-    flags: Flags,
-    resv: [3]u64,
-
-    pub const Flags = packed struct {
-        _0: u1 = 0,
-        /// Incremental buffer consumption.
-        inc: bool,
-        _: u14 = 0,
-    };
-};
-
-pub const io_uring_getevents_arg = extern struct {
-    sigmask: u64,
-    sigmask_sz: u32,
-    pad: u32,
-    ts: u64,
-};
-
-/// Argument for IORING_REGISTER_SYNC_CANCEL
-pub const io_uring_sync_cancel_reg = extern struct {
-    addr: u64,
-    fd: i32,
-    flags: u32,
-    timeout: kernel_timespec,
-    pad: [4]u64,
-};
-
-/// Argument for IORING_REGISTER_FILE_ALLOC_RANGE
-/// The range is specified as [off, off + len)
-pub const io_uring_file_index_range = extern struct {
-    off: u32,
-    len: u32,
-    resv: u64,
-};
-
-pub const io_uring_recvmsg_out = extern struct {
-    namelen: u32,
-    controllen: u32,
-    payloadlen: u32,
-    flags: u32,
-};
-
 pub const utsname = extern struct {
     sysname: [64:0]u8,
     nodename: [64:0]u8,
@@ -6946,27 +7011,27 @@ pub const utsname = extern struct {
 };
 
 pub const HOST_NAME_MAX = 64;
-pub const STATX_TYPE = 0x0001;
-pub const STATX_MODE = 0x0002;
-pub const STATX_NLINK = 0x0004;
-pub const STATX_UID = 0x0008;
-pub const STATX_GID = 0x0010;
-pub const STATX_ATIME = 0x0020;
-pub const STATX_MTIME = 0x0040;
-pub const STATX_CTIME = 0x0080;
-pub const STATX_INO = 0x0100;
-pub const STATX_SIZE = 0x0200;
-pub const STATX_BLOCKS = 0x0400;
-pub const STATX_BASIC_STATS = 0x07ff;
-
-pub const STATX_BTIME = 0x0800;
-
-pub const STATX_ATTR_COMPRESSED = 0x0004;
-pub const STATX_ATTR_IMMUTABLE = 0x0010;
-pub const STATX_ATTR_APPEND = 0x0020;
-pub const STATX_ATTR_NODUMP = 0x0040;
-pub const STATX_ATTR_ENCRYPTED = 0x0800;
-pub const STATX_ATTR_AUTOMOUNT = 0x1000;
+pub const Rename = packed struct(u32) {
+    /// Don't overwrite target
+    noreplace: bool = false,
+    /// Exchange source and dest
+    exchange: bool = false,
+    /// Whiteout source
+    whiteout: bool = false,
+    _: u29 = 0,
+};
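`Rename` mirrors the kernel's RENAME_* bits for renameat2(2). A usage sketch (not part of this diff), assuming the `renameat2` wrapper keeps its current integer flags parameter; `exchangePaths` is a hypothetical helper:

const std = @import("std");
const linux = std.os.linux;

fn exchangePaths(dir_fd: i32, a: [*:0]const u8, b: [*:0]const u8) usize {
    // Atomically swap the two paths; neither may be missing.
    const flags: linux.Rename = .{ .exchange = true };
    return linux.renameat2(dir_fd, a, dir_fd, b, @bitCast(flags));
}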
+
+/// By default (i.e., flags is .{}), the extended attribute will be created
+/// if it does not exist, or the value will be replaced if the attribute
+/// already exists. To modify these semantics, one of the fields in `SetXattr`
+/// can be specified in flags.
+/// Matches XATTR_* in kernel
+pub const SetXattr = packed struct(u32) {
+    /// set value, fail if attr already exists
+    create: bool = false,
+    /// set value, fail if attr does not exist
+    replace: bool = false,
+    _: u30 = 0,
+};
 
 pub const statx_timestamp = extern struct {
     sec: i64,
@@ -6977,13 +7042,13 @@ pub const statx_timestamp = extern struct {
 
 /// Renamed to `Statx` to not conflict with the `statx` function.
 pub const Statx = extern struct {
     /// Mask of bits indicating filled fields
-    mask: u32,
+    mask: Mask,
 
     /// Block size for filesystem I/O
     blksize: u32,
 
     /// Extra file attribute indicators
-    attributes: u64,
+    attributes: Attr,
 
     /// Number of hard links
     nlink: u32,
@@ -7008,7 +7073,7 @@ pub const Statx = extern struct {
     blocks: u64,
 
     /// Mask to show what's supported in `attributes`.
-    attributes_mask: u64,
+    attributes_mask: Attr,
 
     /// Last access file timestamp
     atime: statx_timestamp,
@@ -7035,8 +7100,127 @@ pub const Statx = extern struct {
     dev_minor: u32,
 
     __pad2: [14]u64,
+
+    // https://github.com/torvalds/linux/blob/755fa5b4fb36627796af19932a432d343220ec63/include/uapi/linux/stat.h#L203
+    /// matches STATX_* in kernel
+    pub const Mask = packed struct(u32) {
+        /// Want/got stx_mode & S_IFMT
+        type: bool = false,
+        /// Want/got stx_mode & ~S_IFMT
+        mode: bool = false,
+        /// Want/got stx_nlink
+        nlink: bool = false,
+        /// Want/got stx_uid
+        uid: bool = false,
+        /// Want/got stx_gid
+        gid: bool = false,
+        /// Want/got stx_atime
+        atime: bool = false,
+        /// Want/got stx_mtime
+        mtime: bool = false,
+        /// Want/got stx_ctime
+        ctime: bool = false,
+        /// Want/got stx_ino
+        ino: bool = false,
+        /// Want/got stx_size
+        size: bool = false,
+        /// Want/got stx_blocks
+        blocks: bool = false,
+        /// Want/got stx_btime
+        btime: bool = false,
+        /// Got stx_mnt_id
+        mnt_id: bool = false,
+        /// Want/got direct I/O alignment info
+        dioalign: bool = false,
+        /// Want/got extended stx_mount_id
+        mnt_id_unique: bool = false,
+        /// Want/got stx_subvol
+        subvol: bool = false,
+        /// Want/got atomic_write_* fields
+        write_atomic: bool = false,
+        /// Want/got dio read alignment info
+        dio_read_align: bool = false,
+        /// Reserved for future struct statx expansion
+        _: u14 = 0,
+
+        /// The stuff in the normal stat struct (bits 0-10)
+        pub const basic_stats: Mask = .{
+            .type = true,
+            .mode = true,
+            .nlink = true,
+            .uid = true,
+            .gid = true,
+            .atime = true,
+            .mtime = true,
+            .ctime = true,
+            .ino = true,
+            .size = true,
+            .blocks = true,
+        };
+    };
+
+    // https://github.com/torvalds/linux/blob/755fa5b4fb36627796af19932a432d343220ec63/include/uapi/linux/stat.h#L248
+    /// matches STATX_ATTR_* in kernel
+    pub const Attr = packed struct(u64) {
+        _0: u2 = 0,
+        /// File is compressed by the fs
+        compressed: bool = false,
+        _1: u1 = 0,
+        /// File is marked immutable
+        immutable: bool = false,
+        /// File is append-only
+        append: bool = false,
+        /// File is not to be dumped
+        nodump: bool = false,
+        _2: u4 = 0,
+        /// File requires key to decrypt in fs
+        encrypted: bool = false,
+        /// Dir: Automount trigger
+        automount: bool = false,
+        /// Root of a mount
+        mount_root: bool = false,
+        _3: u6 = 0,
+        /// Verity protected file
+        verity: bool = false,
+        /// File is currently in DAX state
+        dax: bool = false,
+        /// File supports atomic write operations
+        write_atomic: bool = false,
+        _: u41 = 0,
+    };
+};
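With `Mask` as a typed struct, callers request exactly the fields they need and can check the returned `mask` for what the filesystem actually filled in. A sketch following the call shape this diff uses elsewhere for the `statx` wrapper (`statSizeMtime` and its error handling are hypothetical):

const std = @import("std");
const linux = std.os.linux;

fn statSizeMtime(fd: linux.fd_t) !linux.Statx {
    var stx: linux.Statx = undefined;
    // Request only size and mtime; the kernel may fill more, may fill less.
    const rc = linux.statx(fd, "", .{ .empty_path = true }, .{
        .size = true,
        .mtime = true,
    }, &stx);
    return switch (linux.E.init(rc)) {
        .SUCCESS => stx,
        else => error.Unexpected,
    };
}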
+// Deprecated aliases to Statx.Mask and Statx.Attr
+pub const STATX_TYPE: u32 = @bitCast(Statx.Mask{ .type = true });
+pub const STATX_MODE: u32 = @bitCast(Statx.Mask{ .mode = true });
+pub const STATX_NLINK: u32 = @bitCast(Statx.Mask{ .nlink = true });
+pub const STATX_UID: u32 = @bitCast(Statx.Mask{ .uid = true });
+pub const STATX_GID: u32 = @bitCast(Statx.Mask{ .gid = true });
+pub const STATX_ATIME: u32 = @bitCast(Statx.Mask{ .atime = true });
+pub const STATX_MTIME: u32 = @bitCast(Statx.Mask{ .mtime = true });
+pub const STATX_CTIME: u32 = @bitCast(Statx.Mask{ .ctime = true });
+pub const STATX_INO: u32 = @bitCast(Statx.Mask{ .ino = true });
+pub const STATX_SIZE: u32 = @bitCast(Statx.Mask{ .size = true });
+pub const STATX_BLOCKS: u32 = @bitCast(Statx.Mask{ .blocks = true });
+pub const STATX_BASIC_STATS: u32 = @bitCast(Statx.Mask.basic_stats);
+pub const STATX_BTIME: u32 = @bitCast(Statx.Mask{ .btime = true });
+pub const STATX_MNT_ID: u32 = @bitCast(Statx.Mask{ .mnt_id = true });
+pub const STATX_DIOALIGN: u32 = @bitCast(Statx.Mask{ .dioalign = true });
+pub const STATX_MNT_ID_UNIQUE: u32 = @bitCast(Statx.Mask{ .mnt_id_unique = true });
+pub const STATX_SUBVOL: u32 = @bitCast(Statx.Mask{ .subvol = true });
+pub const STATX_WRITE_ATOMIC: u32 = @bitCast(Statx.Mask{ .write_atomic = true });
+pub const STATX_DIO_READ_ALIGN: u32 = @bitCast(Statx.Mask{ .dio_read_align = true });
+
+pub const STATX_ATTR_COMPRESSED: u64 = @bitCast(Statx.Attr{ .compressed = true });
+pub const STATX_ATTR_IMMUTABLE: u64 = @bitCast(Statx.Attr{ .immutable = true });
+pub const STATX_ATTR_APPEND: u64 = @bitCast(Statx.Attr{ .append = true });
+pub const STATX_ATTR_NODUMP: u64 = @bitCast(Statx.Attr{ .nodump = true });
+pub const STATX_ATTR_ENCRYPTED: u64 = @bitCast(Statx.Attr{ .encrypted = true });
+pub const STATX_ATTR_AUTOMOUNT: u64 = @bitCast(Statx.Attr{ .automount = true });
+pub const STATX_ATTR_MOUNT_ROOT: u64 = @bitCast(Statx.Attr{ .mount_root = true });
+pub const STATX_ATTR_VERITY: u64 = @bitCast(Statx.Attr{ .verity = true });
+pub const STATX_ATTR_DAX: u64 = @bitCast(Statx.Attr{ .dax = true });
+pub const STATX_ATTR_WRITE_ATOMIC: u64 = @bitCast(Statx.Attr{ .write_atomic = true });
+
 pub const addrinfo = extern struct {
     flags: AI,
     family: i32,
@@ -7062,40 +7246,83 @@ pub const AI = packed struct(u32) {
 
 pub const IPPORT_RESERVED = 1024;
 
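The next hunk gives the IP protocol numbers the same enum treatment. A sketch of creating a TCP socket with the typed protocol, assuming the raw `socket` wrapper keeps its integer parameters (`openTcpSocket` is a hypothetical helper):

const std = @import("std");
const linux = std.os.linux;

fn openTcpSocket() usize {
    // The protocol enum converts back to the integer the syscall expects.
    return linux.socket(
        linux.AF.INET,
        linux.SOCK.STREAM | linux.SOCK.NONBLOCK,
        @intFromEnum(linux.IpProto.tcp),
    );
}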
-pub const IPPROTO = struct {
-    pub const IP = 0;
-    pub const HOPOPTS = 0;
-    pub const ICMP = 1;
-    pub const IGMP = 2;
-    pub const IPIP = 4;
-    pub const TCP = 6;
-    pub const EGP = 8;
-    pub const PUP = 12;
-    pub const UDP = 17;
-    pub const IDP = 22;
-    pub const TP = 29;
-    pub const DCCP = 33;
-    pub const IPV6 = 41;
-    pub const ROUTING = 43;
-    pub const FRAGMENT = 44;
-    pub const RSVP = 46;
-    pub const GRE = 47;
-    pub const ESP = 50;
-    pub const AH = 51;
-    pub const ICMPV6 = 58;
-    pub const NONE = 59;
-    pub const DSTOPTS = 60;
-    pub const MTP = 92;
-    pub const BEETPH = 94;
-    pub const ENCAP = 98;
-    pub const PIM = 103;
-    pub const COMP = 108;
-    pub const SCTP = 132;
-    pub const MH = 135;
-    pub const UDPLITE = 136;
-    pub const MPLS = 137;
-    pub const RAW = 255;
-    pub const MAX = 256;
+/// Deprecated alias to IpProto
+pub const IPPROTO = IpProto;
+/// IP Protocol numbers
+pub const IpProto = enum(u16) {
+    ip = 0,
+    icmp = 1,
+    igmp = 2,
+    ipip = 4,
+    tcp = 6,
+    egp = 8,
+    pup = 12,
+    udp = 17,
+    idp = 22,
+    tp = 29,
+    dccp = 33,
+    ipv6 = 41,
+    routing = 43,
+    fragment = 44,
+    rsvp = 46,
+    gre = 47,
+    esp = 50,
+    ah = 51,
+    icmpv6 = 58,
+    none = 59,
+    dstopts = 60,
+    mtp = 92,
+    beetph = 94,
+    encap = 98,
+    pim = 103,
+    comp = 108,
+    sctp = 132,
+    mh = 135,
+    udplite = 136,
+    mpls = 137,
+    raw = 255,
+    max = 256,
+    _,
+
+    // Aliases
+    pub const hopopts = IpProto.ip;
+    pub const default = IpProto.ip;
+
+    // Deprecated constants; use the enum values instead
+    pub const IP: u16 = @intFromEnum(IpProto.ip);
+    pub const HOPOPTS: u16 = @intFromEnum(hopopts);
+    pub const ICMP: u16 = @intFromEnum(IpProto.icmp);
+    pub const IGMP: u16 = @intFromEnum(IpProto.igmp);
+    pub const IPIP: u16 = @intFromEnum(IpProto.ipip);
+    pub const TCP: u16 = @intFromEnum(IpProto.tcp);
+    pub const EGP: u16 = @intFromEnum(IpProto.egp);
+    pub const PUP: u16 = @intFromEnum(IpProto.pup);
+    pub const UDP: u16 = @intFromEnum(IpProto.udp);
+    pub const IDP: u16 = @intFromEnum(IpProto.idp);
+    pub const TP: u16 = @intFromEnum(IpProto.tp);
+    pub const DCCP: u16 = @intFromEnum(IpProto.dccp);
+    pub const IPV6: u16 = @intFromEnum(IpProto.ipv6);
+    pub const ROUTING: u16 = @intFromEnum(IpProto.routing);
+    pub const FRAGMENT: u16 = @intFromEnum(IpProto.fragment);
+    pub const RSVP: u16 = @intFromEnum(IpProto.rsvp);
+    pub const GRE: u16 = @intFromEnum(IpProto.gre);
+    pub const ESP: u16 = @intFromEnum(IpProto.esp);
+    pub const AH: u16 = @intFromEnum(IpProto.ah);
+    pub const ICMPV6: u16 = @intFromEnum(IpProto.icmpv6);
+    pub const NONE: u16 = @intFromEnum(IpProto.none);
+    pub const DSTOPTS: u16 = @intFromEnum(IpProto.dstopts);
+    pub const MTP: u16 = @intFromEnum(IpProto.mtp);
+    pub const BEETPH: u16 = @intFromEnum(IpProto.beetph);
+    pub const ENCAP: u16 = @intFromEnum(IpProto.encap);
+    pub const PIM: u16 = @intFromEnum(IpProto.pim);
+    pub const COMP: u16 = @intFromEnum(IpProto.comp);
+    pub const SCTP: u16 = @intFromEnum(IpProto.sctp);
+    pub const MH: u16 = @intFromEnum(IpProto.mh);
+    pub const UDPLITE: u16 = @intFromEnum(IpProto.udplite);
+    pub const MPLS: u16 = @intFromEnum(IpProto.mpls);
+    pub const RAW: u16 = @intFromEnum(IpProto.raw);
+    pub const MAX: u16 = @intFromEnum(IpProto.max);
 };
 
 pub const tcp_repair_opt = extern struct {
@@ -8345,53 +8572,141 @@ pub const rlimit = extern struct {
     max: rlim_t,
 };
 
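Below, MADV and POSIX_FADV become non-exhaustive enums as well. A sketch of advising the kernel about a mapped region, assuming the `madvise` wrapper still takes the advice as an integer (`adviseDontNeed` is a hypothetical helper):

const std = @import("std");
const linux = std.os.linux;

fn adviseDontNeed(region: []align(std.heap.page_size_min) u8) usize {
    // Tell the kernel these pages won't be needed again soon.
    return linux.madvise(region.ptr, region.len, @intFromEnum(linux.Madvise.dontneed));
}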
-pub const MADV = struct {
-    pub const NORMAL = 0;
-    pub const RANDOM = 1;
-    pub const SEQUENTIAL = 2;
-    pub const WILLNEED = 3;
-    pub const DONTNEED = 4;
-    pub const FREE = 8;
-    pub const REMOVE = 9;
-    pub const DONTFORK = 10;
-    pub const DOFORK = 11;
-    pub const MERGEABLE = 12;
-    pub const UNMERGEABLE = 13;
-    pub const HUGEPAGE = 14;
-    pub const NOHUGEPAGE = 15;
-    pub const DONTDUMP = 16;
-    pub const DODUMP = 17;
-    pub const WIPEONFORK = 18;
-    pub const KEEPONFORK = 19;
-    pub const COLD = 20;
-    pub const PAGEOUT = 21;
-    pub const HWPOISON = 100;
-    pub const SOFT_OFFLINE = 101;
-};
-
-pub const POSIX_FADV = switch (native_arch) {
-    .s390x => if (@typeInfo(usize).int.bits == 64) struct {
-        pub const NORMAL = 0;
-        pub const RANDOM = 1;
-        pub const SEQUENTIAL = 2;
-        pub const WILLNEED = 3;
-        pub const DONTNEED = 6;
-        pub const NOREUSE = 7;
-    } else struct {
-        pub const NORMAL = 0;
-        pub const RANDOM = 1;
-        pub const SEQUENTIAL = 2;
-        pub const WILLNEED = 3;
-        pub const DONTNEED = 4;
-        pub const NOREUSE = 5;
+/// Deprecated alias for Madvise
+pub const MADV = Madvise;
+
+/// advice flags for `madvise`
+/// matches MADV_* in kernel
+pub const Madvise = enum(u32) {
+    /// no further special treatment
+    normal = 0,
+    /// expect random page references
+    random = 1,
+    /// expect sequential page references
+    sequential = 2,
+    /// will need these pages
+    willneed = 3,
+    /// don't need these pages
+    dontneed = 4,
+    /// free pages only if memory pressure
+    free = 8,
+    /// remove these pages & resources
+    remove = 9,
+    /// don't inherit across fork
+    dontfork = 10,
+    /// do inherit across fork
+    dofork = 11,
+    /// KSM may merge identical pages
+    mergeable = 12,
+    /// KSM may not merge identical pages
+    unmergeable = 13,
+    /// Worth backing with hugepages
+    hugepage = 14,
+    /// Not worth backing with hugepages
+    nohugepage = 15,
+    /// Explicitly exclude from the core dump, overrides the coredump filter bits
+    dontdump = 16,
+    /// Clear the MADV_DONTDUMP flag
+    dodump = 17,
+    /// Zero memory on fork, child only
+    wipeonfork = 18,
+    /// Undo MADV_WIPEONFORK
+    keeponfork = 19,
+    /// deactivate these pages
+    cold = 20,
+    /// reclaim these pages
+    pageout = 21,
+    /// populate (prefault) page tables readable
+    populate_read = 22,
+    /// populate (prefault) page tables writable
+    populate_write = 23,
+    /// like DONTNEED, but drop locked pages too
+    dontneed_locked = 24,
+    /// Synchronous hugepage collapse
+    collapse = 25,
+    /// poison a page for testing
+    hwpoison = 100,
+    /// soft offline page for testing
+    soft_offline = 101,
+    /// fatal signal on access to range
+    guard_install = 102,
+    /// unguard range
+    guard_remove = 103,
+    _,
+
+    // Deprecated aliases for `Madvise`
+    pub const NORMAL: u32 = @intFromEnum(Madvise.normal);
+    pub const RANDOM: u32 = @intFromEnum(Madvise.random);
+    pub const SEQUENTIAL: u32 = @intFromEnum(Madvise.sequential);
+    pub const WILLNEED: u32 = @intFromEnum(Madvise.willneed);
+    pub const DONTNEED: u32 = @intFromEnum(Madvise.dontneed);
+    pub const FREE: u32 = @intFromEnum(Madvise.free);
+    pub const REMOVE: u32 = @intFromEnum(Madvise.remove);
+    pub const DONTFORK: u32 = @intFromEnum(Madvise.dontfork);
+    pub const DOFORK: u32 = @intFromEnum(Madvise.dofork);
+    pub const MERGEABLE: u32 = @intFromEnum(Madvise.mergeable);
+    pub const UNMERGEABLE: u32 = @intFromEnum(Madvise.unmergeable);
+    pub const HUGEPAGE: u32 = @intFromEnum(Madvise.hugepage);
+    pub const NOHUGEPAGE: u32 = @intFromEnum(Madvise.nohugepage);
+    pub const DONTDUMP: u32 = @intFromEnum(Madvise.dontdump);
+    pub const DODUMP: u32 = @intFromEnum(Madvise.dodump);
+    pub const WIPEONFORK: u32 = @intFromEnum(Madvise.wipeonfork);
+    pub const KEEPONFORK: u32 = @intFromEnum(Madvise.keeponfork);
+    pub const COLD: u32 = @intFromEnum(Madvise.cold);
+    pub const PAGEOUT: u32 = @intFromEnum(Madvise.pageout);
+    pub const HWPOISON: u32 = @intFromEnum(Madvise.hwpoison);
+    pub const SOFT_OFFLINE: u32 = @intFromEnum(Madvise.soft_offline);
+};
+
+/// Deprecated alias to Fadvise
+pub const POSIX_FADV = Fadvise;
+
+/// advice flags for `posix_fadvise`
+/// matches POSIX_FADV_* in kernel
+pub const Fadvise = switch (native_arch) {
+    .s390x => if (@typeInfo(usize).int.bits == 64) enum(u32) {
+        /// No further special treatment
+        normal = 0,
+        /// Expect random page references
+        random = 1,
+        /// Expect sequential page references
+        sequential = 2,
+        /// Will need these pages
+        willneed = 3,
+        /// Don't need these pages
+        dontneed = 6,
+        /// Data will be accessed once
+        noreuse = 7,
+        _,
+
+        pub const NORMAL: u32 = @intFromEnum(Fadvise.normal);
+        pub const RANDOM: u32 = @intFromEnum(Fadvise.random);
+        pub const SEQUENTIAL: u32 = @intFromEnum(Fadvise.sequential);
+        pub const WILLNEED: u32 = @intFromEnum(Fadvise.willneed);
+        pub const DONTNEED: u32 = @intFromEnum(Fadvise.dontneed);
+        pub const NOREUSE: u32 = @intFromEnum(Fadvise.noreuse);
     },
-    else => struct {
-        pub const NORMAL = 0;
-        pub const RANDOM = 1;
-        pub const SEQUENTIAL = 2;
-        pub const WILLNEED = 3;
-        pub const DONTNEED = 4;
-        pub const NOREUSE = 5;
+    else => enum(u32) {
+        /// No further special treatment
+        normal = 0,
+        /// Expect random page references
+        random = 1,
+        /// Expect sequential page references
+        sequential = 2,
+        /// Will need these pages
+        willneed = 3,
+        /// Don't need these pages
+        
dontneed = 4, + /// Data will be accessed once + noreuse = 5, + _, + + pub const NORMAL: u32 = @intFromEnum(Fadvise.normal); + pub const RANDOM: u32 = @intFromEnum(Fadvise.random); + pub const SEQUENTIAL: u32 = @intFromEnum(Fadvise.sequential); + pub const WILLNEED: u32 = @intFromEnum(Fadvise.willneed); + pub const DONTNEED: u32 = @intFromEnum(Fadvise.dontneed); + pub const NOREUSE: u32 = @intFromEnum(Fadvise.noreuse); }, }; @@ -9793,19 +10108,6 @@ pub const PTRACE = struct { }; }; -/// For futex2_waitv and futex2_requeue. Arrays of `futex2_waitone` allow -/// waiting on multiple futexes in one call. -pub const futex2_waitone = extern struct { - /// Expected value at uaddr, should match size of futex. - val: u64, - /// User address to wait on. Top-bits must be 0 on 32-bit. - uaddr: u64, - /// Flags for this waiter. - flags: FUTEX2_FLAGS, - /// Reserved member to preserve alignment. - __reserved: u32 = 0, -}; - pub const cache_stat_range = extern struct { off: u64, len: u64, diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig index 54b4a5bd386d..5658ff57dc32 100644 --- a/lib/std/os/linux/IoUring.zig +++ b/lib/std/os/linux/IoUring.zig @@ -3,6 +3,7 @@ const std = @import("std"); const builtin = @import("builtin"); const assert = std.debug.assert; const mem = std.mem; +const math = std.math; const net = std.Io.net; const posix = std.posix; const linux = std.os.linux; @@ -11,71 +12,86 @@ const is_linux = builtin.os.tag == .linux; const page_size_min = std.heap.page_size_min; fd: linux.fd_t = -1, -sq: SubmissionQueue, -cq: CompletionQueue, -flags: u32, -features: u32, +sq: Sq, +cq: Cq, +flags: uflags.Setup, +features: uflags.Features, +/// matches int_flags in liburing +init_flags: uflags.Init, /// A friendly way to setup an io_uring, with default linux.io_uring_params. -/// `entries` must be a power of two between 1 and 32768, although the kernel will make the final -/// call on how many entries the submission and completion queues will ultimately have, +/// `entries` must be a power of two between 1 and 32768, although the kernel +/// will make the final call on how many entries the submission and completion +/// queues will ultimately have, /// see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L8027-L8050. -/// Matches the interface of io_uring_queue_init() in liburing. -pub fn init(entries: u16, flags: u32) !IoUring { - var params = mem.zeroInit(linux.io_uring_params, .{ +/// Matches the interface of `io_uring_queue_init()` in liburing. +pub fn init(entries: u16, flags: uflags.Setup) !IoUring { + var params = mem.zeroInit(Params, .{ .flags = flags, .sq_thread_idle = 1000, }); - return try IoUring.init_params(entries, ¶ms); + return try .init_params(entries, ¶ms); } -/// A powerful way to setup an io_uring, if you want to tweak linux.io_uring_params such as submission -/// queue thread cpu affinity or thread idle timeout (the kernel and our default is 1 second). -/// `params` is passed by reference because the kernel needs to modify the parameters. -/// Matches the interface of io_uring_queue_init_params() in liburing. -pub fn init_params(entries: u16, p: *linux.io_uring_params) !IoUring { +/// A powerful way to setup an io_uring, if you want to tweak +/// linux.io_uring_params such as submission queue thread cpu affinity or +/// thread idle timeout (the kernel and our default is 1 second). +/// `params` is passed by reference because the kernel needs to modify the +/// parameters. +/// Matches the interface of `io_uring_queue_init_params()` in liburing. 
+pub fn init_params(entries: u16, p: *Params) !IoUring { if (entries == 0) return error.EntriesZero; - if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo; - + if (!math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo; assert(p.sq_entries == 0); - assert(p.cq_entries == 0 or p.flags & linux.IORING_SETUP_CQSIZE != 0); - assert(p.features == 0); - assert(p.wq_fd == 0 or p.flags & linux.IORING_SETUP_ATTACH_WQ != 0); + assert(p.features.empty()); assert(p.resv[0] == 0); assert(p.resv[1] == 0); assert(p.resv[2] == 0); + assert(p.cq_entries == 0 or p.flags.cqsize); + assert(p.wq_fd == 0 or p.flags.attach_wq); + + // flags compatibility + if (p.flags.sqpoll) assert(!(p.flags.coop_taskrun or p.flags.taskrun_flag or p.flags.defer_taskrun)); + if (p.flags.sq_aff) assert(p.flags.sqpoll); + if (p.flags.defer_taskrun) assert(p.flags.single_issuer); + const res = linux.io_uring_setup(entries, p); switch (linux.E.init(res)) { .SUCCESS => {}, .FAULT => return error.ParamsOutsideAccessibleAddressSpace, - // The resv array contains non-zero data, p.flags contains an unsupported flag, - // entries out of bounds, IORING_SETUP_SQ_AFF was specified without IORING_SETUP_SQPOLL, - // or IORING_SETUP_CQSIZE was specified but linux.io_uring_params.cq_entries was invalid: + // The resv array contains non-zero data, p.flags contains an + // unsupported flag, entries out of bounds, IORING_SETUP_SQ_AFF was + // specified without IORING_SETUP_SQPOLL, or IORING_SETUP_CQSIZE was + // specified but linux.io_uring_params.cq_entries was invalid: .INVAL => return error.ArgumentsInvalid, .MFILE => return error.ProcessFdQuotaExceeded, .NFILE => return error.SystemFdQuotaExceeded, .NOMEM => return error.SystemResources, - // IORING_SETUP_SQPOLL was specified but effective user ID lacks sufficient privileges, - // or a container seccomp policy prohibits io_uring syscalls: + // IORING_SETUP_SQPOLL was specified but effective user ID lacks + // sufficient privileges, or a container seccomp policy prohibits + // io_uring syscalls: .PERM => return error.PermissionDenied, .NOSYS => return error.SystemOutdated, else => |errno| return posix.unexpectedErrno(errno), } - const fd = @as(linux.fd_t, @intCast(res)); + const fd: linux.fd_t = @intCast(res); assert(fd >= 0); errdefer posix.close(fd); - // Kernel versions 5.4 and up use only one mmap() for the submission and completion queues. - // This is not an optional feature for us... if the kernel does it, we have to do it. - // The thinking on this by the kernel developers was that both the submission and the - // completion queue rings have sizes just over a power of two, but the submission queue ring - // is significantly smaller with u32 slots. By bundling both in a single mmap, the kernel - // gets the submission queue ring for free. + // Kernel versions 5.4 and up use only one mmap() for the submission and + // completion queues. + // This is not an optional feature for us... if the kernel does it, we have + // to do it. The thinking on this by the kernel developers was that both + // the submission and the completion queue rings have sizes just over a + // power of two, but the submission queue ring is significantly smaller + // with u32 slots. By bundling both in a single mmap, the kernel gets the + // submission queue ring for free. // See https://patchwork.kernel.org/patch/11115257 for the kernel patch. 
-    // We do not support the double mmap() done before 5.4, because we want to keep the
-    // init/deinit mmap paths simple and because io_uring has had many bug fixes even since 5.4.
-    if ((p.features & linux.IORING_FEAT_SINGLE_MMAP) == 0) {
+    // We do not support the double mmap() done before 5.4, because we want to
+    // keep the init/deinit mmap paths simple and because io_uring has had many
+    // bug fixes even since 5.4.
+    if (!p.features.single_mmap) {
         return error.SystemOutdated;
     }
@@ -84,18 +100,21 @@ pub fn init_params(entries: u16, p: *linux.io_uring_params) !IoUring {
     assert(p.cq_entries != 0);
     assert(p.cq_entries >= p.sq_entries);
 
-    // From here on, we only need to read from params, so pass `p` by value as immutable.
-    // The completion queue shares the mmap with the submission queue, so pass `sq` there too.
-    var sq = try SubmissionQueue.init(fd, p.*);
+    // From here on, we only need to read from params, so pass `p` by value as
+    // immutable.
+    // The completion queue shares the mmap with the submission queue, so pass
+    // `sq` there too.
+    var sq: Sq = try .init(fd, p.*);
     errdefer sq.deinit();
-    var cq = try CompletionQueue.init(fd, p.*, sq);
+    var cq: Cq = try .init(fd, p.*, sq);
     errdefer cq.deinit();
 
     // Check that our starting state is as we expect.
     assert(sq.head.* == 0);
     assert(sq.tail.* == 0);
     assert(sq.mask == p.sq_entries - 1);
-    // Allow flags.* to be non-zero, since the kernel may set IORING_SQ_NEED_WAKEUP at any time.
+    // Allow flags.* to be non-zero, since the kernel may set
+    // IORING_SQ_NEED_WAKEUP at any time.
     assert(sq.dropped.* == 0);
     assert(sq.array.len == p.sq_entries);
     assert(sq.sqes.len == p.sq_entries);
@@ -108,12 +127,13 @@ pub fn init_params(entries: u16, p: *linux.io_uring_params) !IoUring {
     assert(cq.overflow.* == 0);
     assert(cq.cqes.len == p.cq_entries);
 
-    return IoUring{
+    return .{
         .fd = fd,
         .sq = sq,
         .cq = cq,
         .flags = p.flags,
         .features = p.features,
+        .init_flags = .{},
     };
 }
 
@@ -126,17 +146,20 @@ pub fn deinit(self: *IoUring) void {
     self.fd = -1;
 }
 
-/// Returns a pointer to a vacant SQE, or an error if the submission queue is full.
-/// We follow the implementation (and atomics) of liburing's `io_uring_get_sqe()` exactly.
+/// Returns a pointer to a vacant SQE, or an error if the submission queue is
+/// full. We follow the implementation (and atomics) of liburing's
+/// `io_uring_get_sqe()` exactly.
 /// However, instead of a null we return an error to force safe handling.
-/// Any situation where the submission queue is full tends more towards a control flow error,
-/// and the null return in liburing is more a C idiom than anything else, for lack of a better
-/// alternative. In Zig, we have first-class error handling... so let's use it.
-/// Matches the implementation of io_uring_get_sqe() in liburing.
-pub fn get_sqe(self: *IoUring) !*linux.io_uring_sqe {
+/// Any situation where the submission queue is full tends more towards a
+/// control flow error, and the null return in liburing is more a C idiom than
+/// anything else, for lack of a better alternative. In Zig, we have
+/// first-class error handling... so let's use it.
+/// Matches the implementation of `io_uring_get_sqe()` in liburing.
+pub fn get_sqe(self: *IoUring) !*Sqe {
     const head = @atomicLoad(u32, self.sq.head, .acquire);
-    // Remember that these head and tail offsets wrap around every four billion operations.
-    // We must therefore use wrapping addition and subtraction to avoid a runtime crash.
+    // Remember that these head and tail offsets wrap around every four billion
+    // operations. We must therefore use wrapping addition and subtraction to
+    // avoid a runtime crash.
     const next = self.sq.sqe_tail +% 1;
     if (next -% head > self.sq.sqes.len) return error.SubmissionQueueFull;
     const sqe = &self.sq.sqes[self.sq.sqe_tail & self.sq.mask];
@@ -144,26 +167,28 @@ pub fn get_sqe(self: *IoUring) !*linux.io_uring_sqe {
     return sqe;
 }
 
-/// Submits the SQEs acquired via get_sqe() to the kernel. You can call this once after you have
-/// called get_sqe() multiple times to setup multiple I/O requests.
-/// Returns the number of SQEs submitted, if not used alongside IORING_SETUP_SQPOLL.
-/// If the io_uring instance is uses IORING_SETUP_SQPOLL, the value returned on success is not
-/// guaranteed to match the amount of actually submitted sqes during this call. A value higher
-/// or lower, including 0, may be returned.
-/// Matches the implementation of io_uring_submit() in liburing.
+/// Submits the SQEs acquired via `get_sqe()` to the kernel. You can call this
+/// once after you have called `get_sqe()` multiple times to set up multiple I/O
+/// requests.
+/// Returns the number of SQEs submitted, if not used alongside
+/// IORING_SETUP_SQPOLL.
+/// If the io_uring instance uses IORING_SETUP_SQPOLL, the value returned on
+/// success is not guaranteed to match the number of actually submitted SQEs
+/// during this call. A value higher or lower, including 0, may be returned.
+/// Matches the implementation of `io_uring_submit()` in liburing.
 pub fn submit(self: *IoUring) !u32 {
     return self.submit_and_wait(0);
 }
 
-/// Like submit(), but allows waiting for events as well.
+/// Like `submit()`, but allows waiting for events as well.
 /// Returns the number of SQEs submitted.
-/// Matches the implementation of io_uring_submit_and_wait() in liburing.
+/// Matches the implementation of `io_uring_submit_and_wait()` in liburing.
 pub fn submit_and_wait(self: *IoUring, wait_nr: u32) !u32 {
     const submitted = self.flush_sq();
-    var flags: u32 = 0;
+    var flags: uflags.Enter = self.enter_flags();
     if (self.sq_ring_needs_enter(&flags) or wait_nr > 0) {
-        if (wait_nr > 0 or (self.flags & linux.IORING_SETUP_IOPOLL) != 0) {
-            flags |= linux.IORING_ENTER_GETEVENTS;
+        if (wait_nr > 0 or self.flags.iopoll) {
+            flags.getevents = true;
         }
         return try self.enter(submitted, wait_nr, flags);
     }
@@ -172,45 +197,53 @@
 
 /// Tell the kernel we have submitted SQEs and/or want to wait for CQEs.
 /// Returns the number of SQEs submitted.
-pub fn enter(self: *IoUring, to_submit: u32, min_complete: u32, flags: u32) !u32 {
+pub fn enter(self: *IoUring, to_submit: u32, min_complete: u32, flags: uflags.Enter) !u32 {
     assert(self.fd >= 0);
     const res = linux.io_uring_enter(self.fd, to_submit, min_complete, flags, null);
     switch (linux.E.init(res)) {
         .SUCCESS => {},
-        // The kernel was unable to allocate memory or ran out of resources for the request.
-        // The application should wait for some completions and try again:
+        // The kernel was unable to allocate memory or ran out of resources for
+        // the request. The application should wait for some completions and
+        // try again:
         .AGAIN => return error.SystemResources,
-        // The SQE `fd` is invalid, or IOSQE_FIXED_FILE was set but no files were registered:
+        // The SQE `fd` is invalid, or IOSQE_FIXED_FILE was set but no files
+        // were registered:
         .BADF => return error.FileDescriptorInvalid,
         // The file descriptor is valid, but the ring is not in the right state.
         // See io_uring_register(2) for how to enable the ring.
         .BADFD => return error.FileDescriptorInBadState,
-        // The application attempted to overcommit the number of requests it can have pending.
-        // The application should wait for some completions and try again:
+        // The application attempted to overcommit the number of requests it
+        // can have pending. The application should wait for some completions
        // and try again:
         .BUSY => return error.CompletionQueueOvercommitted,
-        // The SQE is invalid, or valid but the ring was setup with IORING_SETUP_IOPOLL:
+        // The SQE is invalid, or valid but the ring was set up with
+        // IORING_SETUP_IOPOLL:
         .INVAL => return error.SubmissionQueueEntryInvalid,
-        // The buffer is outside the process' accessible address space, or IORING_OP_READ_FIXED
-        // or IORING_OP_WRITE_FIXED was specified but no buffers were registered, or the range
-        // described by `addr` and `len` is not within the buffer registered at `buf_index`:
+        // The buffer is outside the process' accessible address space, or
+        // IORING_OP_READ_FIXED or IORING_OP_WRITE_FIXED was specified but no
+        // buffers were registered, or the range described by `addr` and `len`
+        // is not within the buffer registered at `buf_index`:
         .FAULT => return error.BufferInvalid,
         .NXIO => return error.RingShuttingDown,
-        // The kernel believes our `self.fd` does not refer to an io_uring instance,
-        // or the opcode is valid but not supported by this kernel (more likely):
+        // The kernel believes our `self.fd` does not refer to an io_uring
+        // instance, or the opcode is valid but not supported by this kernel
+        // (more likely):
        .OPNOTSUPP => return error.OpcodeNotSupported,
-        // The operation was interrupted by a delivery of a signal before it could complete.
-        // This can happen while waiting for events with IORING_ENTER_GETEVENTS:
+        // The operation was interrupted by a delivery of a signal before it
+        // could complete. This can happen while waiting for events with
+        // IORING_ENTER_GETEVENTS:
        .INTR => return error.SignalInterrupt,
        else => |errno| return posix.unexpectedErrno(errno),
    }
-    return @as(u32, @intCast(res));
+    return @intCast(res);
 }
 
 /// Sync internal state with kernel ring state on the SQ side.
-/// Returns the number of all pending events in the SQ ring, for the shared ring.
-/// This return value includes previously flushed SQEs, as per liburing.
-/// The rationale is to suggest that an io_uring_enter() call is needed rather than not.
-/// Matches the implementation of __io_uring_flush_sq() in liburing.
+/// Returns the number of all pending events in the SQ ring, for the shared
+/// ring. This return value includes previously flushed SQEs, as per liburing.
+/// The rationale is to suggest that an `io_uring_enter()` call is needed rather
+/// than not.
+/// Matches the implementation of `__io_uring_flush_sq()` in liburing.
 pub fn flush_sq(self: *IoUring) u32 {
     if (self.sq.sqe_head != self.sq.sqe_tail) {
         // Fill in SQEs that we have queued up, adding them to the kernel ring.
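// Editorial sketch, not part of the patch: the basic queue/submit flow using
// `get_sqe`, `submit_and_wait`, and `copy_cqe` as defined above, assuming an
// initialized `ring: IoUring`.
//
//     const sqe = try ring.get_sqe();
//     sqe.prep_nop();
//     sqe.user_data = 42;
//     _ = try ring.submit_and_wait(1); // flush the SQ ring, wait for one CQE
//     const cqe = try ring.copy_cqe();
//     assert(cqe.user_data == 42);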
@@ -222,64 +255,75 @@ pub fn flush_sq(self: *IoUring) u32 {
             tail +%= 1;
             self.sq.sqe_head +%= 1;
         }
-        // Ensure that the kernel can actually see the SQE updates when it sees the tail update.
+        // Ensure that the kernel can actually see the SQE updates when it sees
+        // the tail update.
         @atomicStore(u32, self.sq.tail, tail, .release);
     }
     return self.sq_ready();
 }
 
 /// Returns true if we are not using an SQ thread (thus nobody submits but us),
-/// or if IORING_SQ_NEED_WAKEUP is set and the SQ thread must be explicitly awakened.
-/// For the latter case, we set the SQ thread wakeup flag.
-/// Matches the implementation of sq_ring_needs_enter() in liburing.
-pub fn sq_ring_needs_enter(self: *IoUring, flags: *u32) bool {
-    assert(flags.* == 0);
-    if ((self.flags & linux.IORING_SETUP_SQPOLL) == 0) return true;
-    if ((@atomicLoad(u32, self.sq.flags, .unordered) & linux.IORING_SQ_NEED_WAKEUP) != 0) {
-        flags.* |= linux.IORING_ENTER_SQ_WAKEUP;
+/// or if IORING_SQ_NEED_WAKEUP is set and the SQ thread must be explicitly
+/// awakened. For the latter case, we set the SQ thread wakeup flag.
+/// Matches the implementation of `sq_ring_needs_enter()` in liburing.
+pub fn sq_ring_needs_enter(self: *IoUring, flags: *uflags.Enter) bool {
+    assert(flags.*.valid_init_flags());
+    if (!self.flags.sqpoll) return true;
+    if (@atomicLoad(Sq.Flags, self.sq.flags, .unordered).need_wakeup) {
+        flags.*.sq_wakeup = true;
         return true;
     }
     return false;
 }
 
-/// Returns the number of flushed and unflushed SQEs pending in the submission queue.
-/// In other words, this is the number of SQEs in the submission queue, i.e. its length.
-/// These are SQEs that the kernel is yet to consume.
-/// Matches the implementation of io_uring_sq_ready in liburing.
+/// Returns the number of flushed and unflushed SQEs pending in the submission
+/// queue. In other words, this is the number of SQEs in the submission queue,
+/// i.e. its length. These are SQEs that the kernel is yet to consume.
+/// Matches the implementation of `io_uring_sq_ready()` in liburing.
 pub fn sq_ready(self: *IoUring) u32 {
-    // Always use the shared ring state (i.e. head and not sqe_head) to avoid going out of sync,
-    // see https://github.com/axboe/liburing/issues/92.
+    // Always use the shared ring state (i.e. head and not sqe_head) to avoid
+    // going out of sync, see https://github.com/axboe/liburing/issues/92.
     return self.sq.sqe_tail -% @atomicLoad(u32, self.sq.head, .acquire);
 }
 
 /// Returns the number of CQEs in the completion queue, i.e. its length.
 /// These are CQEs that the application is yet to consume.
-/// Matches the implementation of io_uring_cq_ready in liburing.
+/// Matches the implementation of `io_uring_cq_ready()` in liburing.
 pub fn cq_ready(self: *IoUring) u32 {
     return @atomicLoad(u32, self.cq.tail, .acquire) -% self.cq.head.*;
 }
 
-/// Copies as many CQEs as are ready, and that can fit into the destination `cqes` slice.
-/// If none are available, enters into the kernel to wait for at most `wait_nr` CQEs.
+/// Copies as many CQEs as are ready, and that can fit into the destination
+/// `cqes` slice. If none are available, enters into the kernel to wait for at
+/// least `wait_nr` CQEs.
 /// Returns the number of CQEs copied, advancing the CQ ring.
-/// Provides all the wait/peek methods found in liburing, but with batching and a single method.
-/// The rationale for copying CQEs rather than copying pointers is that pointers are 8 bytes
-/// whereas CQEs are not much more at only 16 bytes, and this provides a safer faster interface.
-/// Safer, because you no longer need to call cqe_seen(), avoiding idempotency bugs.
-/// Faster, because we can now amortize the atomic store release to `cq.head` across the batch.
+/// Provides all the wait/peek methods found in liburing, but with batching and
+/// a single method.
+/// The rationale for copying CQEs rather than copying pointers is that
+/// pointers are 8 bytes whereas CQEs are not much more at only 16 bytes, and
+/// this provides a safer, faster interface.
+/// Safer, because you no longer need to call `cqe_seen()`, avoiding idempotency
+/// bugs. Faster, because we can now amortize the atomic store release to
+/// `cq.head` across the batch.
 /// See https://github.com/axboe/liburing/issues/103#issuecomment-686665007.
-/// Matches the implementation of io_uring_peek_batch_cqe() in liburing, but supports waiting.
-pub fn copy_cqes(self: *IoUring, cqes: []linux.io_uring_cqe, wait_nr: u32) !u32 {
+/// Matches the implementation of `io_uring_peek_batch_cqe()` in liburing, but
+/// supports waiting.
+pub fn copy_cqes(self: *IoUring, cqes: []Cqe, wait_nr: u32) !u32 {
     const count = self.copy_cqes_ready(cqes);
     if (count > 0) return count;
     if (self.cq_ring_needs_flush() or wait_nr > 0) {
-        _ = try self.enter(0, wait_nr, linux.IORING_ENTER_GETEVENTS);
+        const flags = blk: {
+            var flags = self.enter_flags();
+            flags.getevents = true;
+            break :blk flags;
+        };
+        _ = try self.enter(0, wait_nr, flags);
         return self.copy_cqes_ready(cqes);
     }
     return 0;
 }
 
-fn copy_cqes_ready(self: *IoUring, cqes: []linux.io_uring_cqe) u32 {
+fn copy_cqes_ready(self: *IoUring, cqes: []Cqe) u32 {
     const ready = self.cq_ready();
     const count = @min(cqes.len, ready);
     const head = self.cq.head.* & self.cq.mask;
@@ -298,89 +342,125 @@ fn copy_cqes_ready(self: *IoUring, cqes: []linux.io_uring_cqe) u32 {
     return count;
 }
 
-/// Returns a copy of an I/O completion, waiting for it if necessary, and advancing the CQ ring.
-/// A convenience method for `copy_cqes()` for when you don't need to batch or peek.
-pub fn copy_cqe(ring: *IoUring) !linux.io_uring_cqe {
-    var cqes: [1]linux.io_uring_cqe = undefined;
+/// Returns a copy of an I/O completion, waiting for it if necessary, and
+/// advancing the CQ ring.
+/// A convenience method for `copy_cqes()` for when you don't need to batch or
+/// peek.
+pub fn copy_cqe(ring: *IoUring) !Cqe {
+    var cqes: [1]Cqe = undefined;
     while (true) {
         const count = try ring.copy_cqes(&cqes, 1);
         if (count > 0) return cqes[0];
     }
 }
 
-/// Matches the implementation of cq_ring_needs_flush() in liburing.
+/// Matches the implementation of `cq_ring_needs_flush()` in liburing.
 pub fn cq_ring_needs_flush(self: *IoUring) bool {
-    return (@atomicLoad(u32, self.sq.flags, .unordered) & linux.IORING_SQ_CQ_OVERFLOW) != 0;
+    const sq_flags = @atomicLoad(Sq.Flags, self.sq.flags, .unordered);
+    if (sq_flags.cq_overflow or sq_flags.taskrun) return true;
+    return false;
 }
 
 /// For advanced use cases only that implement custom completion queue methods.
-/// If you use copy_cqes() or copy_cqe() you must not call cqe_seen() or cq_advance().
-/// Must be called exactly once after a zero-copy CQE has been processed by your application.
+/// If you use `copy_cqes()` or `copy_cqe()` you must not call `cqe_seen()` or
+/// `cq_advance()`. Must be called exactly once after a zero-copy CQE has been
+/// processed by your application.
 /// Not idempotent, calling more than once will result in other CQEs being lost.
-/// Matches the implementation of cqe_seen() in liburing.
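// Editorial sketch, not part of the patch: draining completions in batches
// with `copy_cqes`, which amortizes the release store to `cq.head` as the doc
// comment above explains. `handle` is a hypothetical callback.
//
//     var cqes: [16]Cqe = undefined;
//     const n = try ring.copy_cqes(&cqes, 1); // waits for at least one CQE
//     for (cqes[0..n]) |cqe| handle(cqe);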
-pub fn cqe_seen(self: *IoUring, cqe: *linux.io_uring_cqe) void {
+/// Matches the implementation of `cqe_seen()` in liburing.
+pub fn cqe_seen(self: *IoUring, cqe: *Cqe) void {
     _ = cqe;
     self.cq_advance(1);
 }
 
 /// For advanced use cases only that implement custom completion queue methods.
-/// Matches the implementation of cq_advance() in liburing.
+/// Matches the implementation of `cq_advance()` in liburing.
 pub fn cq_advance(self: *IoUring, count: u32) void {
     if (count > 0) {
-        // Ensure the kernel only sees the new head value after the CQEs have been read.
+        // Ensure the kernel only sees the new head value after the CQEs have
+        // been read.
         @atomicStore(u32, self.cq.head, self.cq.head.* +% count, .release);
     }
 }
 
-/// Queues (but does not submit) an SQE to perform an `fsync(2)`.
-/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
-/// For example, for `fdatasync()` you can set `IORING_FSYNC_DATASYNC` in the SQE's `rw_flags`.
-/// N.B. While SQEs are initiated in the order in which they appear in the submission queue,
-/// operations execute in parallel and completions are unordered. Therefore, an application that
-/// submits a write followed by an fsync in the submission queue cannot expect the fsync to
-/// apply to the write, since the fsync may complete before the write is issued to the disk.
-/// You should preferably use `link_with_next_sqe()` on a write's SQE to link it with an fsync,
-/// or else insert a full write barrier using `drain_previous_sqes()` when queueing an fsync.
-pub fn fsync(self: *IoUring, user_data: u64, fd: linux.fd_t, flags: u32) !*linux.io_uring_sqe {
+/// Enable/disable setting of iowait by the kernel.
+/// Matches the implementation of `io_uring_set_iowait()` in liburing.
+pub fn set_iowait(self: *IoUring, enable_iowait: bool) !void {
+    if (!self.features.no_iowait) {
+        return error.SystemOutdated;
+    }
+    self.init_flags.no_iowait = !enable_iowait;
+}
+
+/// Matches the implementation of `ring_enter_flags()` in liburing.
+pub fn enter_flags(self: *IoUring) uflags.Enter {
+    return self.init_flags.enter_flags();
+}
+
+/// Queues (but does not submit) an SQE to perform a `splice(2)`.
+/// Either `fd_in` or `fd_out` must be a pipe.
+/// If `fd_in` refers to a pipe, `off_in` is ignored and must be set to
+/// math.maxInt(u64).
+/// If `fd_in` does not refer to a pipe and `off_in` is maxInt(u64), then `len`
+/// bytes are read from `fd_in` starting from the file offset, which is
+/// incremented by the number of bytes read.
+/// If `fd_in` does not refer to a pipe and `off_in` is not maxInt(u64), then
+/// the starting offset of `fd_in` will be `off_in`.
+///
+/// This splice operation can be used to implement sendfile by splicing to an
+/// intermediate pipe first, then splice to the final destination. In fact, the
+/// implementation of sendfile in kernel uses splice internally.
+///
+/// NOTE that even if `fd_in` or `fd_out` refers to a pipe, the splice operation
+/// can still fail with EINVAL if one of the fds doesn't explicitly support the
+/// splice operation, e.g. reading from terminal is unsupported from kernel 5.7
+/// to 5.11. See https://github.com/axboe/liburing/issues/291
+///
+/// Returns a pointer to the SQE so that you can further modify the SQE for
+/// advanced use cases.
+pub fn splice(
+    self: *IoUring,
+    user_data: u64,
+    fd_in: linux.fd_t,
+    off_in: u64,
+    fd_out: linux.fd_t,
+    off_out: u64,
+    len: u32,
+    flags: uflags.Splice,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_fsync(fd, flags);
+    sqe.prep_splice(
+        fd_in,
+        off_in,
+        fd_out,
+        off_out,
+        len,
+        flags,
+    );
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a no-op.
-/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
-/// A no-op is more useful than may appear at first glance.
-/// For example, you could call `drain_previous_sqes()` on the returned SQE, to use the no-op to
-/// know when the ring is idle before acting on a kill signal.
-pub fn nop(self: *IoUring, user_data: u64) !*linux.io_uring_sqe {
+pub fn tee(
+    self: *IoUring,
+    user_data: u64,
+    fd_in: linux.fd_t,
+    fd_out: linux.fd_t,
+    len: u32,
+    flags: uflags.Splice,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_nop();
+    sqe.prep_tee(fd_in, fd_out, len, flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Used to select how the read should be handled.
-pub const ReadBuffer = union(enum) {
-    /// io_uring will read directly into this buffer
-    buffer: []u8,
-
-    /// io_uring will read directly into these buffers using readv.
-    iovecs: []const posix.iovec,
-
-    /// io_uring will select a buffer that has previously been provided with `provide_buffers`.
-    /// The buffer group reference by `group_id` must contain at least one buffer for the read to work.
-    /// `len` controls the number of bytes to read into the selected buffer.
-    buffer_selection: struct {
-        group_id: u16,
-        len: usize,
-    },
-};
-
-/// Queues (but does not submit) an SQE to perform a `read(2)` or `preadv(2)` depending on the buffer type.
-/// * Reading into a `ReadBuffer.buffer` uses `read(2)`
+/// Queues (but does not submit) an SQE to perform a `pread(2)` or `preadv(2)`
+/// depending on the buffer type.
+/// * Reading into a `ReadBuffer.buffer` uses `pread(2)`
 /// * Reading into a `ReadBuffer.iovecs` uses `preadv(2)`
-/// If you want to do a `preadv2(2)` then set `rw_flags` on the returned SQE. See https://man7.org/linux/man-pages/man2/preadv2.2.html
+///
+/// If you want to do a `preadv2(2)` then set `rw_flags` on the returned SQE.
+/// See https://man7.org/linux/man-pages/man2/preadv2.2.html
 ///
 /// Returns a pointer to the SQE.
 pub fn read(
@@ -389,14 +469,14 @@ pub fn read(
     fd: linux.fd_t,
     buffer: ReadBuffer,
     offset: u64,
-) !*linux.io_uring_sqe {
+) !*Sqe {
     const sqe = try self.get_sqe();
     switch (buffer) {
         .buffer => |slice| sqe.prep_read(fd, slice, offset),
         .iovecs => |vecs| sqe.prep_readv(fd, vecs, offset),
         .buffer_selection => |selection| {
-            sqe.prep_rw(.READ, fd, 0, selection.len, offset);
-            sqe.flags |= linux.IOSQE_BUFFER_SELECT;
+            sqe.prep_rw(.read, fd, 0, selection.len, offset);
+            sqe.flags.buffer_select = true;
             sqe.buf_index = selection.group_id;
         },
     }
@@ -404,375 +484,303 @@ pub fn read(
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `write(2)`.
-/// Returns a pointer to the SQE.
-pub fn write(
-    self: *IoUring,
-    user_data: u64,
-    fd: linux.fd_t,
-    buffer: []const u8,
-    offset: u64,
-) !*linux.io_uring_sqe {
-    const sqe = try self.get_sqe();
-    sqe.prep_write(fd, buffer, offset);
-    sqe.user_data = user_data;
-    return sqe;
-}
-
-/// Queues (but does not submit) an SQE to perform a `splice(2)`
-/// Either `fd_in` or `fd_out` must be a pipe.
-/// If `fd_in` refers to a pipe, `off_in` is ignored and must be set to std.math.maxInt(u64).
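// Editorial sketch, not part of the patch: a sendfile-style copy through an
// intermediate pipe using two linked `splice` SQEs, per the doc comment above.
// `file_in`, `file_out`, `pipe_fds`, and `len` are assumed to exist;
// `link_with_next_sqe()` is the SQE helper referenced elsewhere in this file.
//
//     const no_offset = std.math.maxInt(u64); // the pipe side takes no offset
//     const first = try ring.splice(1, file_in, 0, pipe_fds[1], no_offset, len, .{});
//     first.link_with_next_sqe();
//     _ = try ring.splice(2, pipe_fds[0], no_offset, file_out, 0, len, .{});
//     _ = try ring.submit_and_wait(2);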
-/// If `fd_in` does not refer to a pipe and `off_in` is maxInt(u64), then `len` are read
-/// from `fd_in` starting from the file offset, which is incremented by the number of bytes read.
-/// If `fd_in` does not refer to a pipe and `off_in` is not maxInt(u64), then the starting offset of `fd_in` will be `off_in`.
-/// This splice operation can be used to implement sendfile by splicing to an intermediate pipe first,
-/// then splice to the final destination. In fact, the implementation of sendfile in kernel uses splice internally.
-///
-/// NOTE that even if fd_in or fd_out refers to a pipe, the splice operation can still fail with EINVAL if one of the
-/// fd doesn't explicitly support splice peration, e.g. reading from terminal is unsupported from kernel 5.7 to 5.11.
-/// See https://github.com/axboe/liburing/issues/291
-///
-/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
-pub fn splice(self: *IoUring, user_data: u64, fd_in: linux.fd_t, off_in: u64, fd_out: linux.fd_t, off_out: u64, len: usize) !*linux.io_uring_sqe {
-    const sqe = try self.get_sqe();
-    sqe.prep_splice(fd_in, off_in, fd_out, off_out, len);
-    sqe.user_data = user_data;
-    return sqe;
-}
-
 /// Queues (but does not submit) an SQE to perform a IORING_OP_READ_FIXED.
-/// The `buffer` provided must be registered with the kernel by calling `register_buffers` first.
-/// The `buffer_index` must be the same as its index in the array provided to `register_buffers`.
+/// The `buffer` provided must be registered with the kernel by calling
+/// `register_buffers()` first. The `buffer_index` must be the same as its
+/// index in the array provided to `register_buffers()`.
 ///
-/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
+/// Returns a pointer to the SQE so that you can further modify the SQE for
+/// advanced use cases.
 pub fn read_fixed(
     self: *IoUring,
     user_data: u64,
     fd: linux.fd_t,
-    buffer: *posix.iovec,
+    buffer: ReadBuffer,
     offset: u64,
     buffer_index: u16,
-) !*linux.io_uring_sqe {
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_read_fixed(fd, buffer, offset, buffer_index);
+    switch (buffer) {
+        .buffer => |slice| sqe.prep_read_fixed(fd, slice, offset, buffer_index),
+        .iovecs => |vecs| sqe.prep_readv_fixed(fd, vecs, offset, buffer_index),
+        .buffer_selection => |selection| {
+            sqe.prep_rw(.read_fixed, fd, 0, selection.len, offset);
+            sqe.flags.buffer_select = true;
+            sqe.buf_index = selection.group_id;
+        },
+    }
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `pwritev()`.
-/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
-/// For example, if you want to do a `pwritev2()` then set `rw_flags` on the returned SQE.
-/// See https://linux.die.net/man/2/pwritev.
-pub fn writev(
+/// Queues (but does not submit) an SQE to perform a `pwrite(2)` or `pwritev(2)`
+/// depending on the write buffer type.
+/// * Writing from a `WriteBuffer.buffer` uses `pwrite(2)`
+/// * Writing from a `WriteBuffer.iovecs` uses `pwritev(2)`
+///
+/// Returns a pointer to the SQE so that you can further modify the SQE for
+/// advanced use cases.
+/// For example, if you want to do a `pwritev2()` then set `rw_flags` on the
+/// returned SQE. See https://linux.die.net/man/2/pwritev.
+pub fn write(
     self: *IoUring,
     user_data: u64,
     fd: linux.fd_t,
-    iovecs: []const posix.iovec_const,
+    buffer: WriteBuffer,
     offset: u64,
-) !*linux.io_uring_sqe {
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_writev(fd, iovecs, offset);
+    switch (buffer) {
+        .buffer => |slice| sqe.prep_write(fd, slice, offset),
+        .iovecs => |vecs| sqe.prep_writev(fd, vecs, offset),
+    }
     sqe.user_data = user_data;
     return sqe;
 }
 
 /// Queues (but does not submit) an SQE to perform a IORING_OP_WRITE_FIXED.
-/// The `buffer` provided must be registered with the kernel by calling `register_buffers` first.
-/// The `buffer_index` must be the same as its index in the array provided to `register_buffers`.
+/// The `buffer` provided must be registered with the kernel by calling
+/// `register_buffers()` first. The `buffer_index` must be the same as its index
+/// in the array provided to `register_buffers()`.
 ///
-/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
+/// Returns a pointer to the SQE so that you can further modify the SQE for
+/// advanced use cases.
 pub fn write_fixed(
     self: *IoUring,
     user_data: u64,
     fd: linux.fd_t,
-    buffer: *posix.iovec,
+    buffer: WriteBuffer,
     offset: u64,
     buffer_index: u16,
-) !*linux.io_uring_sqe {
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_write_fixed(fd, buffer, offset, buffer_index);
+    switch (buffer) {
+        .buffer => |slice| {
+            sqe.prep_write_fixed(fd, slice, offset, buffer_index);
+        },
+        .iovecs => |vecs| {
+            sqe.prep_writev_fixed(fd, vecs, offset, buffer_index);
+        },
+    }
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform an `accept4(2)` on a socket.
+/// Queues (but does not submit) an SQE to perform a `recvmsg(2)`.
 /// Returns a pointer to the SQE.
-/// Available since 5.5
-pub fn accept(
+/// Available since 5.3
+pub fn recvmsg(
     self: *IoUring,
     user_data: u64,
     fd: linux.fd_t,
-    addr: ?*posix.sockaddr,
-    addrlen: ?*posix.socklen_t,
-    flags: u32,
-) !*linux.io_uring_sqe {
+    msg: *linux.msghdr,
+    flags: linux.Msg,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_accept(fd, addr, addrlen, flags);
+    sqe.prep_recvmsg(fd, msg, flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues an multishot accept on a socket.
-///
-/// Multishot variant allows an application to issue a single accept request,
-/// which will repeatedly trigger a CQE when a connection request comes in.
-/// While IORING_CQE_F_MORE flag is set in CQE flags accept will generate
-/// further CQEs.
-///
-/// Available since 5.19
-pub fn accept_multishot(
+/// Queues (but does not submit) an SQE to perform a multishot `recvmsg(2)`.
+/// Returns a pointer to the SQE.
+pub fn recvmsg_multishot(
     self: *IoUring,
     user_data: u64,
     fd: linux.fd_t,
-    addr: ?*posix.sockaddr,
-    addrlen: ?*posix.socklen_t,
-    flags: u32,
-) !*linux.io_uring_sqe {
+    msg: *linux.msghdr,
+    flags: linux.Msg,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_multishot_accept(fd, addr, addrlen, flags);
+    sqe.prep_recvmsg_multishot(fd, msg, flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues an accept using direct (registered) file descriptors.
-///
-/// To use an accept direct variant, the application must first have registered
-/// a file table (with register_files). An unused table index will be
-/// dynamically chosen and returned in the CQE res field.
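// Editorial sketch, not part of the patch: the two `WriteBuffer` variants
// accepted by the new `write` above, assuming `ring` and `fd` are set up and
// that `posix.iovec_const` uses `base`/`len` field names.
//
//     _ = try ring.write(7, fd, .{ .buffer = "hello" }, 0); // pwrite(2)
//     var iovs = [_]posix.iovec_const{
//         .{ .base = "hel", .len = 3 },
//         .{ .base = "lo!", .len = 3 },
//     };
//     _ = try ring.write(8, fd, .{ .iovecs = &iovs }, 0); // pwritev(2)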
-///
-/// After creation, they can be used by setting IOSQE_FIXED_FILE in the SQE
-/// flags member, and setting the SQE fd field to the direct descriptor value
-/// rather than the regular file descriptor.
-///
-/// Available since 5.19
-pub fn accept_direct(
+/// Queues (but does not submit) an SQE to perform a `sendmsg(2)`.
+/// Returns a pointer to the SQE.
+/// Available since 5.3
+pub fn sendmsg(
     self: *IoUring,
     user_data: u64,
     fd: linux.fd_t,
-    addr: ?*posix.sockaddr,
-    addrlen: ?*posix.socklen_t,
-    flags: u32,
-) !*linux.io_uring_sqe {
+    msg: *const linux.msghdr_const,
+    flags: linux.Msg,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_accept_direct(fd, addr, addrlen, flags, linux.IORING_FILE_INDEX_ALLOC);
+    sqe.prep_sendmsg(fd, msg, flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues an multishot accept using direct (registered) file descriptors.
-/// Available since 5.19
-pub fn accept_multishot_direct(
+/// Queues (but does not submit) an SQE to perform a `poll(2)`.
+/// Returns a pointer to the SQE.
+pub fn poll_add(
     self: *IoUring,
     user_data: u64,
     fd: linux.fd_t,
-    addr: ?*posix.sockaddr,
-    addrlen: ?*posix.socklen_t,
-    flags: u32,
-) !*linux.io_uring_sqe {
+    poll_mask: linux.Epoll,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_multishot_accept_direct(fd, addr, addrlen, flags);
+    sqe.prep_poll_add(fd, poll_mask);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queue (but does not submit) an SQE to perform a `connect(2)` on a socket.
+/// Queues (but does not submit) an SQE to perform a multishot `poll(2)`.
 /// Returns a pointer to the SQE.
-pub fn connect(
+pub fn poll_multishot(
     self: *IoUring,
     user_data: u64,
     fd: linux.fd_t,
-    addr: *const posix.sockaddr,
-    addrlen: posix.socklen_t,
-) !*linux.io_uring_sqe {
-    const sqe = try self.get_sqe();
-    sqe.prep_connect(fd, addr, addrlen);
-    sqe.user_data = user_data;
+    poll_mask: linux.Epoll,
+) !*Sqe {
    const sqe = try self.poll_add(user_data, fd, poll_mask);
+    sqe.len = @bitCast(uflags.Poll{ .add_multi = true });
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `epoll_ctl(2)`.
+/// Queues (but does not submit) an SQE to remove an existing poll operation.
 /// Returns a pointer to the SQE.
-pub fn epoll_ctl(
+pub fn poll_remove(
     self: *IoUring,
     user_data: u64,
-    epfd: linux.fd_t,
-    fd: linux.fd_t,
-    op: u32,
-    ev: ?*linux.epoll_event,
-) !*linux.io_uring_sqe {
+    target_user_data: u64,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_epoll_ctl(epfd, fd, op, ev);
+    sqe.prep_poll_remove(target_user_data);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Used to select how the recv call should be handled.
-pub const RecvBuffer = union(enum) {
-    /// io_uring will recv directly into this buffer
-    buffer: []u8,
-
-    /// io_uring will select a buffer that has previously been provided with `provide_buffers`.
-    /// The buffer group referenced by `group_id` must contain at least one buffer for the recv call to work.
-    /// `len` controls the number of bytes to read into the selected buffer.
-    buffer_selection: struct {
-        group_id: u16,
-        len: usize,
-    },
-};
-
-/// Queues (but does not submit) an SQE to perform a `recv(2)`.
-/// Returns a pointer to the SQE.
-/// Available since 5.6
-pub fn recv(
+/// Queues (but does not submit) an SQE to update the user data of an existing
+/// poll operation. Returns a pointer to the SQE.
+pub fn poll_update(
     self: *IoUring,
     user_data: u64,
-    fd: linux.fd_t,
-    buffer: RecvBuffer,
-    flags: u32,
-) !*linux.io_uring_sqe {
+    old_user_data: u64,
+    new_user_data: u64,
+    poll_mask: linux.Epoll,
+    flags: uflags.Poll,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    switch (buffer) {
-        .buffer => |slice| sqe.prep_recv(fd, slice, flags),
-        .buffer_selection => |selection| {
-            sqe.prep_rw(.RECV, fd, 0, selection.len, 0);
-            sqe.rw_flags = flags;
-            sqe.flags |= linux.IOSQE_BUFFER_SELECT;
-            sqe.buf_index = selection.group_id;
-        },
-    }
+    sqe.prep_poll_update(old_user_data, new_user_data, poll_mask, flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `send(2)`.
-/// Returns a pointer to the SQE.
-/// Available since 5.6
-pub fn send(
-    self: *IoUring,
-    user_data: u64,
-    fd: linux.fd_t,
-    buffer: []const u8,
-    flags: u32,
-) !*linux.io_uring_sqe {
+/// Queues (but does not submit) an SQE to perform an `fsync(2)`.
+/// Returns a pointer to the SQE so that you can further modify the SQE for
+/// advanced use cases.
+/// For example, for `fdatasync()` you can set `IORING_FSYNC_DATASYNC` in the
+/// SQE's `rw_flags`.
+/// N.B. While SQEs are initiated in the order in which they appear in the
+/// submission queue, operations execute in parallel and completions are
+/// unordered. Therefore, an application that submits a write followed by an
+/// fsync in the submission queue cannot expect the fsync to apply to the write,
+/// since the fsync may complete before the write is issued to the disk.
+/// You should preferably use `link_with_next_sqe()` on a write's SQE to link
+/// it with an fsync, or else insert a full write barrier using
+/// `drain_previous_sqes()` when queueing an fsync.
+pub fn fsync(self: *IoUring, user_data: u64, fd: linux.fd_t, flags: uflags.Fsync) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_send(fd, buffer, flags);
+    sqe.prep_fsync(fd, flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`.
+/// Queues (but does not submit) an SQE to perform a no-op.
+/// Returns a pointer to the SQE so that you can further modify the SQE for
+/// advanced use cases.
+/// A no-op is more useful than may appear at first glance.
+/// For example, you could call `drain_previous_sqes()` on the returned SQE, to
+/// use the no-op to know when the ring is idle before acting on a kill signal.
+pub fn nop(self: *IoUring, user_data: u64) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_nop();
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to register a timeout operation.
+/// Returns a pointer to the SQE.
 ///
-/// This operation will most likely produce two CQEs. The flags field of the
-/// first cqe may likely contain IORING_CQE_F_MORE, which means that there will
-/// be a second cqe with the user_data field set to the same value. The user
-/// must not modify the data buffer until the notification is posted. The first
-/// cqe follows the usual rules and so its res field will contain the number of
-/// bytes sent or a negative error code. The notification's res field will be
-/// set to zero and the flags field will contain IORING_CQE_F_NOTIF. The two
-/// step model is needed because the kernel may hold on to buffers for a long
-/// time, e.g. waiting for a TCP ACK. Notifications responsible for controlling
-/// the lifetime of the buffers. Even errored requests may generate a
-/// notification.
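// Editorial sketch, not part of the patch: linking a write to an fsync so the
// fsync is ordered after the write, per the `fsync` doc comment above. `fd`
// and `data` are assumed.
//
//     const w = try ring.write(1, fd, .{ .buffer = data }, 0);
//     w.link_with_next_sqe();
//     _ = try ring.fsync(2, fd, .{});
//     _ = try ring.submit_and_wait(2);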
+/// The timeout will complete when either the timeout expires, or after the
+/// specified number of events complete (if `count` is greater than `0`).
 ///
-/// Available since 6.0
-pub fn send_zc(
+/// `flags` may be empty (`.{}`) for a relative timeout, or have the
+/// `IORING_TIMEOUT_ABS` bit set for an absolute timeout.
+///
+/// The completion event result will be `-ETIME` if the timeout completed
+/// through expiration, `0` if the timeout completed after the specified number
+/// of events, or `-ECANCELED` if the timeout was removed before it expired.
+///
+/// io_uring timeouts use the `CLOCK.MONOTONIC` clock source.
+pub fn timeout(
     self: *IoUring,
     user_data: u64,
-    fd: linux.fd_t,
-    buffer: []const u8,
-    send_flags: u32,
-    zc_flags: u16,
-) !*linux.io_uring_sqe {
+    ts: *const linux.kernel_timespec,
+    count: u32,
+    flags: uflags.Timeout,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_send_zc(fd, buffer, send_flags, zc_flags);
+    sqe.prep_timeout(ts, count, flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`.
+/// Queues (but does not submit) an SQE to remove an existing timeout operation.
 /// Returns a pointer to the SQE.
-/// Available since 6.0
-pub fn send_zc_fixed(
+///
+/// The timeout is identified by its `user_data`.
+///
+/// The completion event result will be `0` if the timeout was found and
+/// cancelled successfully; otherwise:
+/// `-EBUSY` if the timeout was found but expiration was already in progress, or
+/// `-ENOENT` if the timeout was not found.
+pub fn timeout_remove(
     self: *IoUring,
     user_data: u64,
-    fd: linux.fd_t,
-    buffer: []const u8,
-    send_flags: u32,
-    zc_flags: u16,
-    buf_index: u16,
-) !*linux.io_uring_sqe {
+    timeout_user_data: u64,
+    flags: uflags.Timeout,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_send_zc_fixed(fd, buffer, send_flags, zc_flags, buf_index);
+    sqe.prep_timeout_remove(timeout_user_data, flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `recvmsg(2)`.
-/// Returns a pointer to the SQE.
-/// Available since 5.3
-pub fn recvmsg(
+pub fn timeout_update(
     self: *IoUring,
     user_data: u64,
-    fd: linux.fd_t,
-    msg: *linux.msghdr,
-    flags: u32,
-) !*linux.io_uring_sqe {
+    timeout_user_data: u64,
+    ts: *const linux.kernel_timespec,
+    flags: uflags.Timeout,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_recvmsg(fd, msg, flags);
+    sqe.prep_timeout_update(timeout_user_data, ts, flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `sendmsg(2)`.
+/// Queues (but does not submit) an SQE to perform an `accept4(2)` on a socket.
 /// Returns a pointer to the SQE.
-/// Available since 5.3
-pub fn sendmsg(
+/// Available since 5.5
+pub fn accept(
     self: *IoUring,
     user_data: u64,
     fd: linux.fd_t,
-    msg: *const linux.msghdr_const,
-    flags: u32,
-) !*linux.io_uring_sqe {
+    addr: ?*linux.sockaddr,
+    addrlen: ?*linux.socklen_t,
+    flags: linux.Sock,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_sendmsg(fd, msg, flags);
+    sqe.prep_accept(fd, addr, addrlen, flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform an async zerocopy `sendmsg(2)`.
-/// Returns a pointer to the SQE.
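// Editorial sketch, not part of the patch: a one-second relative timeout using
// the new typed flags. The `kernel_timespec` field names and the empty flags
// struct meaning "relative" are assumptions.
//
//     var ts: linux.kernel_timespec = .{ .sec = 1, .nsec = 0 };
//     _ = try ring.timeout(9, &ts, 0, .{});
//     _ = try ring.submit();
//     // The CQE res will be -ETIME once the timeout expires.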
-/// Available since 6.1
-pub fn sendmsg_zc(
-    self: *IoUring,
-    user_data: u64,
-    fd: linux.fd_t,
-    msg: *const linux.msghdr_const,
-    flags: u32,
-) !*linux.io_uring_sqe {
-    const sqe = try self.get_sqe();
-    sqe.prep_sendmsg_zc(fd, msg, flags);
-    sqe.user_data = user_data;
-    return sqe;
-}
-
-/// Queues (but does not submit) an SQE to perform an `openat(2)`.
-/// Returns a pointer to the SQE.
-/// Available since 5.6.
-pub fn openat(
-    self: *IoUring,
-    user_data: u64,
-    fd: linux.fd_t,
-    path: [*:0]const u8,
-    flags: linux.O,
-    mode: posix.mode_t,
-) !*linux.io_uring_sqe {
-    const sqe = try self.get_sqe();
-    sqe.prep_openat(fd, path, flags, mode);
-    sqe.user_data = user_data;
-    return sqe;
-}
-
-/// Queues an openat using direct (registered) file descriptors.
+/// Queues an accept using direct (registered) file descriptors.
 ///
 /// To use an accept direct variant, the application must first have registered
 /// a file table (with register_files). An unused table index will be
@@ -782,83 +790,88 @@ pub fn openat(
 /// flags member, and setting the SQE fd field to the direct descriptor value
 /// rather than the regular file descriptor.
 ///
-/// Available since 5.15
-pub fn openat_direct(
+/// Available since 5.19
+pub fn accept_direct(
     self: *IoUring,
     user_data: u64,
     fd: linux.fd_t,
-    path: [*:0]const u8,
-    flags: linux.O,
-    mode: posix.mode_t,
-    file_index: u32,
-) !*linux.io_uring_sqe {
+    addr: ?*linux.sockaddr,
+    addrlen: ?*linux.socklen_t,
+    flags: linux.Sock,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_openat_direct(fd, path, flags, mode, file_index);
+    sqe.prep_accept_direct(fd, addr, addrlen, flags, constants.FILE_INDEX_ALLOC);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `close(2)`.
-/// Returns a pointer to the SQE.
-/// Available since 5.6.
-pub fn close(self: *IoUring, user_data: u64, fd: linux.fd_t) !*linux.io_uring_sqe {
+/// Queues a multishot accept on a socket.
+///
+/// Multishot variant allows an application to issue a single accept request,
+/// which will repeatedly trigger a CQE when a connection request comes in.
+/// While the IORING_CQE_F_MORE flag is set in the CQE flags, accept will
+/// generate further CQEs.
+///
+/// Available since 5.19
+pub fn accept_multishot(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    addr: ?*linux.sockaddr,
+    addrlen: ?*linux.socklen_t,
+    flags: linux.Sock,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_close(fd);
+    sqe.prep_multishot_accept(fd, addr, addrlen, flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues close of registered file descriptor.
-/// Available since 5.15
-pub fn close_direct(self: *IoUring, user_data: u64, file_index: u32) !*linux.io_uring_sqe {
+/// Queues a multishot accept using direct (registered) file descriptors.
+/// Available since 5.19
+pub fn accept_multishot_direct(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    addr: ?*linux.sockaddr,
+    addrlen: ?*linux.socklen_t,
+    flags: linux.Sock,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_close_direct(file_index);
+    sqe.prep_multishot_accept_direct(fd, addr, addrlen, flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to register a timeout operation.
+/// Queues (but does not submit) an SQE to remove an existing operation.
 /// Returns a pointer to the SQE.
 ///
-/// The timeout will complete when either the timeout expires, or after the specified number of
-/// events complete (if `count` is greater than `0`).
-///
-/// `flags` may be `0` for a relative timeout, or `IORING_TIMEOUT_ABS` for an absolute timeout.
-///
-/// The completion event result will be `-ETIME` if the timeout completed through expiration,
-/// `0` if the timeout completed after the specified number of events, or `-ECANCELED` if the
-/// timeout was removed before it expired.
+/// The operation is identified by its `user_data`.
 ///
-/// io_uring timeouts use the `CLOCK.MONOTONIC` clock source.
-pub fn timeout(
+/// The completion event result will be `0` if the operation was found and
+/// cancelled successfully; otherwise:
+/// `-EALREADY` if the operation was found but was already in progress, or
+/// `-ENOENT` if the operation was not found.
+pub fn cancel(
     self: *IoUring,
     user_data: u64,
-    ts: *const linux.kernel_timespec,
-    count: u32,
-    flags: u32,
-) !*linux.io_uring_sqe {
+    cancel_user_data: u64,
+    flags: uflags.AsyncCancel,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_timeout(ts, count, flags);
+    sqe.prep_cancel(cancel_user_data, flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to remove an existing timeout operation.
-/// Returns a pointer to the SQE.
-///
-/// The timeout is identified by its `user_data`.
-///
-/// The completion event result will be `0` if the timeout was found and canceled successfully,
-/// `-EBUSY` if the timeout was found but expiration was already in progress, or
-/// `-ENOENT` if the timeout was not found.
-pub fn timeout_remove(
+pub fn cancel_fd(
     self: *IoUring,
     user_data: u64,
-    timeout_user_data: u64,
-    flags: u32,
-) !*linux.io_uring_sqe {
+    fd: linux.fd_t,
+    flags: uflags.AsyncCancel,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_timeout_remove(timeout_user_data, flags);
+    sqe.prep_cancel_fd(fd, flags);
     sqe.user_data = user_data;
     return sqe;
 }
@@ -866,659 +879,2709 @@ pub fn timeout_remove(
 /// Queues (but does not submit) an SQE to add a link timeout operation.
 /// Returns a pointer to the SQE.
 ///
-/// You need to set linux.IOSQE_IO_LINK to flags of the target operation
-/// and then call this method right after the target operation.
+/// You need to set IOSQE_IO_LINK in the flags of the target operation and then
+/// call this method right after the target operation.
 /// See https://lwn.net/Articles/803932/ for detail.
 ///
 /// If the dependent request finishes before the linked timeout, the timeout
 /// is canceled. If the timeout finishes before the dependent request, the
 /// dependent request will be canceled.
 ///
-/// The completion event result of the link_timeout will be
-/// `-ETIME` if the timeout finishes before the dependent request
-/// (in this case, the completion event result of the dependent request will
-/// be `-ECANCELED`), or
+/// The completion event result of the link_timeout will be either of:
+/// `-ETIME` if the timeout finishes before the dependent request (in this case,
+/// the completion event result of the dependent request will be `-ECANCELED`), or
+/// `-EALREADY` if the dependent request finishes before the linked timeout.
 pub fn link_timeout(
     self: *IoUring,
     user_data: u64,
     ts: *const linux.kernel_timespec,
-    flags: u32,
-) !*linux.io_uring_sqe {
+    flags: uflags.Timeout,
+) !*Sqe {
     const sqe = try self.get_sqe();
     sqe.prep_link_timeout(ts, flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `poll(2)`.
+/// Queues (but does not submit) an SQE to perform a `connect(2)` on a socket.
 /// Returns a pointer to the SQE.
-pub fn poll_add(
+pub fn connect(
     self: *IoUring,
     user_data: u64,
     fd: linux.fd_t,
-    poll_mask: u32,
-) !*linux.io_uring_sqe {
+    addr: *const linux.sockaddr,
+    addrlen: linux.socklen_t,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_poll_add(fd, poll_mask);
+    sqe.prep_connect(fd, addr, addrlen);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to remove an existing poll operation.
+/// Queues (but does not submit) an SQE to perform a `bind(2)` on a socket.
 /// Returns a pointer to the SQE.
-pub fn poll_remove(
+/// Available since 6.11
+pub fn bind(
     self: *IoUring,
     user_data: u64,
-    target_user_data: u64,
-) !*linux.io_uring_sqe {
+    fd: linux.fd_t,
+    addr: *const linux.sockaddr,
+    addrlen: linux.socklen_t,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_poll_remove(target_user_data);
+    sqe.prep_bind(fd, addr, addrlen);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to update the user data of an existing poll
-/// operation. Returns a pointer to the SQE.
-pub fn poll_update(
+/// Queues (but does not submit) an SQE to perform a `listen(2)` on a socket.
+/// Returns a pointer to the SQE.
+/// Available since 6.11
+pub fn listen(
     self: *IoUring,
     user_data: u64,
-    old_user_data: u64,
-    new_user_data: u64,
-    poll_mask: u32,
-    flags: u32,
-) !*linux.io_uring_sqe {
+    fd: linux.fd_t,
+    backlog: u32,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_poll_update(old_user_data, new_user_data, poll_mask, flags);
+    sqe.prep_listen(fd, backlog);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform an `fallocate(2)`.
+/// Queues (but does not submit) an SQE to perform an `epoll_wait(2)`.
 /// Returns a pointer to the SQE.
-pub fn fallocate(
+pub fn epoll_wait(
     self: *IoUring,
     user_data: u64,
     fd: linux.fd_t,
-    mode: i32,
-    offset: u64,
-    len: u64,
-) !*linux.io_uring_sqe {
+    events: ?*linux.epoll_event,
+    max_events: u32,
+    flags: linux.Epoll,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_fallocate(fd, mode, offset, len);
+    sqe.prep_epoll_wait(fd, events, max_events, flags);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform an `statx(2)`.
+/// Queues (but does not submit) an SQE to perform an `epoll_ctl(2)`.
 /// Returns a pointer to the SQE.
-pub fn statx(
+pub fn epoll_ctl(
     self: *IoUring,
     user_data: u64,
+    epfd: linux.fd_t,
     fd: linux.fd_t,
-    path: [:0]const u8,
-    flags: u32,
-    mask: u32,
-    buf: *linux.Statx,
-) !*linux.io_uring_sqe {
+    op: linux.EpollOp,
+    ev: ?*linux.epoll_event,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_statx(fd, path, flags, mask, buf);
+    sqe.prep_epoll_ctl(epfd, fd, op, ev);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to remove an existing operation.
-/// Returns a pointer to the SQE.
-///
-/// The operation is identified by its `user_data`.
-///
-/// The completion event result will be `0` if the operation was found and canceled successfully,
-/// `-EALREADY` if the operation was found but was already in progress, or
-/// `-ENOENT` if the operation was not found.
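// Editorial sketch, not part of the patch: setting up a listening socket
// entirely on the ring with the new bind/listen operations (kernel 6.11+).
// `sock` and `addr: std.net.Address` are assumed; the SQEs are linked so they
// execute in order.
//
//     const b = try ring.bind(1, sock, &addr.any, addr.getOsSockLen());
//     b.link_with_next_sqe();
//     const l = try ring.listen(2, sock, 128);
//     l.link_with_next_sqe();
//     _ = try ring.accept_multishot(3, sock, null, null, .{});
//     _ = try ring.submit();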
-pub fn cancel(
+pub fn files_update(
     self: *IoUring,
     user_data: u64,
-    cancel_user_data: u64,
-    flags: u32,
-) !*linux.io_uring_sqe {
+    fds: []const linux.fd_t,
+    offset: u32,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_cancel(cancel_user_data, flags);
+    sqe.prep_files_update(fds, offset);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `shutdown(2)`.
+/// Queues (but does not submit) an SQE to perform an `fallocate(2)`.
 /// Returns a pointer to the SQE.
-///
-/// The operation is identified by its `user_data`.
-pub fn shutdown(
+pub fn fallocate(
     self: *IoUring,
     user_data: u64,
-    sockfd: posix.socket_t,
-    how: u32,
-) !*linux.io_uring_sqe {
+    fd: linux.fd_t,
+    mode: i32,
+    offset: u64,
+    len: u64,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_shutdown(sockfd, how);
+    sqe.prep_fallocate(fd, mode, offset, len);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `renameat2(2)`.
+/// Queues (but does not submit) an SQE to perform an `openat(2)`.
 /// Returns a pointer to the SQE.
-pub fn renameat(
+/// Available since 5.6.
+pub fn openat(
     self: *IoUring,
     user_data: u64,
-    old_dir_fd: linux.fd_t,
-    old_path: [*:0]const u8,
-    new_dir_fd: linux.fd_t,
-    new_path: [*:0]const u8,
-    flags: u32,
-) !*linux.io_uring_sqe {
+    fd: linux.fd_t,
+    path: [*:0]const u8,
+    flags: linux.O,
+    mode: linux.mode_t,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_renameat(old_dir_fd, old_path, new_dir_fd, new_path, flags);
+    sqe.prep_openat(fd, path, flags, mode);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `unlinkat(2)`.
-/// Returns a pointer to the SQE.
-pub fn unlinkat(
+/// Queues an openat using direct (registered) file descriptors.
+///
+/// To use the openat direct variant, the application must first have registered
+/// a file table (with register_files()). An unused table index will be
+/// dynamically chosen and returned in the CQE res field.
+///
+/// After creation, they can be used by setting IOSQE_FIXED_FILE in the SQE
+/// flags member, and setting the SQE fd field to the direct descriptor value
+/// rather than the regular file descriptor.
+///
+/// Available since 5.15
+pub fn openat_direct(
     self: *IoUring,
     user_data: u64,
-    dir_fd: linux.fd_t,
+    fd: linux.fd_t,
     path: [*:0]const u8,
-    flags: u32,
-) !*linux.io_uring_sqe {
+    flags: linux.O,
+    mode: linux.mode_t,
+    file_index: u32,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_unlinkat(dir_fd, path, flags);
+    sqe.prep_openat_direct(fd, path, flags, mode, file_index);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `mkdirat(2)`.
+/// Queues (but does not submit) an SQE to perform an `open(2)`.
 /// Returns a pointer to the SQE.
-pub fn mkdirat(
+pub fn open(
     self: *IoUring,
     user_data: u64,
-    dir_fd: linux.fd_t,
     path: [*:0]const u8,
-    mode: posix.mode_t,
-) !*linux.io_uring_sqe {
+    flags: linux.O,
+    mode: linux.mode_t,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_mkdirat(dir_fd, path, mode);
+    sqe.prep_openat(linux.At.fdcwd, path, flags, mode);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `symlinkat(2)`.
-/// Returns a pointer to the SQE.
-pub fn symlinkat(
+/// Queues an open using direct (registered) file descriptors.
+///
+/// To use the open direct variant, the application must first have registered
+/// a file table (with register_files()). An unused table index will be
+/// dynamically chosen and returned in the CQE res field.
+///
+/// After creation, they can be used by setting IOSQE_FIXED_FILE in the SQE
+/// flags member, and setting the SQE fd field to the direct descriptor value
+/// rather than the regular file descriptor.
+pub fn open_direct(
     self: *IoUring,
     user_data: u64,
-    target: [*:0]const u8,
-    new_dir_fd: linux.fd_t,
-    link_path: [*:0]const u8,
-) !*linux.io_uring_sqe {
+    path: [*:0]const u8,
+    flags: linux.O,
+    mode: linux.mode_t,
+    file_index: u32,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_symlinkat(target, new_dir_fd, link_path);
+    sqe.prep_openat_direct(linux.At.fdcwd, path, flags, mode, file_index);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `linkat(2)`.
+/// Queues (but does not submit) an SQE to perform a `close(2)`.
 /// Returns a pointer to the SQE.
-pub fn linkat(
-    self: *IoUring,
-    user_data: u64,
-    old_dir_fd: linux.fd_t,
-    old_path: [*:0]const u8,
-    new_dir_fd: linux.fd_t,
-    new_path: [*:0]const u8,
-    flags: u32,
-) !*linux.io_uring_sqe {
+/// Available since 5.6.
+pub fn close(self: *IoUring, user_data: u64, fd: linux.fd_t) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_linkat(old_dir_fd, old_path, new_dir_fd, new_path, flags);
+    sqe.prep_close(fd);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues close of registered file descriptor.
+/// Available since 5.15
+pub fn close_direct(self: *IoUring, user_data: u64, file_index: u32) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_close_direct(file_index);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to provide a group of buffers used for commands that read/receive data.
+/// Queues (but does not submit) an SQE to perform a `statx(2)`.
 /// Returns a pointer to the SQE.
-///
-/// Provided buffers can be used in `read`, `recv` or `recvmsg` commands via .buffer_selection.
-///
-/// The kernel expects a contiguous block of memory of size (buffers_count * buffer_size).
-pub fn provide_buffers(
+pub fn statx(
     self: *IoUring,
     user_data: u64,
-    buffers: [*]u8,
-    buffer_size: usize,
-    buffers_count: usize,
-    group_id: usize,
-    buffer_id: usize,
-) !*linux.io_uring_sqe {
+    fd: linux.fd_t,
+    path: [:0]const u8,
+    flags: linux.At,
+    mask: linux.Statx.Mask,
+    buf: *linux.Statx,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_provide_buffers(buffers, buffer_size, buffers_count, group_id, buffer_id);
+    sqe.prep_statx(fd, path, flags, mask, buf);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to remove a group of provided buffers.
+/// Queues (but does not submit) an SQE to perform a `posix_fadvise(2)`.
 /// Returns a pointer to the SQE.
-pub fn remove_buffers(
+pub fn fadvice(
     self: *IoUring,
     user_data: u64,
-    buffers_count: usize,
-    group_id: usize,
-) !*linux.io_uring_sqe {
+    fd: linux.fd_t,
+    offset: u64,
+    len: u32,
+    advice: linux.Fadvise,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_remove_buffers(buffers_count, group_id);
+    sqe.prep_fadvice(fd, offset, len, advice);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Queues (but does not submit) an SQE to perform a `waitid(2)`.
+/// Queues (but does not submit) an SQE to perform an `madvise(2)`.
 /// Returns a pointer to the SQE.
-pub fn waitid(
+pub fn madvice(
     self: *IoUring,
     user_data: u64,
-    id_type: linux.P,
-    id: i32,
-    infop: *linux.siginfo_t,
-    options: u32,
-    flags: u32,
-) !*linux.io_uring_sqe {
+    memory: []u8,
+    advice: linux.Madvise,
+) !*Sqe {
     const sqe = try self.get_sqe();
-    sqe.prep_waitid(id_type, id, infop, options, flags);
+    sqe.prep_madvice(memory, advice);
     sqe.user_data = user_data;
     return sqe;
 }
 
-/// Registers an array of file descriptors.
-/// Every time a file descriptor is put in an SQE and submitted to the kernel, the kernel must
-/// retrieve a reference to the file, and once I/O has completed the file reference must be
-/// dropped. The atomic nature of this file reference can be a slowdown for high IOPS workloads.
-/// This slowdown can be avoided by pre-registering file descriptors.
-/// To refer to a registered file descriptor, IOSQE_FIXED_FILE must be set in the SQE's flags,
-/// and the SQE's fd must be set to the index of the file descriptor in the registered array.
-/// Registering file descriptors will wait for the ring to idle.
-/// Files are automatically unregistered by the kernel when the ring is torn down.
-/// An application need unregister only if it wants to register a new array of file descriptors.
-pub fn register_files(self: *IoUring, fds: []const linux.fd_t) !void {
-    assert(self.fd >= 0);
-    const res = linux.io_uring_register(
-        self.fd,
-        .REGISTER_FILES,
-        @as(*const anyopaque, @ptrCast(fds.ptr)),
-        @as(u32, @intCast(fds.len)),
-    );
-    try handle_registration_result(res);
+/// Queues (but does not submit) an SQE to perform a `send(2)`.
+/// Returns a pointer to the SQE.
+/// Available since 5.6
+pub fn send(
+    self: *IoUring,
+    user_data: u64,
+    sockfd: linux.fd_t,
+    buffer: []const u8,
+    flags: linux.Msg,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_send(sockfd, buffer, flags);
+    sqe.user_data = user_data;
+    return sqe;
 }
 
-/// Updates registered file descriptors.
-///
-/// Updates are applied starting at the provided offset in the original file descriptors slice.
-/// There are three kind of updates:
-/// * turning a sparse entry (where the fd is -1) into a real one
-/// * removing an existing entry (set the fd to -1)
-/// * replacing an existing entry with a new fd
-/// Adding new file descriptors must be done with `register_files`.
-pub fn register_files_update(self: *IoUring, offset: u32, fds: []const linux.fd_t) !void {
-    assert(self.fd >= 0);
-
-    const FilesUpdate = extern struct {
-        offset: u32,
-        resv: u32,
-        fds: u64 align(8),
-    };
-    var update = FilesUpdate{
-        .offset = offset,
-        .resv = @as(u32, 0),
-        .fds = @as(u64, @intFromPtr(fds.ptr)),
-    };
-
-    const res = linux.io_uring_register(
-        self.fd,
-        .REGISTER_FILES_UPDATE,
-        @as(*const anyopaque, @ptrCast(&update)),
-        @as(u32, @intCast(fds.len)),
-    );
-    try handle_registration_result(res);
+/// Queues (but does not submit) an SQE to perform a bundled `send(2)`.
+/// Returns a pointer to the SQE.
+pub fn send_bundle(
+    self: *IoUring,
+    user_data: u64,
+    sockfd: linux.fd_t,
+    len: u64,
+    flags: linux.Msg,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_send_bundle(sockfd, len, flags);
+    sqe.user_data = user_data;
+    return sqe;
 }
 
-/// Registers an empty (-1) file table of `nr_files` number of file descriptors.
-pub fn register_files_sparse(self: *IoUring, nr_files: u32) !void {
-    assert(self.fd >= 0);
-
-    const reg = &linux.io_uring_rsrc_register{
-        .nr = nr_files,
-        .flags = linux.IORING_RSRC_REGISTER_SPARSE,
-        .resv2 = 0,
-        .data = 0,
-        .tags = 0,
-    };
+/// Queues (but does not submit) an SQE to perform a `sendto(2)`.
+/// Returns a pointer to the SQE.
+pub fn send_to(
+    self: *IoUring,
+    user_data: u64,
+    sockfd: linux.fd_t,
+    buffer: []const u8,
+    flags: linux.Msg,
+    addr: *const linux.sockaddr,
+    addrlen: linux.socklen_t,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_send_to(sockfd, buffer, flags, addr, addrlen);
+    sqe.user_data = user_data;
+    return sqe;
+}
 
-    const res = linux.io_uring_register(
-        self.fd,
-        .REGISTER_FILES2,
-        @ptrCast(reg),
-        @as(u32, @sizeOf(linux.io_uring_rsrc_register)),
-    );
+/// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`.
+///
+/// This operation will most likely produce two CQEs. The flags field of the
+/// first cqe will likely contain IORING_CQE_F_MORE, which means that there
+/// will be a second cqe with the user_data field set to the same value. The
+/// user must not modify the data buffer until the notification is posted. The
+/// first cqe follows the usual rules and so its res field will contain the
+/// number of bytes sent or a negative error code. The notification's res field
+/// will be set to zero and the flags field will contain IORING_CQE_F_NOTIF.
+/// The two-step model is needed because the kernel may hold on to buffers for
+/// a long time, e.g. waiting for a TCP ACK. Notifications are responsible for
+/// controlling the lifetime of the buffers. Even errored requests may generate
+/// a notification.
+///
+/// Available since 6.0.
+pub fn send_zc(
+    self: *IoUring,
+    user_data: u64,
+    sockfd: linux.fd_t,
+    buffer: []const u8,
+    send_flags: linux.Msg,
+    zc_flags: Sqe.SendRecv,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_send_zc(sockfd, buffer, send_flags, zc_flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
 
-    return handle_registration_result(res);
+/// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`
+/// using a fixed (registered) buffer.
+/// Returns a pointer to the SQE.
+/// Available since 6.0.
+pub fn send_zc_fixed(
+    self: *IoUring,
+    user_data: u64,
+    sockfd: linux.fd_t,
+    buffer: []const u8,
+    send_flags: linux.Msg,
+    zc_flags: Sqe.SendRecv,
+    buf_index: u16,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_send_zc_fixed(sockfd, buffer, send_flags, zc_flags, buf_index);
+    sqe.user_data = user_data;
+    return sqe;
 }
 
-// Registers range for fixed file allocations.
-// Available since 6.0
-pub fn register_file_alloc_range(self: *IoUring, offset: u32, len: u32) !void {
-    assert(self.fd >= 0);
+/// Queues (but does not submit) an SQE to perform an async zerocopy `sendmsg(2)`.
+/// Returns a pointer to the SQE.
+/// Available since 6.1.
+pub fn sendmsg_zc(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    msg: *const linux.msghdr_const,
+    flags: linux.Msg,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_sendmsg_zc(fd, msg, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
 
-    const range = &linux.io_uring_file_index_range{
-        .off = offset,
-        .len = len,
-        .resv = 0,
-    };
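+
+// Illustrative sketch: draining both completions of a zerocopy send, per the
+// doc comment on `send_zc` above. Assumes the ring's `submit`/`copy_cqe`
+// helpers and an already-connected `sockfd`; `.{}` assumes default Msg and
+// SendRecv flags.
+fn example_send_zc(ring: *IoUring, sockfd: linux.fd_t, data: []const u8) !void {
+    _ = try ring.send_zc(0xaa, sockfd, data, .{}, .{});
+    _ = try ring.submit();
+    // First CQE: res is the byte count; f_more promises a notification CQE.
+    const first = try ring.copy_cqe();
+    assert(first.flags.f_more);
+    // Second CQE: the notification; only now may `data` be reused.
+    const notif = try ring.copy_cqe();
+    assert(notif.flags.f_notif);
+}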
+/// Queues (but does not submit) an SQE to perform a fixed async zerocopy
+/// `sendmsg(2)`. Returns a pointer to the SQE.
+pub fn sendmsg_zc_fixed(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    msg: *const linux.msghdr_const,
+    flags: linux.Msg,
+    buf_index: u16,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_sendmsg_zc_fixed(fd, msg, flags, buf_index);
+    sqe.user_data = user_data;
+    return sqe;
+}
 
-    const res = linux.io_uring_register(
-        self.fd,
-        .REGISTER_FILE_ALLOC_RANGE,
-        @ptrCast(range),
-        @as(u32, @sizeOf(linux.io_uring_file_index_range)),
-    );
+/// Queues (but does not submit) an SQE to perform a `recv(2)`.
+/// Returns a pointer to the SQE.
+/// Available since 5.6.
+pub fn recv(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    buffer: RecvBuffer,
+    flags: linux.Msg,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    switch (buffer) {
+        .buffer => |slice| sqe.prep_recv(fd, slice, flags),
+        .buffer_selection => |selection| {
+            sqe.prep_rw(.recv, fd, 0, selection.len, 0);
+            sqe.rw_flags = @bitCast(flags);
+            sqe.flags.buffer_select = true;
+            sqe.buf_index = selection.group_id;
+        },
+    }
+    sqe.user_data = user_data;
+    return sqe;
+}
 
-    return handle_registration_result(res);
+pub fn recv_multishot(
+    self: *IoUring,
+    user_data: u64,
+    sockfd: linux.fd_t,
+    buffer: RecvBuffer,
+    flags: linux.Msg,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    switch (buffer) {
+        .buffer => |slice| sqe.prep_recv_multishot(sockfd, slice, flags),
+        .buffer_selection => |selection| {
+            sqe.prep_rw(.recv, sockfd, 0, selection.len, 0);
+            sqe.ioprio = .{ .send_recv = .{ .recv_multishot = true } };
+            sqe.rw_flags = @bitCast(flags);
+            sqe.flags.buffer_select = true;
+            sqe.buf_index = selection.group_id;
+        },
+    }
+    sqe.user_data = user_data;
+    return sqe;
 }
 
-/// Registers the file descriptor for an eventfd that will be notified of completion events on
-/// an io_uring instance.
-/// Only a single a eventfd can be registered at any given point in time.
-pub fn register_eventfd(self: *IoUring, fd: linux.fd_t) !void {
-    assert(self.fd >= 0);
-    const res = linux.io_uring_register(
-        self.fd,
-        .REGISTER_EVENTFD,
-        @as(*const anyopaque, @ptrCast(&fd)),
-        1,
+/// Queues (but does not submit) an SQE to provide a group of buffers used for
+/// commands that read/receive data. Returns a pointer to the SQE.
+///
+/// Provided buffers can be used in `read`, `recv` or `recvmsg` commands via
+/// buffer_selection.
+///
+/// The kernel expects a contiguous block of memory of size (buffers_count *
+/// buffer_len).
+pub fn provide_buffers(
+    self: *IoUring,
+    user_data: u64,
+    /// an array of `buffers_count` buffers of len `buffer_len` laid out as a
+    /// contiguous slice of memory
+    buffers: []u8,
+    /// length of each buffer in `buffers`
+    buffer_len: u32,
+    /// count of buffers in `buffers`
+    buffers_count: u32,
+    group_id: u32,
+    buffer_id: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_provide_buffers(
+        buffers,
+        buffer_len,
+        buffers_count,
+        group_id,
+        buffer_id,
     );
-    try handle_registration_result(res);
+    sqe.user_data = user_data;
+    return sqe;
 }
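+
+// Illustrative sketch of the provided-buffer flow described above: hand the
+// kernel eight buffers, then let `recv` pick one. The provide_buffers SQE is
+// submitted first so the group exists before the recv runs; `submit` and
+// `copy_cqe` are assumed to keep their existing signatures.
+fn example_buffer_select_recv(ring: *IoUring, sockfd: linux.fd_t, pool: []u8) !void {
+    const count = 8;
+    const buf_len: u32 = @intCast(pool.len / count);
+    _ = try ring.provide_buffers(0xb0, pool, buf_len, count, 1, 0);
+    _ = try ring.submit();
+    // No buffer is named here; the kernel selects one from group 1.
+    _ = try ring.recv(0xb1, sockfd, .{ .buffer_selection = .{ .group_id = 1, .len = buf_len } }, .{});
+    _ = try ring.submit();
+    const cqe = try ring.copy_cqe();
+    // `buffer_id` recovers which buffer of the group the data landed in.
+    if (cqe.flags.f_buffer) _ = try cqe.buffer_id();
+}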
-/// Registers the file descriptor for an eventfd that will be notified of completion events on
-/// an io_uring instance. Notifications are only posted for events that complete in an async manner.
-/// This means that events that complete inline while being submitted do not trigger a notification event.
-/// Only a single eventfd can be registered at any given point in time.
-pub fn register_eventfd_async(self: *IoUring, fd: linux.fd_t) !void {
-    assert(self.fd >= 0);
-    const res = linux.io_uring_register(
-        self.fd,
-        .REGISTER_EVENTFD_ASYNC,
-        @as(*const anyopaque, @ptrCast(&fd)),
-        1,
-    );
-    try handle_registration_result(res);
+/// Queues (but does not submit) an SQE to remove a group of provided buffers.
+/// Returns a pointer to the SQE.
+pub fn remove_buffers(
+    self: *IoUring,
+    user_data: u64,
+    buffers_count: u32,
+    group_id: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_remove_buffers(buffers_count, group_id);
+    sqe.user_data = user_data;
+    return sqe;
 }
 
-/// Unregister the registered eventfd file descriptor.
-pub fn unregister_eventfd(self: *IoUring) !void {
-    assert(self.fd >= 0);
-    const res = linux.io_uring_register(
-        self.fd,
-        .UNREGISTER_EVENTFD,
-        null,
-        0,
-    );
-    try handle_registration_result(res);
+/// Queues (but does not submit) an SQE to perform a `shutdown(2)`.
+/// Returns a pointer to the SQE.
+///
+/// The operation is identified by its `user_data`.
+pub fn shutdown(
+    self: *IoUring,
+    user_data: u64,
+    sockfd: linux.socket_t,
+    how: linux.Shut,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_shutdown(sockfd, how);
+    sqe.user_data = user_data;
+    return sqe;
 }
 
-pub fn register_napi(self: *IoUring, napi: *linux.io_uring_napi) !void {
-    assert(self.fd >= 0);
-    const res = linux.io_uring_register(self.fd, .REGISTER_NAPI, napi, 1);
-    try handle_registration_result(res);
+/// Queues (but does not submit) an SQE to perform an `unlinkat(2)`.
+/// Returns a pointer to the SQE.
+pub fn unlinkat(
+    self: *IoUring,
+    user_data: u64,
+    dir_fd: linux.fd_t,
+    path: [*:0]const u8,
+    flags: linux.At,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_unlinkat(dir_fd, path, flags);
+    sqe.user_data = user_data;
+    return sqe;
 }
 
-pub fn unregister_napi(self: *IoUring, napi: *linux.io_uring_napi) !void {
-    assert(self.fd >= 0);
-    const res = linux.io_uring_register(self.fd, .UNREGISTER_NAPI, napi, 1);
-    try handle_registration_result(res);
+/// Queues (but does not submit) an SQE to perform an `unlink(2)`.
+/// Returns a pointer to the SQE.
+pub fn unlink(
+    self: *IoUring,
+    user_data: u64,
+    path: [*:0]const u8,
+    flags: linux.At,
+) !*Sqe {
+    return try self.unlinkat(user_data, linux.At.fdcwd, path, flags);
 }
 
-/// Registers an array of buffers for use with `read_fixed` and `write_fixed`.
-pub fn register_buffers(self: *IoUring, buffers: []const posix.iovec) !void {
-    assert(self.fd >= 0);
-    const res = linux.io_uring_register(
-        self.fd,
-        .REGISTER_BUFFERS,
-        buffers.ptr,
-        @as(u32, @intCast(buffers.len)),
-    );
-    try handle_registration_result(res);
+/// Queues (but does not submit) an SQE to perform a `renameat2(2)`.
+/// Returns a pointer to the SQE.
+pub fn renameat(
+    self: *IoUring,
+    user_data: u64,
+    old_dir_fd: linux.fd_t,
+    old_path: [*:0]const u8,
+    new_dir_fd: linux.fd_t,
+    new_path: [*:0]const u8,
+    flags: linux.Rename,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_renameat(old_dir_fd, old_path, new_dir_fd, new_path, flags);
+    sqe.user_data = user_data;
+    return sqe;
 }
 
-/// Unregister the registered buffers.
-pub fn unregister_buffers(self: *IoUring) !void {
-    assert(self.fd >= 0);
-    const res = linux.io_uring_register(self.fd, .UNREGISTER_BUFFERS, null, 0);
-    switch (linux.E.init(res)) {
-        .SUCCESS => {},
-        .NXIO => return error.BuffersNotRegistered,
-        else => |errno| return posix.unexpectedErrno(errno),
-    }
+/// Queues (but does not submit) an SQE to perform a `rename(2)`.
+/// Returns a pointer to the SQE.
+pub fn rename(
+    self: *IoUring,
+    user_data: u64,
+    old_path: [*:0]const u8,
+    new_path: [*:0]const u8,
+    flags: linux.Rename,
+) !*Sqe {
+    return try self.renameat(user_data, linux.At.fdcwd, old_path, linux.At.fdcwd, new_path, flags);
 }
 
-/// Returns a io_uring_probe which is used to probe the capabilities of the
-/// io_uring subsystem of the running kernel. The io_uring_probe contains the
-/// list of supported operations.
-pub fn get_probe(self: *IoUring) !linux.io_uring_probe {
-    var probe = mem.zeroInit(linux.io_uring_probe, .{});
-    const res = linux.io_uring_register(self.fd, .REGISTER_PROBE, &probe, probe.ops.len);
-    try handle_register_buf_ring_result(res);
-    return probe;
+/// Queues (but does not submit) an SQE to perform a `sync_file_range(2)`.
+/// Returns a pointer to the SQE.
+pub fn sync_file_range(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    len: u32,
+    offset: u64,
+    flags: linux.SyncFileRange,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_sync_file_range(fd, len, offset, flags);
+    sqe.user_data = user_data;
+    return sqe;
 }
 
-fn handle_registration_result(res: usize) !void {
-    switch (linux.E.init(res)) {
-        .SUCCESS => {},
-        // One or more fds in the array are invalid, or the kernel does not support sparse sets:
-        .BADF => return error.FileDescriptorInvalid,
-        .BUSY => return error.FilesAlreadyRegistered,
-        .INVAL => return error.FilesEmpty,
-        // Adding `nr_args` file references would exceed the maximum allowed number of files the
-        // user is allowed to have according to the per-user RLIMIT_NOFILE resource limit and
-        // the CAP_SYS_RESOURCE capability is not set, or `nr_args` exceeds the maximum allowed
-        // for a fixed file set (older kernels have a limit of 1024 files vs 64K files):
-        .MFILE => return error.UserFdQuotaExceeded,
-        // Insufficient kernel resources, or the caller had a non-zero RLIMIT_MEMLOCK soft
-        // resource limit but tried to lock more memory than the limit permitted (not enforced
-        // when the process is privileged with CAP_IPC_LOCK):
-        .NOMEM => return error.SystemResources,
-        // Attempt to register files on a ring already registering files or being torn down:
-        .NXIO => return error.RingShuttingDownOrAlreadyRegisteringFiles,
-        else => |errno| return posix.unexpectedErrno(errno),
+/// Queues (but does not submit) an SQE to perform a `mkdirat(2)`.
+/// Returns a pointer to the SQE.
+pub fn mkdirat(
+    self: *IoUring,
+    user_data: u64,
+    dir_fd: linux.fd_t,
+    path: [*:0]const u8,
+    mode: linux.mode_t,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_mkdirat(dir_fd, path, mode);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `mkdir(2)`.
+/// Returns a pointer to the SQE.
+pub fn mkdir(
+    self: *IoUring,
+    user_data: u64,
+    path: [*:0]const u8,
+    mode: linux.mode_t,
+) !*Sqe {
+    return try self.mkdirat(user_data, linux.At.fdcwd, path, mode);
+}
+
+/// Queues (but does not submit) an SQE to perform a `symlinkat(2)`.
+/// Returns a pointer to the SQE.
+pub fn symlinkat(
+    self: *IoUring,
+    user_data: u64,
+    target: [*:0]const u8,
+    new_dir_fd: linux.fd_t,
+    link_path: [*:0]const u8,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_symlinkat(target, new_dir_fd, link_path);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `symlink(2)`.
+/// Returns a pointer to the SQE.
+pub fn symlink(
+    self: *IoUring,
+    user_data: u64,
+    target: [*:0]const u8,
+    link_path: [*:0]const u8,
+) !*Sqe {
+    return try self.symlinkat(user_data, target, linux.At.fdcwd, link_path);
+}
+
+/// Queues (but does not submit) an SQE to perform a `linkat(2)`.
+/// Returns a pointer to the SQE.
+pub fn linkat(
+    self: *IoUring,
+    user_data: u64,
+    old_dir_fd: linux.fd_t,
+    old_path: [*:0]const u8,
+    new_dir_fd: linux.fd_t,
+    new_path: [*:0]const u8,
+    flags: linux.At,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_linkat(old_dir_fd, old_path, new_dir_fd, new_path, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `link(2)`.
+/// Returns a pointer to the SQE.
+pub fn link(
+    self: *IoUring,
+    user_data: u64,
+    old_path: [*:0]const u8,
+    new_path: [*:0]const u8,
+    flags: linux.At,
+) !*Sqe {
+    return try self.linkat(user_data, linux.At.fdcwd, old_path, linux.At.fdcwd, new_path, flags);
+}
+
+/// Queues (but does not submit) an SQE to send a CQE to an io_uring file
+/// descriptor. The use case for this can be anything from simply waking up
+/// someone waiting on the targeted ring, or it can be used to pass messages
+/// between the two rings.
+/// Returns a pointer to the SQE.
+pub fn msg_ring(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    len: u32,
+    data: u64,
+    flags: uflags.MsgRing,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_msg_ring(fd, len, data, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to send a CQE to an io_uring file
+/// descriptor. See `msg_ring`.
+/// This has an additional `cqe_flags` parameter that allows you to set the
+/// CQE flags field cqe.flags when sending a message.
+/// Returns a pointer to the SQE.
+pub fn msg_ring_cqe_flags(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    len: u32,
+    data: u64,
+    msg_flags: uflags.MsgRing,
+    cqe_flags: Cqe.Flags,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_msg_ring_cqe_flags(
+        fd,
+        len,
+        data,
+        msg_flags,
+        cqe_flags,
+    );
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to send a direct file descriptor to
+/// another ring, installing the direct descriptor `source_fd` at `target_fd`
+/// in the target ring.
+/// Returns a pointer to the SQE.
+pub fn msg_ring_fd(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    source_fd: linux.fd_t,
+    target_fd: linux.fd_t,
+    data: u64,
+    flags: uflags.MsgRing,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_msg_ring_fd(
+        fd,
+        source_fd,
+        target_fd,
+        data,
+        flags,
+    );
+    sqe.user_data = user_data;
+    return sqe;
+}
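+
+// Illustrative sketch: waking another ring with msg_ring, as described
+// above. The target ring observes a CQE whose res is the `len` argument and
+// whose user_data is the `data` argument; no I/O takes place. `.{}` assumes
+// default MsgRing flags.
+fn example_msg_ring_wakeup(ring: *IoUring, target_ring_fd: linux.fd_t) !void {
+    _ = try ring.msg_ring(0xc0, target_ring_fd, 0, 0x77, .{});
+    _ = try ring.submit();
+}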
+
+/// Queues (but does not submit) an SQE to send a direct file descriptor to
+/// another ring. See `msg_ring_fd()`.
+/// `msg_ring_fd_alloc()` is similar to `msg_ring_fd()`, but doesn't specify a
+/// target_fd for the descriptor. Instead, this target_fd is allocated in the
+/// target ring and returned in the CQE res field.
+/// Returns a pointer to the SQE.
+pub fn msg_ring_fd_alloc(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    source_fd: linux.fd_t,
+    data: u64,
+    flags: uflags.MsgRing,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_msg_ring_fd_alloc(
+        fd,
+        source_fd,
+        data,
+        flags,
+    );
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to prepare a request to get an
+/// extended attribute value.
+/// The `from` parameter is used to decide the source to get the extended
+/// attributes from.
+/// Returns a pointer to the SQE.
+pub fn getxattr(
+    self: *IoUring,
+    user_data: u64,
+    name: []const u8,
+    value: []const u8,
+    from: XattrSource,
+    len: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    switch (from) {
+        .path => |path_| sqe.prep_getxattr(name, value, path_, len),
+        .fd => |fd_| sqe.prep_fgetxattr(name, value, fd_, len),
+    }
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to prepare a request to set an
+/// extended attribute value.
+/// The `on` parameter is used to decide the source to set the extended
+/// attributes on.
+/// Returns a pointer to the SQE.
+pub fn setxattr(
+    self: *IoUring,
+    user_data: u64,
+    name: []const u8,
+    value: []const u8,
+    on: XattrSource,
+    flags: linux.SetXattr,
+    len: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    switch (on) {
+        .path => |path_| sqe.prep_setxattr(name, value, path_, flags, len),
+        .fd => |fd_| sqe.prep_fsetxattr(name, value, fd_, flags, len),
+    }
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Prepares a socket creation request.
+/// New socket fd will be returned in completion result.
+/// Available since 5.19.
+pub fn socket(
+    self: *IoUring,
+    user_data: u64,
+    domain: linux.Af,
+    socket_type: linux.Sock,
+    protocol: linux.IpProto,
+    /// flags are currently unused
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_socket(domain, socket_type, protocol, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Prepares a socket creation request for the registered file at index
+/// `file_index`.
+/// Available since 5.19.
+pub fn socket_direct(
+    self: *IoUring,
+    user_data: u64,
+    domain: linux.Af,
+    socket_type: linux.Sock,
+    protocol: linux.IpProto,
+    /// flags are currently unused
+    flags: u32,
+    file_index: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_socket_direct(domain, socket_type, protocol, flags, file_index);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Prepares a socket creation request for a registered file, with the index
+/// chosen by the kernel (file index alloc).
+/// The file index will be returned in the CQE res field.
+/// Available since 5.19.
+pub fn socket_direct_alloc(
+    self: *IoUring,
+    user_data: u64,
+    domain: linux.Af,
+    socket_type: linux.Sock,
+    protocol: linux.IpProto,
+    /// flags are currently unused
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_socket_direct_alloc(domain, socket_type, protocol, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
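+
+// Illustrative sketch: setting SO_REUSEADDR through the 6.7+ socket command
+// op (`cmd_sock`, defined below). The `.SOCKET`/`.REUSEADDR` member names of
+// `linux.Sol` and `linux.So` are assumed here; only `cmd_sock` itself is
+// defined by this file.
+fn example_cmd_sock_reuseaddr(ring: *IoUring, sockfd: linux.fd_t) !void {
+    const enable: u32 = 1;
+    // optval is passed as an integer address, optlen as its byte size.
+    _ = try ring.cmd_sock(0xd0, .setsockopt, sockfd, .SOCKET, .REUSEADDR, @intFromPtr(&enable), @sizeOf(u32));
+    _ = try ring.submit();
+}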
+
+/// Prepares a cmd request for a socket.
+/// See: https://man7.org/linux/man-pages/man3/io_uring_prep_cmd.3.html
+/// Available since 6.7.
+pub fn cmd_sock(
+    self: *IoUring,
+    user_data: u64,
+    cmd_op: SocketOp,
+    fd: linux.fd_t,
+    level: linux.Sol,
+    optname: linux.So,
+    /// pointer to the option value
+    optval: u64,
+    /// size of the option value
+    optlen: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_cmd_sock(cmd_op, fd, level, optname, optval, optlen);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Queues (but does not submit) an SQE to perform a `waitid(2)`.
+/// Returns a pointer to the SQE.
+pub fn waitid(
+    self: *IoUring,
+    user_data: u64,
+    id_type: linux.P,
+    id: i32,
+    infop: *linux.siginfo_t,
+    options: linux.W,
+    /// They are currently unused, and hence 0 should be passed
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_waitid(id_type, id, infop, options, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Available since kernel 6.7.
+pub fn futex_wake(
+    self: *IoUring,
+    user_data: u64,
+    futex: *u32,
+    max_wake_count: u64,
+    mask: linux.Futex2.Bitset,
+    futex_flags: linux.Futex2.Wake,
+    /// They are currently unused, and hence 0 should be passed
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_futex_wake(futex, max_wake_count, mask, futex_flags, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Available since kernel 6.7.
+pub fn futex_wait(
+    self: *IoUring,
+    user_data: u64,
+    futex: *u32,
+    max_wake_count: u64,
+    mask: linux.Futex2.Bitset,
+    futex_flags: linux.Futex2.Wait,
+    /// They are currently unused, and hence 0 should be passed
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_futex_wait(futex, max_wake_count, mask, futex_flags, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+/// Available since kernel 6.7.
+pub fn futex_waitv(
+    self: *IoUring,
+    user_data: u64,
+    futexv: []linux.Futex2.WaitOne,
+    /// They are currently unused, and hence 0 should be passed
+    flags: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_futex_waitv(futexv, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+pub fn fixed_fd_install(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    flags: uflags.FixedFd,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_fixed_fd_install(fd, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+pub fn ftruncate(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    offset: u64,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_ftruncate(fd, offset);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+pub fn cmd_discard(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    offset: u64,
+    nbytes: u64,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_cmd_discard(fd, offset, nbytes);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+pub fn pipe(
+    self: *IoUring,
+    user_data: u64,
+    fds: *[2]linux.fd_t,
+    flags: linux.Pipe2,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_pipe(fds, flags);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+pub fn pipe_direct(
+    self: *IoUring,
+    user_data: u64,
+    fds: *[2]linux.fd_t,
+    flags: linux.Pipe2,
+    file_index: u32,
+) !*Sqe {
+    const sqe = try self.get_sqe();
+    sqe.prep_pipe_direct(fds, flags, file_index);
+    sqe.user_data = user_data;
+    return sqe;
+}
+
+pub fn register_buffers_sparse(self: *IoUring, nr: u32) !void {
+    assert(self.fd >= 0);
+
+    const reg: RsrcRegister = .{
+        .flags = .{ .register_sparse = true },
+        .nr = nr,
+    };
+
+    const res = linux.io_uring_register(self.fd, .register_buffers2, &reg, @sizeOf(RsrcRegister));
+    try handle_registration_result(res);
+}
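+
+// Illustrative sketch: an in-ring futex handshake (kernel 6.7+, see
+// `futex_wait`/`futex_wake` above). The mask and flag values are taken as
+// parameters because their exact `Futex2` shapes are defined elsewhere; note
+// the third argument means "expected value" for wait and "max waiters to
+// wake" for wake.
+fn example_futex_pair(
+    ring: *IoUring,
+    word: *u32,
+    mask: linux.Futex2.Bitset,
+    wait_flags: linux.Futex2.Wait,
+    wake_flags: linux.Futex2.Wake,
+) !void {
+    // Completes once another task wakes `word` (expected value 0).
+    _ = try ring.futex_wait(0xe0, word, 0, mask, wait_flags, 0);
+    // Wake at most one waiter queued on the same word.
+    _ = try ring.futex_wake(0xe1, word, 1, mask, wake_flags, 0);
+    _ = try ring.submit();
+}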
+
+/// Registers an array of buffers for use with `read_fixed`, `readv_fixed`,
+/// `write_fixed` and `writev_fixed`.
+pub fn register_buffers(self: *IoUring, buffers: []const posix.iovec) !void {
+    assert(self.fd >= 0);
+
+    const res = linux.io_uring_register(self.fd, .register_buffers, buffers.ptr, @intCast(buffers.len));
+    try handle_registration_result(res);
+}
+
+/// Unregister the registered buffers.
+pub fn unregister_buffers(self: *IoUring) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .unregister_buffers, null, 0);
+    switch (linux.E.init(res)) {
+        .SUCCESS => {},
+        .NXIO => return error.BuffersNotRegistered,
+        else => |errno| return posix.unexpectedErrno(errno),
+    }
+}
+
+/// Updates registered file descriptors.
+///
+/// Updates are applied starting at the provided offset in the original file
+/// descriptors slice.
+/// There are three kinds of updates:
+/// * turning a sparse entry (where the fd is -1) into a real one
+/// * removing an existing entry (set the fd to -1)
+/// * replacing an existing entry with a new fd
+///
+/// Adding new file descriptors must be done with `register_files`.
+pub fn register_files_update(self: *IoUring, offset: u32, fds: []const linux.fd_t) !void {
+    assert(self.fd >= 0);
+
+    var update = mem.zeroInit(RsrcUpdate, .{
+        .offset = offset,
+        .data = @intFromPtr(fds.ptr),
+    });
+
+    const res = linux.io_uring_register(self.fd, .register_files_update, &update, @intCast(fds.len));
+    try handle_registration_result(res);
+}
+
+/// Registers an empty (-1) file table of `nr_files` number of file descriptors.
+pub fn register_files_sparse(self: *IoUring, nr_files: u32) !void {
+    assert(self.fd >= 0);
+
+    const reg = mem.zeroInit(RsrcRegister, .{
+        .nr = nr_files,
+        .flags = .{ .register_sparse = true },
+    });
+
+    const res = linux.io_uring_register(self.fd, .register_files2, &reg, @sizeOf(RsrcRegister));
+
+    return handle_registration_result(res);
+}
+
+/// Registers an array of file descriptors.
+///
+/// Every time a file descriptor is put in an SQE and submitted to the kernel,
+/// the kernel must retrieve a reference to the file, and once I/O has
+/// completed, the file reference must be dropped. The atomic nature of this
+/// file reference can be a slowdown for high IOPS workloads. This slowdown can
+/// be avoided by pre-registering file descriptors.
+///
+/// To refer to a registered file descriptor, IOSQE_FIXED_FILE must be set in
+/// the SQE's flags, and the SQE's fd must be set to the index of the file
+/// descriptor in the registered array.
+///
+/// Registering file descriptors will wait for the ring to idle and files are
+/// automatically unregistered by the kernel when the ring is torn down.
+///
+/// An application need unregister only if it wants to register a new array of
+/// file descriptors.
+pub fn register_files(self: *IoUring, fds: []const linux.fd_t) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .register_files, fds.ptr, @intCast(fds.len));
+    try handle_registration_result(res);
+}
+
+/// Unregisters all registered file descriptors previously associated with the
+/// ring.
+pub fn unregister_files(self: *IoUring) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .unregister_files, null, 0);
+    switch (linux.E.init(res)) {
+        .SUCCESS => {},
+        .NXIO => return error.FilesNotRegistered,
+        else => |errno| return posix.unexpectedErrno(errno),
+    }
+}
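+
+// Illustrative sketch: pre-registering a file and reading through it by
+// index with the `fixed_file` SQE flag, as described above. Assumes the
+// ring's usual `submit` helper.
+fn example_fixed_file_read(ring: *IoUring, fd: linux.fd_t, buf: []u8) !void {
+    try ring.register_files(&.{fd});
+    const sqe = try ring.get_sqe();
+    // The fd given here is the index into the registered table, not a real
+    // file descriptor: index 0 refers to `fd` registered above.
+    sqe.prep_read(0, buf, 0);
+    sqe.flags.fixed_file = true;
+    sqe.user_data = 0xf0;
+    _ = try ring.submit();
+}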
+
+/// Registers the file descriptor for an eventfd that will be notified of
+/// completion events on an io_uring instance.
+/// Only a single eventfd can be registered at any given point in time.
+pub fn register_eventfd(self: *IoUring, fd: linux.fd_t) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .register_eventfd, &fd, 1);
+    try handle_registration_result(res);
+}
+
+/// Registers the file descriptor for an eventfd that will be notified of
+/// completion events on an io_uring instance. Notifications are only posted
+/// for events that complete in an async manner. This means that events that
+/// complete inline while being submitted do not trigger a notification event.
+/// Only a single eventfd can be registered at any given point in time.
+pub fn register_eventfd_async(self: *IoUring, fd: linux.fd_t) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .register_eventfd_async, &fd, 1);
+    try handle_registration_result(res);
+}
+
+/// Unregister the registered eventfd file descriptor.
+pub fn unregister_eventfd(self: *IoUring) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .unregister_eventfd, null, 0);
+    try handle_registration_result(res);
+}
+
+pub fn register_probe(self: *IoUring, probe: []Probe) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .register_probe, probe.ptr, @intCast(probe.len));
+    try handle_registration_result(res);
+}
+
+/// Matches `io_uring_register_personality()` in liburing.
+/// See https://github.com/axboe/liburing/issues/357 for how to use
+/// personalities.
+pub fn register_personality(self: *IoUring) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .register_personality, null, 0);
+    try handle_registration_result(res);
+}
+
+pub fn unregister_personality(self: *IoUring, credential_id: u32) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .unregister_personality, null, credential_id);
+    try handle_registration_result(res);
+}
+
+pub fn register_restrictions(self: *IoUring, restriction: []Restriction) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .register_restrictions, restriction.ptr, @intCast(restriction.len));
+    try handle_registration_result(res);
+}
+
+pub fn enable_rings(self: *IoUring) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .register_enable_rings, null, 0);
+    try handle_registration_result(res);
+}
+
+pub fn register_iowq_aff(self: *IoUring, cpusz: u32, mask: *linux.cpu_set_t) !void {
+    assert(self.fd >= 0);
+
+    if (cpusz >= math.maxInt(u32)) return error.ArgumentsInvalid;
+
+    const res = linux.io_uring_register(self.fd, .register_iowq_aff, mask, cpusz);
+    try handle_registration_result(res);
+}
+
+pub fn unregister_iowq_aff(self: *IoUring) !void {
+    assert(self.fd >= 0);
+
+    const res = linux.io_uring_register(self.fd, .unregister_iowq_aff, null, 0);
+    try handle_registration_result(res);
+}
+
+/// `max_workers`: `max_workers[0]` should contain the maximum number of
+/// desired bounded workers, and `max_workers[1]` the maximum number of
+/// desired unbounded workers.
+/// If both values are set to 0, the existing values are returned.
+/// Read `io_uring_register_iowq_max_workers(3)` for more info.
+pub fn register_iowq_max_workers(self: *IoUring, max_workers: [2]u32) !void {
+    assert(self.fd >= 0);
+
+    const res = linux.io_uring_register(self.fd, .register_iowq_max_workers, &max_workers, 2);
+    try handle_registration_result(res);
+}
+
+/// See `io_uring_register_sync_cancel(3)`.
+pub fn register_sync_cancel(self: *IoUring, cancel_reg: *SyncCancelRegister) !void {
+    assert(self.fd >= 0);
+
+    const res = linux.io_uring_register(self.fd, .register_sync_cancel, cancel_reg, 1);
+    try handle_registration_result(res);
+}
+
+/// See `io_uring_register_sync_msg(3)`.
+pub fn register_sync_msg(self: *IoUring, sqe: *Sqe) !void {
+    assert(self.fd >= 0);
+
+    const res = linux.io_uring_register(-1, .register_send_msg_ring, sqe, 1);
+    try handle_registration_result(res);
+}
+
+/// Registers a range for fixed file allocations.
+/// Available since 6.0.
+pub fn register_file_alloc_range(self: *IoUring, offset: u32, len: u32) !void {
+    assert(self.fd >= 0);
+
+    const range: FileIndexRange = .{
+        .off = offset,
+        .len = len,
+        .resv = 0,
+    };
+
+    const res = linux.io_uring_register(self.fd, .register_file_alloc_range, &range, 0);
+
+    return handle_registration_result(res);
+}
+
+pub fn register_napi(self: *IoUring, napi: *Napi) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .register_napi, napi, 1);
+    try handle_registration_result(res);
+}
+
+pub fn unregister_napi(self: *IoUring, napi: *Napi) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .unregister_napi, napi, 1);
+    try handle_registration_result(res);
+}
+
+pub fn register_clock(self: *IoUring, clock_reg: *ClockRegister) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .register_clock, clock_reg, 0);
+    try handle_registration_result(res);
+}
+
+pub fn register_ifq(self: *IoUring, ifq_reg: *ZcrxIfqRegister) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .register_zcrx_ifq, ifq_reg, 1);
+    try handle_registration_result(res);
+}
+
+pub fn register_resize_rings(self: *IoUring, _: *Params) !void {
+    assert(self.fd >= 0);
+    return error.Unimplemented;
+}
+
+pub fn register_region(self: *IoUring, mem_reg: *MemRegionRegister) !void {
+    assert(self.fd >= 0);
+    const res = linux.io_uring_register(self.fd, .register_mem_region, mem_reg, 1);
+    try handle_registration_result(res);
+}
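+
+// Illustrative sketch: fetching the operation probe (see `get_probe` below).
+// What to do with the result depends on the `Probe` layout (mirroring
+// `io_uring_probe`), which is defined elsewhere; this only shows the call
+// shape.
+fn example_get_probe(ring: *IoUring) !void {
+    // `Probe` reports the opcodes supported by the running kernel.
+    const probe = try ring.get_probe();
+    _ = probe;
+}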
+
+/// Returns a Probe which is used to probe the capabilities of the
+/// io_uring subsystem of the running kernel. The Probe contains the
+/// list of supported operations.
+pub fn get_probe(self: *IoUring) !Probe {
+    var probe = mem.zeroInit(Probe, .{});
+    const res = linux.io_uring_register(self.fd, .register_probe, &probe, probe.ops.len);
+    try handle_register_buf_ring_result(res);
+    return probe;
+}
+
+fn handle_registration_result(res: usize) !void {
+    switch (linux.E.init(res)) {
+        .SUCCESS => {},
+        // One or more fds in the array are invalid, or the kernel does not
+        // support sparse sets:
+        .BADF => return error.FileDescriptorInvalid,
+        .BUSY => return error.FilesAlreadyRegistered,
+        .INVAL => return error.FilesEmpty,
+        // Adding `nr_args` file references would exceed the maximum allowed
+        // number of files the user is allowed to have according to the
+        // per-user RLIMIT_NOFILE resource limit and the CAP_SYS_RESOURCE
+        // capability is not set, or `nr_args` exceeds the maximum allowed
+        // for a fixed file set (older kernels have a limit of 1024 files vs
+        // 64K files):
+        .MFILE => return error.UserFdQuotaExceeded,
+        // Insufficient kernel resources, or the caller had a non-zero
+        // RLIMIT_MEMLOCK soft resource limit but tried to lock more memory
+        // than the limit permitted (not enforced when the process is
+        // privileged with CAP_IPC_LOCK):
+        .NOMEM => return error.SystemResources,
+        // Attempt to register files on a ring already registering files or
+        // being torn down:
+        .NXIO => return error.RingShuttingDownOrAlreadyRegisteringFiles,
+        else => |errno| return posix.unexpectedErrno(errno),
+    }
+}
+
+/// Prepares a set socket option request for the optname argument, at the
+/// protocol level specified by the level argument.
+/// Available since 6.7.
+pub fn setsockopt(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    level: linux.Sol,
+    optname: linux.So,
+    opt: []const u8,
+) !*Sqe {
+    return try self.cmd_sock(
+        user_data,
+        .setsockopt,
+        fd,
+        level,
+        optname,
+        @intFromPtr(opt.ptr),
+        @intCast(opt.len),
+    );
+}
+
+/// Prepares a get socket option request to retrieve the value for the option
+/// specified by the option_name argument for the socket specified by the fd
+/// argument.
+/// Available since 6.7.
+pub fn getsockopt(
+    self: *IoUring,
+    user_data: u64,
+    fd: linux.fd_t,
+    level: linux.Sol,
+    optname: linux.So,
+    opt: []u8,
+) !*Sqe {
+    return try self.cmd_sock(
+        user_data,
+        .getsockopt,
+        fd,
+        level,
+        optname,
+        @intFromPtr(opt.ptr),
+        @intCast(opt.len),
+    );
+}
+
+/// Registers a shared buffer ring to be used with provided buffers. `entries`
+/// number of `io_uring_buf` structures are memory mapped and shared with the
+/// kernel.
+///
+/// `entries` is the number of entries requested in the buffer ring and must be
+/// a power of 2.
+/// `fd` is the `IoUring.fd` for which the provided buffer ring is being
+/// registered.
+/// `group_id` is the chosen buffer group ID, unique within the `IoUring`.
+/// Matches `io_uring_setup_buf_ring()` in liburing.
+pub fn init_buffer_ring(
+    self: *IoUring,
+    entries: u16,
+    group_id: u16,
+    flags: BufferRegister.Flags,
+) !*align(page_size_min) BufferRing {
+    assert(self.fd >= 0);
+    if (entries == 0 or entries > math.maxInt(u16)) return error.EntriesNotInRange;
+    if (!math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo;
+
+    const mmap_size: usize = entries * @sizeOf(Buffer);
+    const mmap = try posix.mmap(
+        null,
+        mmap_size,
+        posix.PROT.READ | posix.PROT.WRITE,
+        .{ .TYPE = .PRIVATE, .ANONYMOUS = true },
+        -1,
+        0,
+    );
+    errdefer posix.munmap(mmap);
+    assert(mmap.len == mmap_size);
+
+    const buffer_ring: *align(page_size_min) BufferRing = @ptrCast(mmap.ptr);
+    var reg = mem.zeroInit(BufferRegister, .{
+        .ring_addr = @intFromPtr(buffer_ring),
+        .ring_entries = entries,
+        .bgid = group_id,
+        .flags = flags,
+    });
+    try self.register_buffer_ring(&reg);
+    buffer_ring.init();
+    return buffer_ring;
+}
+
+/// Matches `io_uring_register_buf_ring` in liburing.
+pub fn register_buffer_ring(self: *IoUring, buf_reg: *BufferRegister) !void {
+    var res = linux.io_uring_register(self.fd, .register_pbuf_ring, buf_reg, 1);
+    if (linux.E.init(res) == .INVAL and buf_reg.flags.iou_pbuf_ring_inc) {
+        // Retry without incremental buffer consumption. It is available
+        // since kernel 6.12; older kernels return INVAL.
+        buf_reg.flags.iou_pbuf_ring_inc = false;
+        res = linux.io_uring_register(self.fd, .register_pbuf_ring, buf_reg, 1);
+    }
+    try handle_register_buf_ring_result(res);
+}
+
+/// Matches `io_uring_unregister_buf_ring` in liburing.
+pub fn unregister_buffer_ring(self: *IoUring, buf_group_id: u16) !void {
+    var reg = mem.zeroInit(BufferRegister, .{
+        .bgid = buf_group_id,
+    });
+    const res = linux.io_uring_register(self.fd, .unregister_pbuf_ring, &reg, 1);
+    try handle_register_buf_ring_result(res);
+}
+
+fn handle_register_buf_ring_result(res: usize) !void {
+    switch (linux.E.init(res)) {
+        .SUCCESS => {},
+        .INVAL => return error.ArgumentsInvalid,
+        else => |errno| return posix.unexpectedErrno(errno),
+    }
+}
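+
+// Illustrative sketch: registering a provided-buffer ring for buffer group 7
+// and unregistering it again. Publishing individual buffers is omitted since
+// the `BufferRing` helpers are defined elsewhere; `.{}` assumes default
+// register flags.
+fn example_buffer_ring(ring: *IoUring) !void {
+    // 16 entries (a power of two), group ID 7.
+    const br = try ring.init_buffer_ring(16, 7, .{});
+    defer ring.unregister_buffer_ring(7) catch {};
+    _ = br;
+}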
+
+/// IO completion data structure (Completion Queue Entry)
+pub const Cqe = extern struct {
+    /// sqe.user_data value passed back
+    user_data: u64,
+    /// result code for this event
+    res: i32,
+    flags: Flags,
+    // TODO: add support for the IORING_SETUP_CQE32 case
+    /// If the ring is initialized with IORING_SETUP_CQE32, then this field
+    /// contains 16 bytes of padding, doubling the size of the CQE.
+    // big_cqe: ?[2]u64,
+
+    /// cqe.flags
+    pub const Flags = packed struct(u32) {
+        /// IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
+        f_buffer: bool = false,
+        /// IORING_CQE_F_MORE If set, the parent SQE will generate more CQE
+        /// entries
+        f_more: bool = false,
+        /// IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket
+        /// recv
+        f_sock_nonempty: bool = false,
+        /// IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to
+        /// distinguish them from sends.
+        f_notif: bool = false,
+        /// IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion
+        /// will get more completions. In other words, the buffer is being
+        /// partially consumed, and will be used by the kernel for more
+        /// completions. This is only set for buffers used via the incremental
+        /// buffer consumption, as provided by a ring buffer setup with
+        /// IOU_PBUF_RING_INC. For any other provided buffer type, any buffer
+        /// passed back in a completion is automatically returned to the
+        /// application.
+        f_buf_more: bool = false,
+        /// IORING_CQE_F_SKIP If set, then the application/liburing must ignore
+        /// this CQE. Its only purpose is to fill a gap in the ring, if a
+        /// large CQE is attempted posted when the ring has just a single small
+        /// CQE worth of space left before wrapping.
+        f_skip: bool = false,
+        _7: u9 = 0,
+        /// IORING_CQE_F_32 If set, this is a 32b/big-cqe posting. Use with
+        /// rings setup in a mixed CQE mode, where both 16b and 32b CQEs may be
+        /// posted to the CQ ring.
+        f_32: bool = false,
+        _17: u16 = 0,
+    };
+
+    /// Retrieve the 64-bit cqe `user_data` as `*T` after completion of an Sqe.
+    /// This data is passed through `Sqe` -> `Cqe` unchanged.
+    pub fn get_data(cqe: Cqe, comptime T: type) *T {
+        return @ptrFromInt(cqe.user_data);
+    }
+
+    pub fn err(self: Cqe) linux.E {
+        if (self.res > -4096 and self.res < 0) {
+            return @enumFromInt(-self.res);
+        }
+        return .SUCCESS;
+    }
+
+    /// On successful completion of the provided buffers IO request, the CQE
+    /// flags field will have IORING_CQE_F_BUFFER set and the selected buffer
+    /// ID will be indicated by the upper 16 bits of the flags field.
+    pub fn buffer_id(self: Cqe) !u16 {
+        if (!self.flags.f_buffer) {
+            return error.NoBufferSelected;
+        }
+        return @intCast(@as(u32, @bitCast(self.flags)) >> constants.CQE_BUFFER_SHIFT);
+    }
+};
+
+/// IO submission data structure (Submission Queue Entry)
+/// Matches `io_uring_sqe` in liburing.
+pub const Sqe = extern struct {
+    /// type of operation for this sqe
+    opcode: Op,
+    /// IOSQE_* flags
+    flags: IoSqe,
+    /// ioprio for the request
+    ioprio: packed union {
+        send_recv: SendRecv,
+        accept: Accept,
+        const Ioprio = @This();
+
+        pub fn init_empty() Ioprio {
+            return @bitCast(@as(u16, 0));
+        }
+    },
+    /// file descriptor to do IO on
+    fd: i32,
+    /// offset into file
+    off: u64,
+    /// pointer to buffer or iovecs
+    addr: u64,
+    /// buffer size or number of iovecs
+    len: u32,
+    /// flags for any Sqe operation
+    /// rw_flags | fsync_flags | poll_event | poll32_event | sync_range_flags |
+    /// msg_flags | timeout_flags | accept_flags | cancel_flags | open_flags |
+    /// statx_flags | fadvise_advice | splice_flags | rename_flags |
+    /// unlink_flags | hardlink_flags | xattr_flags | msg_ring_flags |
+    /// uring_cmd_flags | waitid_flags | futex_flags | install_fd_flags |
+    /// nop_flags | pipe_flags
+    rw_flags: u32,
+    /// data to be passed back at completion time
+    user_data: u64,
+    /// index into fixed buffers or for grouped buffer selection
+    buf_index: u16,
+    personality: u16,
+    splice_fd_in: i32,
+    addr3: u64,
+    resv: u64,
+
+    /// sqe.flags
+    pub const IoSqe = packed struct(u8) {
+        /// use fixed fileset
+        fixed_file: bool = false,
+        /// issue after inflight IO
+        io_drain: bool = false,
+        /// links next sqe
+        io_link: bool = false,
+        /// like LINK, but stronger
+        io_hardlink: bool = false,
+        /// always go async
+        async: bool = false,
+        /// select buffer from sqe->buf_group
+        buffer_select: bool = false,
+        /// don't post CQE if request succeeded
+        cqe_skip_success: bool = false,
+        _: u1 = 0,
+    };
+
+    /// send/sendmsg and recv/recvmsg flags (sqe.ioprio)
+    pub const SendRecv = packed struct(u16) {
+        /// IORING_RECVSEND_POLL_FIRST
+        /// If set, instead of first attempting to send or receive and arm poll
+        /// if that yields an -EAGAIN result, arm poll upfront and skip the
+        /// initial transfer attempt.
+        recvsend_poll_first: bool = false,
+        /// IORING_RECV_MULTISHOT
+        /// Multishot recv. Sets IORING_CQE_F_MORE if the handler will continue
+        /// to report CQEs on behalf of the same SQE.
+        recv_multishot: bool = false,
+        /// IORING_RECVSEND_FIXED_BUF
+        /// Use registered buffers, the index is stored in the buf_index field.
+        recvsend_fixed_buf: bool = false,
+        /// IORING_SEND_ZC_REPORT_USAGE
+        /// If set, SEND[MSG]_ZC should report the zerocopy usage in cqe.res
+        /// for the IORING_CQE_F_NOTIF cqe. 0 is reported if zerocopy was
+        /// actually possible. IORING_NOTIF_USAGE_ZC_COPIED if data was copied
+        /// (at least partially).
+        send_zc_report_usage: bool = false,
+        /// IORING_RECVSEND_BUNDLE
+        /// Used with IOSQE_BUFFER_SELECT. If set, send or recv will grab as
+        /// many buffers from the buffer group ID given and send them all.
+        /// The completion result will be the number of buffers sent, with the
+        /// starting buffer ID in cqe.flags as per usual for provided buffer
+        /// usage. The buffers will be contiguous from the starting buffer ID.
+        recvsend_bundle: bool = false,
+        /// IORING_SEND_VECTORIZED
+        /// If set, SEND[_ZC] will take a pointer to an iovec to allow
+        /// vectorized send operations.
+        send_vectorized: bool = false,
+        _: u10 = 0,
+    };
+
+    /// accept flags stored in sqe.ioprio
+    pub const Accept = packed struct(u16) {
+        multishot: bool = false,
+        dontwait: bool = false,
+        poll_first: bool = false,
+        _: u13 = 0,
+    };
+
+    pub fn prep_nop(sqe: *Sqe) void {
+        sqe.* = .{
+            .opcode = .nop,
+            .flags = .{},
+            .ioprio = .init_empty(),
+            .fd = 0,
+            .off = 0,
+            .addr = 0,
+            .len = 0,
+            .rw_flags = 0,
+            .user_data = 0,
+            .buf_index = 0,
+            .personality = 0,
+            .splice_fd_in = 0,
+            .addr3 = 0,
+            .resv = 0,
+        };
+    }
+
+    pub fn prep_fsync(sqe: *Sqe, fd: linux.fd_t, flags: uflags.Fsync) void {
+        sqe.* = .{
+            .opcode = .fsync,
+            .flags = .{},
+            .ioprio = .init_empty(),
+            .fd = fd,
+            .off = 0,
+            .addr = 0,
+            .len = 0,
+            .rw_flags = @bitCast(flags),
+            .user_data = 0,
+            .buf_index = 0,
+            .personality = 0,
+            .splice_fd_in = 0,
+            .addr3 = 0,
+            .resv = 0,
+        };
+    }
+
+    pub fn prep_rw(
+        sqe: *Sqe,
+        op: Op,
+        fd: linux.fd_t,
+        addr: u64,
+        len: usize,
+        offset: u64,
+    ) void {
+        sqe.* = .{
+            .opcode = op,
+            .flags = .{},
+            .ioprio = .init_empty(),
+            .fd = fd,
+            .off = offset,
+            .addr = addr,
+            .len = @intCast(len),
+            .rw_flags = 0,
+            .user_data = 0,
+            .buf_index = 0,
+            .personality = 0,
+            .splice_fd_in = 0,
+            .addr3 = 0,
+            .resv = 0,
+        };
+    }
+
+    pub fn prep_write(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, offset: u64) void {
+        sqe.prep_rw(.write, fd, @intFromPtr(buffer.ptr), buffer.len, offset);
+    }
+
+    pub fn prep_writev(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        iovecs: []const posix.iovec_const,
+        offset: u64,
+    ) void {
+        sqe.prep_rw(.writev, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset);
+    }
+
+    pub fn prep_write_fixed(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        buffer: []const u8,
+        offset: u64,
+        buffer_index: u16,
+    ) void {
+        sqe.prep_rw(.write_fixed, fd, @intFromPtr(buffer.ptr), buffer.len, offset);
+        sqe.buf_index = buffer_index;
+    }
+
+    pub fn prep_writev_fixed(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        iovecs: []const posix.iovec_const,
+        offset: u64,
+        buffer_index: u16,
+    ) void {
+        sqe.prep_rw(.writev_fixed, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset);
+        sqe.buf_index = buffer_index;
+    }
+
+    pub fn prep_splice(
+        sqe: *Sqe,
+        fd_in: linux.fd_t,
+        off_in: u64,
+        fd_out: linux.fd_t,
+        off_out: u64,
+        len: u32,
+        flags: uflags.Splice,
+    ) void {
+        sqe.prep_rw(.splice, fd_out, undefined, len, off_out);
+        sqe.addr = off_in;
+        sqe.splice_fd_in = fd_in;
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_tee(
+        sqe: *Sqe,
+        fd_in: linux.fd_t,
+        fd_out: linux.fd_t,
+        len: u32,
+        flags: uflags.Splice,
+    ) void {
+        sqe.prep_rw(.tee, fd_out, undefined, len, 0);
+        sqe.addr = undefined;
+        sqe.splice_fd_in = fd_in;
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_read(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, offset: u64) void {
+        sqe.prep_rw(.read, fd, @intFromPtr(buffer.ptr), buffer.len, offset);
+    }
+
+    pub fn prep_readv(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        iovecs: []const posix.iovec,
+        offset: u64,
+    ) void {
+        sqe.prep_rw(.readv, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset);
+    }
+
+    pub fn prep_read_fixed(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        buffer: []u8,
+        offset: u64,
+        buffer_index: u16,
+    ) void {
+        sqe.prep_rw(.read_fixed, fd, @intFromPtr(buffer.ptr), buffer.len, offset);
+        sqe.buf_index = buffer_index;
+    }
+
+    pub fn prep_readv_fixed(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        iovecs: []const posix.iovec,
+        offset: u64,
+        buffer_index: u16,
+    ) void {
+        sqe.prep_rw(.readv_fixed, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset);
+        sqe.buf_index = buffer_index;
+    }
+
+    pub fn prep_accept(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        addr: ?*linux.sockaddr,
+        addrlen: ?*linux.socklen_t,
+        flags: linux.Sock,
+    ) void {
+        // `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to
+        // `socklen_t`.
+        // `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only
+        // a u32).
+        sqe.prep_rw(.accept, fd, @intFromPtr(addr), 0, @intFromPtr(addrlen));
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    /// accept directly into the fixed file table
+    pub fn prep_accept_direct(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        addr: ?*linux.sockaddr,
+        addrlen: ?*linux.socklen_t,
+        flags: linux.Sock,
+        file_index: u32,
+    ) void {
+        prep_accept(sqe, fd, addr, addrlen, flags);
+        set_target_fixed_file(sqe, file_index);
+    }
+
+    pub fn prep_multishot_accept(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        addr: ?*linux.sockaddr,
+        addrlen: ?*linux.socklen_t,
+        flags: linux.Sock,
+    ) void {
+        prep_accept(sqe, fd, addr, addrlen, flags);
+        sqe.ioprio = .{ .accept = .{ .multishot = true } };
+    }
+
+    /// multishot accept directly into the fixed file table
+    pub fn prep_multishot_accept_direct(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        addr: ?*linux.sockaddr,
+        addrlen: ?*linux.socklen_t,
+        flags: linux.Sock,
+    ) void {
+        prep_multishot_accept(sqe, fd, addr, addrlen, flags);
+        set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC);
+    }
+
+    fn set_target_fixed_file(sqe: *Sqe, file_index: u32) void {
+        const sqe_file_index: u32 = if (file_index == constants.FILE_INDEX_ALLOC)
+            constants.FILE_INDEX_ALLOC
+        else
+            // 0 means no fixed files, indexes should be encoded as "index + 1"
+            file_index + 1;
+        // This field is overloaded in liburing:
+        // splice_fd_in: i32
+        // sqe_file_index: u32
+        sqe.splice_fd_in = @bitCast(sqe_file_index);
+    }
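+
+    // Illustrative sketch: composing a multishot accept by hand. A single
+    // such SQE keeps posting CQEs (with `f_more` set) for every incoming
+    // connection until it is cancelled or fails. `.{}` assumes default
+    // accept flags.
+    fn example_multishot_accept(sqe: *Sqe, listen_fd: linux.fd_t) void {
+        sqe.prep_multishot_accept(listen_fd, null, null, .{});
+        sqe.user_data = 0xac;
+    }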
+
+    pub fn prep_connect(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        addr: *const linux.sockaddr,
+        addrlen: linux.socklen_t,
+    ) void {
+        // `addrlen` maps to `sqe.off` (u64) instead of `sqe.len` (which is
+        // only a u32).
+        sqe.prep_rw(.connect, fd, @intFromPtr(addr), 0, addrlen);
+    }
+
+    pub fn prep_epoll_wait(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        event: ?*linux.epoll_event,
+        max_events: u32,
+        flags: linux.Epoll,
+    ) void {
+        sqe.prep_rw(.epoll_wait, fd, @intFromPtr(event), max_events, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_epoll_ctl(
+        sqe: *Sqe,
+        epfd: linux.fd_t,
+        fd: linux.fd_t,
+        op: linux.EpollOp,
+        ev: ?*linux.epoll_event,
+    ) void {
+        sqe.prep_rw(.epoll_ctl, epfd, @intFromPtr(ev), @intFromEnum(op), @intCast(fd));
+    }
+
+    pub fn prep_recv(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, flags: linux.Msg) void {
+        sqe.prep_rw(.recv, fd, @intFromPtr(buffer.ptr), buffer.len, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_recv_multishot(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        buffer: []u8,
+        flags: linux.Msg,
+    ) void {
+        sqe.prep_recv(fd, buffer, flags);
+        sqe.ioprio = .{ .send_recv = .{ .recv_multishot = true } };
+    }
+
+    pub fn prep_recvmsg(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        msg: *linux.msghdr,
+        flags: linux.Msg,
+    ) void {
+        sqe.prep_rw(.recvmsg, fd, @intFromPtr(msg), 1, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_recvmsg_multishot(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        msg: *linux.msghdr,
+        flags: linux.Msg,
+    ) void {
+        sqe.prep_recvmsg(fd, msg, flags);
+        sqe.ioprio = .{ .send_recv = .{ .recv_multishot = true } };
+    }
+
+    pub fn prep_send(sqe: *Sqe, sockfd: linux.fd_t, buffer: []const u8, flags: linux.Msg) void {
+        sqe.prep_rw(.send, sockfd, @intFromPtr(buffer.ptr), buffer.len, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_send_bundle(sqe: *Sqe, sockfd: linux.fd_t, len: u64, flags: linux.Msg) void {
+        sqe.prep_rw(.send, sockfd, undefined, len, 0);
+        sqe.rw_flags = @bitCast(flags);
+        sqe.ioprio = .{ .send_recv = .{ .recvsend_bundle = true } };
+    }
+
+    pub fn prep_send_to(
+        sqe: *Sqe,
+        sockfd: linux.fd_t,
+        buffer: []const u8,
+        flags: linux.Msg,
+        addr: *const linux.sockaddr,
+        addrlen: linux.socklen_t,
+    ) void {
+        // addr2 maps to sqe.off and addr_len maps to sqe.splice_fd_in
+        sqe.prep_send(sockfd, buffer, flags);
+        sqe.off = @intFromPtr(addr);
+        sqe.splice_fd_in = @intCast(addrlen);
+    }
+
+    pub fn prep_send_zc(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.Msg, zc_flags: Sqe.SendRecv) void {
+        sqe.prep_rw(.send_zc, fd, @intFromPtr(buffer.ptr), buffer.len, 0);
+        sqe.rw_flags = @bitCast(flags);
+        sqe.ioprio = .{ .send_recv = zc_flags };
+    }
+
+    pub fn prep_send_zc_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.Msg, zc_flags: Sqe.SendRecv, buf_index: u16) void {
+        const zc_flags_fixed = if (zc_flags.recvsend_fixed_buf) zc_flags else blk: {
+            var updated_flags = zc_flags;
+            updated_flags.recvsend_fixed_buf = true;
+            break :blk updated_flags;
+        };
+        sqe.prep_send_zc(fd, buffer, flags, zc_flags_fixed);
+        sqe.buf_index = buf_index;
+    }
+
+    pub fn prep_sendmsg(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        msg: *const linux.msghdr_const,
+        flags: linux.Msg,
+    ) void {
+        sqe.prep_rw(.sendmsg, fd, @intFromPtr(msg), 1, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_sendmsg_zc(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        msg: *const linux.msghdr_const,
+        flags: linux.Msg,
+    ) void {
+        sqe.prep_sendmsg(fd, msg, flags);
+        sqe.opcode = .sendmsg_zc;
+    }
+
+    pub fn prep_sendmsg_zc_fixed(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        msg: *const linux.msghdr_const,
+        flags: linux.Msg,
+        buf_index: u16,
+    ) void {
+        sqe.prep_sendmsg_zc(fd, msg, flags);
+        sqe.ioprio = .{ .send_recv = .{ .recvsend_fixed_buf = true } };
+        sqe.buf_index = buf_index;
+    }
+
+    pub fn prep_openat(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        path: [*:0]const u8,
+        flags: linux.O,
+        mode: linux.mode_t,
+    ) void {
+        sqe.prep_rw(.openat, fd, @intFromPtr(path), mode, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_openat_direct(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        path: [*:0]const u8,
+        flags: linux.O,
+        mode: linux.mode_t,
+        file_index: u32,
+    ) void {
+        prep_openat(sqe, fd, path, flags, mode);
+        set_target_fixed_file(sqe, file_index);
+    }
+
+    pub fn prep_close(sqe: *Sqe, fd: linux.fd_t) void {
+        sqe.* = .{
+            .opcode = .close,
+            .flags = .{},
+            .ioprio = .init_empty(),
+            .fd = fd,
+            .off = 0,
+            .addr = 0,
+            .len = 0,
+            .rw_flags = 0,
+            .user_data = 0,
+            .buf_index = 0,
+            .personality = 0,
+            .splice_fd_in = 0,
+            .addr3 = 0,
+            .resv = 0,
+        };
+    }
+
+    pub fn prep_close_direct(sqe: *Sqe, file_index: u32) void {
+        prep_close(sqe, 0);
+        set_target_fixed_file(sqe, file_index);
+    }
+
+    pub fn prep_timeout(
+        sqe: *Sqe,
+        ts: *const linux.kernel_timespec,
+        count: u32,
+        flags: uflags.Timeout,
+    ) void {
+        sqe.prep_rw(.timeout, -1, @intFromPtr(ts), 1, count);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_timeout_remove(sqe: *Sqe, timeout_user_data: u64, flags: uflags.Timeout) void {
+        sqe.prep_rw(.timeout_remove, -1, timeout_user_data, 0, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_timeout_update(sqe: *Sqe, timeout_user_data: u64, ts: *const linux.kernel_timespec, flags: uflags.Timeout) void {
+        sqe.prep_rw(.timeout_remove, -1, timeout_user_data, 0, @intFromPtr(ts));
+        const enable_timeout_update = if (flags.timeout_update) flags else blk: {
+            var tflags = flags;
+            tflags.timeout_update = true;
+            break :blk tflags;
+        };
+        sqe.rw_flags = @bitCast(enable_timeout_update);
+    }
+
+    pub fn prep_link_timeout(
+        sqe: *Sqe,
+        ts: *const linux.kernel_timespec,
+        flags: uflags.Timeout,
+    ) void {
+        sqe.prep_rw(.link_timeout, -1, @intFromPtr(ts), 1, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_poll_add(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        poll_mask: linux.Epoll,
+    ) void {
+        sqe.prep_rw(.poll_add, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0);
+        // Poll masks previously comprised 16 bits in the flags union of an
+        // SQE, but were then extended to 32 bits in order to make room for
+        // additional option flags. To ensure that the correct bits of poll
+        // masks are consistently and properly read across multiple kernel
+        // versions, poll masks are enforced to be little-endian.
+        // https://www.spinics.net/lists/io-uring/msg02848.html
+        sqe.rw_flags = std.mem.nativeToLittle(u32, @bitCast(poll_mask));
+    }
+
+    pub fn prep_poll_remove(
+        sqe: *Sqe,
+        target_user_data: u64,
+    ) void {
+        sqe.prep_rw(.poll_remove, -1, target_user_data, 0, 0);
+    }
+
+    pub fn prep_poll_update(
+        sqe: *Sqe,
+        old_user_data: u64,
+        new_user_data: u64,
+        poll_mask: linux.Epoll,
+        flags: uflags.Poll,
+    ) void {
+        sqe.prep_rw(.poll_remove, -1, old_user_data, @as(u32, @bitCast(flags)), new_user_data);
+        // Poll masks previously comprised 16 bits in the flags union of an
+        // SQE, but were then extended to 32 bits in order to make room for
+        // additional option flags. To ensure that the correct bits of poll
+        // masks are consistently and properly read across multiple kernel
+        // versions, poll masks are enforced to be little-endian.
+        // https://www.spinics.net/lists/io-uring/msg02848.html
+        sqe.rw_flags = std.mem.nativeToLittle(u32, @bitCast(poll_mask));
+    }
+
+    pub fn prep_fallocate(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        mode: i32,
+        offset: u64,
+        len: u64,
+    ) void {
+        sqe.* = .{
+            .opcode = .fallocate,
+            .flags = .{},
+            .ioprio = .init_empty(),
+            .fd = fd,
+            .off = offset,
+            .addr = len,
+            .len = @intCast(mode),
+            .rw_flags = 0,
+            .user_data = 0,
+            .buf_index = 0,
+            .personality = 0,
+            .splice_fd_in = 0,
+            .addr3 = 0,
+            .resv = 0,
+        };
+    }
+
+    pub fn prep_statx(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        path: [*:0]const u8,
+        flags: linux.At,
+        mask: linux.Statx.Mask,
+        buf: *linux.Statx,
+    ) void {
+        sqe.prep_rw(.statx, fd, @intFromPtr(path), @as(u32, @bitCast(mask)), @intFromPtr(buf));
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_fadvice(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        offset: u64,
+        len: u32,
+        advice: linux.Fadvise,
+    ) void {
+        sqe.prep_rw(.fadvise, fd, undefined, len, offset);
+        sqe.rw_flags = @intFromEnum(advice);
+    }
+
+    pub fn prep_madvice(
+        sqe: *Sqe,
+        memory: []u8,
+        advice: linux.Madvise,
+    ) void {
+        sqe.prep_rw(.madvise, -1, @intFromPtr(memory.ptr), memory.len, 0);
+        sqe.rw_flags = @intFromEnum(advice);
+    }
+
+    pub fn prep_cancel(
+        sqe: *Sqe,
+        cancel_user_data: u64,
+        flags: uflags.AsyncCancel,
+    ) void {
+        sqe.prep_rw(.async_cancel, -1, cancel_user_data, 0, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_cancel_fd(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        flags: uflags.AsyncCancel,
+    ) void {
+        sqe.prep_rw(.async_cancel, fd, undefined, 0, 0);
+        const enable_cancel_fd = if (flags.cancel_fd) flags else blk: {
+            var cancel_flags = flags;
+            cancel_flags.cancel_fd = true;
+            break :blk cancel_flags;
+        };
+        sqe.rw_flags = @bitCast(enable_cancel_fd);
+    }
+
+    pub fn prep_shutdown(
+        sqe: *Sqe,
+        sockfd: linux.socket_t,
+        how: linux.Shut,
+    ) void {
+        sqe.prep_rw(.shutdown, sockfd, 0, @intFromEnum(how), 0);
+    }
+
+    pub fn prep_renameat(
+        sqe: *Sqe,
+        old_dir_fd: linux.fd_t,
+        old_path: [*:0]const u8,
+        new_dir_fd: linux.fd_t,
+        new_path: [*:0]const u8,
+        flags: linux.Rename,
+    ) void {
+        sqe.prep_rw(
+            .renameat,
+            old_dir_fd,
+            @intFromPtr(old_path),
+            0,
+            @intFromPtr(new_path),
+        );
+        sqe.len = @bitCast(new_dir_fd);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_unlinkat(
+        sqe: *Sqe,
+        dir_fd: linux.fd_t,
+        path: [*:0]const u8,
+        flags: linux.At,
+    ) void {
+        sqe.prep_rw(.unlinkat, dir_fd, @intFromPtr(path), 0, 0);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_sync_file_range(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        len: u32,
+        offset: u64,
+        flags: linux.SyncFileRange,
+    ) void {
+        sqe.prep_rw(.sync_file_range, fd, undefined, len, offset);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_mkdirat(
+        sqe: *Sqe,
+        dir_fd: linux.fd_t,
+        path: [*:0]const u8,
+        mode: linux.mode_t,
+    ) void {
+        sqe.prep_rw(.mkdirat, dir_fd, @intFromPtr(path), mode, 0);
+    }
+
+    pub fn prep_symlinkat(
+        sqe: *Sqe,
+        target: [*:0]const u8,
+        new_dir_fd: linux.fd_t,
+        link_path: [*:0]const u8,
+    ) void {
+        sqe.prep_rw(
+            .symlinkat,
+            new_dir_fd,
+            @intFromPtr(target),
+            0,
+            @intFromPtr(link_path),
+        );
+    }
+
+    pub fn prep_linkat(
+        sqe: *Sqe,
+        old_dir_fd: linux.fd_t,
+        old_path: [*:0]const u8,
+        new_dir_fd: linux.fd_t,
+        new_path: [*:0]const u8,
+        flags: linux.At,
+    ) void {
+        sqe.prep_rw(
+            .linkat,
+            old_dir_fd,
+            @intFromPtr(old_path),
+            0,
+            @intFromPtr(new_path),
+        );
+        sqe.len = @bitCast(new_dir_fd);
+        sqe.rw_flags = @bitCast(flags);
+    }
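+
+    // Illustrative sketch: bounding a recv with a linked timeout. Setting
+    // `io_link` on the first SQE chains the `link_timeout` to it; whichever
+    // completes first cancels the other. `.{}` assumes default Msg and
+    // Timeout flags.
+    fn example_recv_with_timeout(
+        op: *Sqe,
+        timeout: *Sqe,
+        fd: linux.fd_t,
+        buffer: []u8,
+        ts: *const linux.kernel_timespec,
+    ) void {
+        op.prep_recv(fd, buffer, .{});
+        op.flags.io_link = true;
+        timeout.prep_link_timeout(ts, .{});
+    }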
data: u64,
+        flags: uflags.MsgRing,
+    ) void {
+        sqe.prep_rw(
+            .msg_ring,
+            fd,
+            undefined,
+            len,
+            data,
+        );
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_msg_ring_cqe_flags(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        len: u32,
+        data: u64,
+        msg_flags: uflags.MsgRing,
+        cqe_flags: Cqe.Flags,
+    ) void {
+        const enable_flags_pass = blk: {
+            var flags = msg_flags;
+            flags.flags_pass = true;
+            break :blk flags;
+        };
+        sqe.prep_msg_ring(fd, len, data, enable_flags_pass);
+        // sqe.file_index in liburing maps to splice_fd_in in the Zig Sqe
+        sqe.splice_fd_in = @intCast(@as(u32, @bitCast(cqe_flags)));
+    }
+
+    pub fn prep_msg_ring_fd(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        source_fd: linux.fd_t,
+        target_fd: linux.fd_t,
+        data: u64,
+        flags: uflags.MsgRing,
+    ) void {
+        sqe.prep_rw(
+            .msg_ring,
+            fd,
+            @intFromEnum(MsgRingCmd.send_fd),
+            0,
+            data,
+        );
+        sqe.addr3 = @intCast(source_fd);
+        sqe.rw_flags = @bitCast(flags);
+        sqe.set_target_fixed_file(@intCast(target_fd));
+    }
+
+    pub fn prep_msg_ring_fd_alloc(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        source_fd: linux.fd_t,
+        data: u64,
+        flags: uflags.MsgRing,
+    ) void {
+        sqe.prep_rw(
+            .msg_ring,
+            fd,
+            @intFromEnum(MsgRingCmd.send_fd),
+            0,
+            data,
+        );
+        sqe.addr3 = @intCast(source_fd);
+        sqe.rw_flags = @bitCast(flags);
+        sqe.set_target_fixed_file(constants.FILE_INDEX_ALLOC);
+    }
+
+    pub fn prep_getxattr(
+        sqe: *Sqe,
+        name: []const u8,
+        value: []const u8,
+        path: []const u8,
+        len: u32,
+    ) void {
+        sqe.prep_rw(
+            .getxattr,
+            0,
+            @intFromPtr(name.ptr),
+            len,
+            @intFromPtr(value.ptr),
+        );
+        sqe.addr3 = @intFromPtr(path.ptr);
+    }
+
+    pub fn prep_fgetxattr(
+        sqe: *Sqe,
+        name: []const u8,
+        value: []const u8,
+        fd: linux.fd_t,
+        len: u32,
+    ) void {
+        sqe.prep_rw(
+            .fgetxattr,
+            fd,
+            @intFromPtr(name.ptr),
+            len,
+            @intFromPtr(value.ptr),
+        );
+    }
+
+    pub fn prep_setxattr(
+        sqe: *Sqe,
+        name: []const u8,
+        value: []const u8,
+        path: []const u8,
+        flags: linux.SetXattr,
+        len: u32,
+    ) void {
+        sqe.prep_rw(
+            .setxattr,
+            0,
+            @intFromPtr(name.ptr),
+            len,
+            @intFromPtr(value.ptr),
+        );
+        sqe.addr3 = @intFromPtr(path.ptr);
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_fsetxattr(
+        sqe: *Sqe,
+        name: []const u8,
+        value: []const u8,
+        fd: linux.fd_t,
+        flags: linux.SetXattr,
+        len: u32,
+    ) void {
+        sqe.prep_rw(
+            .fsetxattr,
+            fd,
+            @intFromPtr(name.ptr),
+            len,
+            @intFromPtr(value.ptr),
+        );
+        sqe.rw_flags = @bitCast(flags);
+    }
+
+    pub fn prep_files_update(
+        sqe: *Sqe,
+        fds: []const linux.fd_t,
+        offset: u32,
+    ) void {
+        sqe.prep_rw(.files_update, -1, @intFromPtr(fds.ptr), fds.len, @intCast(offset));
+    }
+
+    pub fn prep_files_update_alloc(
+        sqe: *Sqe,
+        fds: []linux.fd_t,
+    ) void {
+        sqe.prep_rw(.files_update, -1, @intFromPtr(fds.ptr), fds.len, constants.FILE_INDEX_ALLOC);
+    }
+
+    pub fn prep_provide_buffers(
+        sqe: *Sqe,
+        buffers: []u8,
+        buffer_len: u32,
+        buffers_count: u32,
+        group_id: u32,
+        buffer_id: u32,
+    ) void {
+        assert(buffers.len == buffer_len * buffers_count);
+        sqe.prep_rw(
+            .provide_buffers,
+            @intCast(buffers_count),
+            @intFromPtr(buffers.ptr),
+            buffer_len,
+            buffer_id,
+        );
+        sqe.buf_index = @intCast(group_id);
+    }
+
+    pub fn prep_remove_buffers(
+        sqe: *Sqe,
+        num: u32,
+        group_id: u32,
+    ) void {
+        sqe.prep_rw(.remove_buffers, @intCast(num), 0, 0, 0);
+        sqe.buf_index = @intCast(group_id);
+    }
+
+    pub fn prep_socket(
+        sqe: *Sqe,
+        domain: linux.Af,
+        socket_type: linux.Sock,
+        protocol: linux.IpProto,
+        /// Currently unused; pass 0.
+        flags: u32,
+    ) void {
+        sqe.prep_rw(.socket, 
@intFromEnum(domain), 0, @intFromEnum(protocol), @as(u32, @bitCast(socket_type)));
+        sqe.rw_flags = flags;
+    }
+
+    pub fn prep_socket_direct(
+        sqe: *Sqe,
+        domain: linux.Af,
+        socket_type: linux.Sock,
+        protocol: linux.IpProto,
+        /// Currently unused; pass 0.
+        flags: u32,
+        file_index: u32,
+    ) void {
+        prep_socket(sqe, domain, socket_type, protocol, flags);
+        set_target_fixed_file(sqe, file_index);
+    }
+
+    pub fn prep_socket_direct_alloc(
+        sqe: *Sqe,
+        domain: linux.Af,
+        socket_type: linux.Sock,
+        protocol: linux.IpProto,
+        /// Currently unused; pass 0.
+        flags: u32,
+    ) void {
+        prep_socket(sqe, domain, socket_type, protocol, flags);
+        set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC);
+    }
+
+    pub fn prep_waitid(
+        sqe: *Sqe,
+        id_type: linux.P,
+        id: i32,
+        infop: *linux.siginfo_t,
+        options: linux.W,
+        /// Currently unused; pass 0.
+        flags: u32,
+    ) void {
+        sqe.prep_rw(.waitid, id, 0, @intFromEnum(id_type), @intFromPtr(infop));
+        sqe.rw_flags = flags;
+        sqe.splice_fd_in = @bitCast(options);
+    }
+
+    pub fn prep_futex_wake(
+        sqe: *Sqe,
+        futex: *u32,
+        max_wake_count: u64,
+        mask: linux.Futex2.Bitset,
+        futex_flags: linux.Futex2.Wake,
+        /// Currently unused; pass 0.
+        flags: u32,
+    ) void {
+        sqe.prep_rw(
+            .futex_wake,
+            @intCast(@as(u32, @bitCast(futex_flags))),
+            @intFromPtr(futex),
+            0,
+            max_wake_count,
+        );
+        sqe.rw_flags = flags;
+        sqe.addr3 = @intFromEnum(mask);
+    }
+
+    pub fn prep_futex_wait(
+        sqe: *Sqe,
+        futex: *u32,
+        max_wake_count: u64,
+        mask: linux.Futex2.Bitset,
+        futex_flags: linux.Futex2.Wait,
+        /// Currently unused; pass 0.
+        flags: u32,
+    ) void {
+        sqe.prep_rw(
+            .futex_wait,
+            @intCast(@as(u32, @bitCast(futex_flags))),
+            @intFromPtr(futex),
+            0,
+            max_wake_count,
+        );
+        sqe.rw_flags = flags;
+        sqe.addr3 = @intFromEnum(mask);
+    }
+
+    pub fn prep_futex_waitv(
+        sqe: *Sqe,
+        futexv: []linux.Futex2.WaitOne,
+        /// Currently unused; pass 0.
+        flags: u32,
+    ) void {
+        sqe.prep_rw(
+            .futex_waitv,
+            0,
+            @intFromPtr(futexv.ptr),
+            futexv.len,
+            0,
+        );
+        sqe.rw_flags = flags;
+    }
+
+    pub fn prep_fixed_fd_install(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        flags: uflags.FixedFd,
+    ) void {
+        sqe.prep_rw(
+            .fixed_fd_install,
+            fd,
+            undefined,
+            0,
+            0,
+        );
+        sqe.flags = .{ .fixed_file = true };
+        sqe.rw_flags = @bitCast(flags);
    }
-}
-/// Unregisters all registered file descriptors previously associated with the ring.
-pub fn unregister_files(self: *IoUring) !void {
-    assert(self.fd >= 0);
-    const res = linux.io_uring_register(self.fd, .UNREGISTER_FILES, null, 0);
-    switch (linux.E.init(res)) {
-        .SUCCESS => {},
-        .NXIO => return error.FilesNotRegistered,
-        else => |errno| return posix.unexpectedErrno(errno),
+    pub fn prep_ftruncate(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        offset: u64,
+    ) void {
+        sqe.prep_rw(
+            .ftruncate,
+            fd,
+            undefined,
+            0,
+            offset,
+        );
    }
-}
-/// Prepares a socket creation request.
-/// New socket fd will be returned in completion result. 
-/// Available since 5.19
-pub fn socket(
-    self: *IoUring,
-    user_data: u64,
-    domain: u32,
-    socket_type: u32,
-    protocol: u32,
-    flags: u32,
-) !*linux.io_uring_sqe {
-    const sqe = try self.get_sqe();
-    sqe.prep_socket(domain, socket_type, protocol, flags);
-    sqe.user_data = user_data;
-    return sqe;
-}
+    pub fn prep_cmd_discard(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        offset: u64,
+        nbytes: u64,
+    ) void {
+        sqe.prep_rw(
+            .uring_cmd,
+            fd,
+            undefined,
+            0,
+            0,
+        );
+        // sqe.off maps to sqe.cmd_op in liburing
+        sqe.off = constants.BLOCK_URING_CMD_DISCARD;
+        sqe.addr = offset;
+        sqe.addr3 = nbytes;
+    }
-/// Prepares a socket creation request for registered file at index `file_index`.
-/// Available since 5.19
-pub fn socket_direct(
-    self: *IoUring,
-    user_data: u64,
-    domain: u32,
-    socket_type: u32,
-    protocol: u32,
-    flags: u32,
-    file_index: u32,
-) !*linux.io_uring_sqe {
-    const sqe = try self.get_sqe();
-    sqe.prep_socket_direct(domain, socket_type, protocol, flags, file_index);
-    sqe.user_data = user_data;
-    return sqe;
-}
+    pub fn prep_pipe(
+        sqe: *Sqe,
+        fds: *[2]linux.fd_t,
+        flags: linux.Pipe2,
+    ) void {
+        sqe.prep_rw(
+            .pipe,
+            0,
+            @intFromPtr(fds),
+            0,
+            0,
+        );
+        sqe.rw_flags = @bitCast(flags);
+    }
-/// Prepares a socket creation request for registered file, index chosen by kernel (file index alloc).
-/// File index will be returned in CQE res field.
-/// Available since 5.19
-pub fn socket_direct_alloc(
-    self: *IoUring,
-    user_data: u64,
-    domain: u32,
-    socket_type: u32,
-    protocol: u32,
-    flags: u32,
-) !*linux.io_uring_sqe {
-    const sqe = try self.get_sqe();
-    sqe.prep_socket_direct_alloc(domain, socket_type, protocol, flags);
-    sqe.user_data = user_data;
-    return sqe;
-}
+    pub fn prep_pipe_direct(
+        sqe: *Sqe,
+        fds: *[2]linux.fd_t,
+        flags: linux.Pipe2,
+        file_index: u32,
+    ) void {
+        sqe.prep_pipe(fds, flags);
+        sqe.set_target_fixed_file(file_index);
+    }
-/// Queues (but does not submit) an SQE to perform an `bind(2)` on a socket.
-/// Returns a pointer to the SQE.
-/// Available since 6.11
-pub fn bind(
-    self: *IoUring,
-    user_data: u64,
-    fd: linux.fd_t,
-    addr: *const posix.sockaddr,
-    addrlen: posix.socklen_t,
-    flags: u32,
-) !*linux.io_uring_sqe {
-    const sqe = try self.get_sqe();
-    sqe.prep_bind(fd, addr, addrlen, flags);
-    sqe.user_data = user_data;
-    return sqe;
-}
+    pub fn prep_bind(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        addr: *const linux.sockaddr,
+        addrlen: linux.socklen_t,
+    ) void {
+        sqe.prep_rw(.bind, fd, @intFromPtr(addr), 0, addrlen);
+    }
-/// Queues (but does not submit) an SQE to perform an `listen(2)` on a socket.
-/// Returns a pointer to the SQE.
-/// Available since 6.11
-pub fn listen(
-    self: *IoUring,
-    user_data: u64,
-    fd: linux.fd_t,
-    backlog: usize,
-    flags: u32,
-) !*linux.io_uring_sqe {
-    const sqe = try self.get_sqe();
-    sqe.prep_listen(fd, backlog, flags);
-    sqe.user_data = user_data;
-    return sqe;
-}
+    pub fn prep_listen(
+        sqe: *Sqe,
+        fd: linux.fd_t,
+        backlog: u32,
+    ) void {
+        sqe.prep_rw(.listen, fd, 0, backlog, 0);
+    }
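+    // Illustrative sketch (not part of this change; `ring`, `fd`, and
+    // `addr` — a std.net.Address — are hypothetical): prep_bind and
+    // prep_listen replace the classic bind(2)/listen(2) pair, and an SQE
+    // link lets the listen run only after a successful bind:
+    //
+    //     const bind_sqe = try ring.get_sqe();
+    //     bind_sqe.prep_bind(fd, &addr.any, addr.getOsSockLen());
+    //     bind_sqe.link_next();
+    //     const listen_sqe = try ring.get_sqe();
+    //     listen_sqe.prep_listen(fd, 128);
+    //     _ = try ring.submit();
+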
-/// Prepares an cmd request for a socket.
-/// See: https://man7.org/linux/man-pages/man3/io_uring_prep_cmd.3.html
-/// Available since 6.7.
-pub fn cmd_sock(
-    self: *IoUring,
-    user_data: u64,
-    cmd_op: linux.IO_URING_SOCKET_OP,
-    fd: linux.fd_t,
-    level: u32, // linux.SOL
-    optname: u32, // linux.SO
-    optval: u64, // pointer to the option value
-    optlen: u32, // size of the option value
-) !*linux.io_uring_sqe {
-    const sqe = try self.get_sqe();
-    sqe.prep_cmd_sock(cmd_op, fd, level, optname, optval, optlen);
-    sqe.user_data = user_data;
-    return sqe;
-}
+    pub fn prep_cmd_sock(
+        sqe: *Sqe,
+        cmd_op: SocketOp,
+        fd: linux.fd_t,
+        level: linux.Sol,
+        optname: linux.So,
+        optval: u64,
+        optlen: u32,
+    ) void {
+        sqe.prep_rw(.uring_cmd, fd, 0, 0, 0);
+        // off is overloaded with cmd_op, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L39
+        sqe.off = @intFromEnum(cmd_op);
+        // addr is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L46
+        sqe.addr = @bitCast(packed struct {
+            level: u32,
+            optname: u32,
+        }{
+            .level = @intFromEnum(level),
+            .optname = @intFromEnum(optname),
+        });
+        // splice_fd_in is overloaded, u32 -> i32
+        sqe.splice_fd_in = @bitCast(optlen);
+        // addr3 is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L102
+        sqe.addr3 = optval;
+    }
-/// Prepares set socket option for the optname argument, at the protocol
-/// level specified by the level argument.
-/// Available since 6.7.n
-pub fn setsockopt(
-    self: *IoUring,
-    user_data: u64,
-    fd: linux.fd_t,
-    level: u32, // linux.SOL
-    optname: u32, // linux.SO
-    opt: []const u8,
-) !*linux.io_uring_sqe {
-    return try self.cmd_sock(
-        user_data,
-        .SETSOCKOPT,
-        fd,
-        level,
-        optname,
-        @intFromPtr(opt.ptr),
-        @intCast(opt.len),
-    );
-}
+    pub fn set_flags(sqe: *Sqe, flags: Sqe.IoSqe) void {
+        const updated_flags = @as(u8, @bitCast(sqe.flags)) | @as(u8, @bitCast(flags));
+        sqe.flags = @bitCast(updated_flags);
+    }
-/// Prepares get socket option to retrieve the value for the option specified by
-/// the option_name argument for the socket specified by the fd argument.
-/// Available since 6.7.
-pub fn getsockopt(
-    self: *IoUring,
-    user_data: u64,
-    fd: linux.fd_t,
-    level: u32, // linux.SOL
-    optname: u32, // linux.SO
-    opt: []u8,
-) !*linux.io_uring_sqe {
-    return try self.cmd_sock(
-        user_data,
-        .GETSOCKOPT,
-        fd,
-        level,
-        optname,
-        @intFromPtr(opt.ptr),
-        @intCast(opt.len),
-    );
-}
+    /// Forms a link with the next SQE in the submission ring: the next SQE
+    /// will not be started before this one completes, allowing chains of
+    /// SQEs to be built.
+    pub fn link_next(sqe: *Sqe) void {
+        sqe.flags.io_link = true;
+    }
+};
-pub const SubmissionQueue = struct {
+/// matches `io_uring_sq` in liburing
+pub const Sq = struct {
     head: *u32,
     tail: *u32,
     mask: u32,
-    flags: *u32,
+    flags: *Flags,
     dropped: *u32,
     array: []u32,
-    sqes: []linux.io_uring_sqe,
+    sqes: []Sqe,
     mmap: []align(page_size_min) u8,
     mmap_sqes: []align(page_size_min) u8,
-
     // We use `sqe_head` and `sqe_tail` in the same way as liburing:
     // We increment `sqe_tail` (but not `tail`) for each call to `get_sqe()`.
-    // We then set `tail` to `sqe_tail` once, only when these events are actually submitted.
-    // This allows us to amortize the cost of the @atomicStore to `tail` across multiple SQEs.
+    // We then set `tail` to `sqe_tail` once, only when these events are
+    // actually submitted. This allows us to amortize the cost of the
+    // @atomicStore to `tail` across multiple SQEs. 
sqe_head: u32 = 0, sqe_tail: u32 = 0, - pub fn init(fd: linux.fd_t, p: linux.io_uring_params) !SubmissionQueue { + /// sq_ring.flags + pub const Flags = packed struct(u32) { + /// needs io_uring_enter wakeup + need_wakeup: bool = false, + /// CQ ring is overflown + cq_overflow: bool = false, + /// task should enter the kernel + taskrun: bool = false, + _: u29 = 0, + }; + + pub fn init(fd: linux.fd_t, p: Params) !Sq { assert(fd >= 0); - assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) != 0); + assert(p.features.single_mmap); const size = @max( p.sq_off.array + p.sq_entries * @sizeOf(u32), - p.cq_off.cqes + p.cq_entries * @sizeOf(linux.io_uring_cqe), + p.cq_off.cqes + p.cq_entries * @sizeOf(Cqe), ); const mmap = try posix.mmap( null, @@ -1526,31 +3589,32 @@ pub const SubmissionQueue = struct { posix.PROT.READ | posix.PROT.WRITE, .{ .TYPE = .SHARED, .POPULATE = true }, fd, - linux.IORING_OFF_SQ_RING, + constants.OFF_SQ_RING, ); errdefer posix.munmap(mmap); assert(mmap.len == size); - // The motivation for the `sqes` and `array` indirection is to make it possible for the - // application to preallocate static linux.io_uring_sqe entries and then replay them when needed. - const size_sqes = p.sq_entries * @sizeOf(linux.io_uring_sqe); + // The motivation for the `sqes` and `array` indirection is to make it + // possible for the application to preallocate static io_uring_sqe + // entries and then replay them when needed. + const size_sqes = p.sq_entries * @sizeOf(Sqe); const mmap_sqes = try posix.mmap( null, size_sqes, posix.PROT.READ | posix.PROT.WRITE, .{ .TYPE = .SHARED, .POPULATE = true }, fd, - linux.IORING_OFF_SQES, + constants.OFF_SQES, ); errdefer posix.munmap(mmap_sqes); assert(mmap_sqes.len == size_sqes); const array: [*]u32 = @ptrCast(@alignCast(&mmap[p.sq_off.array])); - const sqes: [*]linux.io_uring_sqe = @ptrCast(@alignCast(&mmap_sqes[0])); - // We expect the kernel copies p.sq_entries to the u32 pointed to by p.sq_off.ring_entries, - // see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L7843-L7844. + const sqes: [*]Sqe = @ptrCast(@alignCast(&mmap_sqes[0])); + // We expect the kernel copies p.sq_entries to the u32 pointed to by + // p.sq_off.ring_entries, See https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L7843-L7844. 
assert(p.sq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_entries]))).*); - return SubmissionQueue{ + return .{ .head = @ptrCast(@alignCast(&mmap[p.sq_off.head])), .tail = @ptrCast(@alignCast(&mmap[p.sq_off.tail])), .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_mask]))).*, @@ -1563,26 +3627,34 @@ pub const SubmissionQueue = struct { }; } - pub fn deinit(self: *SubmissionQueue) void { + pub fn deinit(self: *Sq) void { posix.munmap(self.mmap_sqes); posix.munmap(self.mmap); } }; -pub const CompletionQueue = struct { +/// matches `io_uring_cq` in liburing +pub const Cq = struct { head: *u32, tail: *u32, mask: u32, overflow: *u32, - cqes: []linux.io_uring_cqe, + cqes: []Cqe, + + /// cq_ring.flags + pub const Flags = packed struct(u32) { + /// disable eventfd notifications + eventfd_disabled: bool = false, + _: u31 = 0, + }; - pub fn init(fd: linux.fd_t, p: linux.io_uring_params, sq: SubmissionQueue) !CompletionQueue { + pub fn init(fd: linux.fd_t, p: Params, sq: Sq) !Cq { assert(fd >= 0); - assert((p.features & linux.IORING_FEAT_SINGLE_MMAP) != 0); + assert(p.features.single_mmap); const mmap = sq.mmap; - const cqes: [*]linux.io_uring_cqe = @ptrCast(@alignCast(&mmap[p.cq_off.cqes])); + const cqes: [*]Cqe = @ptrCast(@alignCast(&mmap[p.cq_off.cqes])); assert(p.cq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_entries]))).*); - return CompletionQueue{ + return .{ .head = @ptrCast(@alignCast(&mmap[p.cq_off.head])), .tail = @ptrCast(@alignCast(&mmap[p.cq_off.tail])), .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_mask]))).*, @@ -1591,10 +3663,11 @@ pub const CompletionQueue = struct { }; } - pub fn deinit(self: *CompletionQueue) void { + pub fn deinit(self: *Cq) void { _ = self; // A no-op since we now share the mmap with the submission queue. - // Here for symmetry with the submission queue, and for any future feature support. + // Here for symmetry with the submission queue, and for any future + // feature support. } }; @@ -1609,20 +3682,19 @@ pub const CompletionQueue = struct { /// ready to receive data, a buffer is picked automatically and the resulting /// CQE will contain the buffer ID in `cqe.buffer_id()`. Use `get` method to get /// buffer for buffer ID identified by CQE. Once the application has processed -/// the buffer, it may hand ownership back to the kernel, by calling `put` +/// the buffer, it may hand ownership back to the kernel, by calling `put()` /// allowing the cycle to repeat. /// /// Depending on the rate of arrival of data, it is possible that a given buffer /// group will run out of buffers before those in CQEs can be put back to the /// kernel. If this happens, a `cqe.err()` will have ENOBUFS as the error value. -/// pub const BufferGroup = struct { /// Parent ring for which this group is registered. ring: *IoUring, /// Pointer to the memory shared by the kernel. /// `buffers_count` of `io_uring_buf` structures are shared by the kernel. /// First `io_uring_buf` is overlaid by `io_uring_buf_ring` struct. - br: *align(page_size_min) linux.io_uring_buf_ring, + br: *align(page_size_min) BufferRing, /// Contiguous block of memory of size (buffers_count * buffer_size). buffers: []u8, /// Size of each buffer in buffers. 
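Worth pausing on the new `BufferGroup` API before the diff continues: the hunks below replace the free-standing `buf_ring_*` helpers with methods on `BufferRing`, and a typical consumption loop under the new API looks roughly like the sketch below. This is illustrative only — the `init` argument order, the `group_id`/`socket_fd`/`running` values, and the `handle` consumer are assumptions, not part of the diff:

```zig
// Provided-buffers sketch: one group of 8 x 4096-byte buffers feeding a
// multishot recv; each CQE borrows a kernel-selected buffer via get() and
// hands it back with put().
var group = try linux.IoUring.BufferGroup.init(&ring, allocator, group_id, 4096, 8);
defer group.deinit(allocator);

_ = try group.recv_multishot(0xcafe, socket_fd, .{});
_ = try ring.submit();

while (running) {
    const cqe = try ring.copy_cqe();
    if (cqe.err() == .NOBUFS) break; // group ran dry; re-provision buffers
    const data = try group.get(cqe); // slice of the kernel-selected buffer
    handle(data); // hypothetical consumer
    try group.put(cqe); // hand ownership back to the kernel
}
```
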
@@ -1646,20 +3718,19 @@
         const heads = try allocator.alloc(u32, buffers_count);
         errdefer allocator.free(heads);
-        const br = try setup_buf_ring(ring.fd, buffers_count, group_id, .{ .inc = true });
-        buf_ring_init(br);
+        const br = try ring.init_buffer_ring(buffers_count, group_id, .{ .iou_pbuf_ring_inc = true });
-        const mask = buf_ring_mask(buffers_count);
+        const mask = br.mask(buffers_count);
         var i: u16 = 0;
         while (i < buffers_count) : (i += 1) {
             const pos = buffer_size * i;
             const buf = buffers[pos .. pos + buffer_size];
             heads[i] = 0;
-            buf_ring_add(br, buf, i, mask, i);
+            br.add(buf, i, mask, i);
         }
-        buf_ring_advance(br, buffers_count);
+        br.advance(buffers_count);
-        return BufferGroup{
+        return .{
             .ring = ring,
             .group_id = group_id,
             .br = br,
@@ -1670,217 +3741,1063 @@
         };
     }
-    pub fn deinit(self: *BufferGroup, allocator: mem.Allocator) void {
-        free_buf_ring(self.ring.fd, self.br, self.buffers_count, self.group_id);
-        allocator.free(self.buffers);
-        allocator.free(self.heads);
+    pub fn deinit(self: *BufferGroup, allocator: mem.Allocator) void {
+        self.br.deinit(self.ring, self.buffers_count, self.group_id);
+        allocator.free(self.buffers);
+        allocator.free(self.heads);
+    }
+
+    /// Prepare multishot read operation which will select buffer from this
+    /// group.
+    pub fn read_multishot(
+        self: *BufferGroup,
+        user_data: u64,
+        fd: linux.fd_t,
+        nbytes: u32,
+        offset: u64,
+    ) !*Sqe {
+        var sqe = try self.ring.get_sqe();
+        sqe.prep_rw(.read_multishot, fd, undefined, nbytes, offset);
+        sqe.flags.buffer_select = true;
+        sqe.buf_index = self.group_id;
+        sqe.user_data = user_data;
+        return sqe;
+    }
+
+    /// Prepare recv operation which will select buffer from this group.
+    pub fn recv(
+        self: *BufferGroup,
+        user_data: u64,
+        fd: linux.fd_t,
+        flags: linux.Msg,
+    ) !*Sqe {
+        var sqe = try self.ring.get_sqe();
+        sqe.prep_rw(.recv, fd, 0, 0, 0);
+        sqe.rw_flags = @bitCast(flags);
+        sqe.flags.buffer_select = true;
+        sqe.buf_index = self.group_id;
+        sqe.user_data = user_data;
+        return sqe;
+    }
+
+    /// Prepare multishot recv operation which will select buffer from this
+    /// group.
+    pub fn recv_multishot(
+        self: *BufferGroup,
+        user_data: u64,
+        fd: linux.fd_t,
+        flags: linux.Msg,
+    ) !*Sqe {
+        var sqe = try self.recv(user_data, fd, flags);
+        sqe.ioprio.send_recv.recv_multishot = true;
+        return sqe;
+    }
+
+    // Get buffer by id.
+    fn get_by_id(self: *BufferGroup, buffer_id: u16) []u8 {
+        const pos = self.buffer_size * buffer_id;
+        return self.buffers[pos .. pos + self.buffer_size][self.heads[buffer_id]..];
+    }
+
+    /// Get buffer by CQE.
+    pub fn get(self: *BufferGroup, cqe: Cqe) ![]u8 {
+        const buffer_id = try cqe.buffer_id();
+        const used_len: usize = @intCast(cqe.res);
+        return self.get_by_id(buffer_id)[0..used_len];
+    }
+
+    /// Release buffer from CQE to the kernel.
+    pub fn put(self: *BufferGroup, cqe: Cqe) !void {
+        const buffer_id = try cqe.buffer_id();
+        if (cqe.flags.f_buf_more) {
+            // Incremental consumption active; the kernel will write to this
+            // buffer again.
+            const used_len: u32 = @intCast(cqe.res);
+            // Track what part of the buffer is used
+            self.heads[buffer_id] += used_len;
+            return;
+        }
+        self.heads[buffer_id] = 0;
+
+        // Release buffer to the kernel.
+        const mask = self.br.mask(self.buffers_count);
+        self.br.add(self.get_by_id(buffer_id), buffer_id, mask, 0);
+        self.br.advance(1);
+    }
+};
+
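+// Note on incremental consumption (descriptive, mirrors put() above): when a
+// group is registered with `iou_pbuf_ring_inc`, a CQE carrying `f_buf_more`
+// means the kernel still owns the tail of the buffer, so put() only advances
+// heads[buffer_id]; the buffer is recycled onto the ring (br.add followed by
+// br.advance) only once a CQE arrives with `f_buf_more` cleared.
+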
+/// Used to select how the read should be handled.
+pub const ReadBuffer = union(enum) {
+    /// io_uring will read directly into this buffer
+    buffer: []u8,
+    /// io_uring will read directly into these buffers using readv.
+    iovecs: []const posix.iovec,
+    /// io_uring will select a buffer that has previously been provided with
+    /// `provide_buffers`.
+    /// `group_id` must contain at least one buffer for the read to work.
+    /// `len` controls the number of bytes to read into the selected buffer.
+    buffer_selection: struct {
+        group_id: u16,
+        len: usize,
+    },
+};
+
+/// Used to select how the write should be handled.
+pub const WriteBuffer = union(enum) {
+    /// io_uring will write data from this buffer into fd.
+    buffer: []const u8,
+    /// io_uring will write data from iovecs into fd using pwritev.
+    iovecs: []const posix.iovec_const,
+};
+
+/// Used to select how get/setxattr should be handled.
+pub const XattrSource = union(enum) {
+    /// Get/Set xattr associated with the given path in the filesystem
+    path: []const u8,
+    /// Get/Set xattr for the opened file referenced by this fd
+    fd: linux.fd_t,
+};
+
+/// Used to select how the recv call should be handled.
+pub const RecvBuffer = union(enum) {
+    /// io_uring will recv directly into this buffer
+    buffer: []u8,
+    /// io_uring will select a buffer that has previously been provided with
+    /// `provide_buffers`.
+    /// `group_id` must contain at least one buffer for the recv call to work.
+    /// `len` controls the number of bytes to read into the selected buffer.
+    buffer_selection: struct {
+        group_id: u16,
+        len: usize,
+    },
+};
+
+/// Filled with the offsets for `mmap(2)`
+/// matches `io_sqring_offsets` in liburing
+pub const SqOffsets = extern struct {
+    /// offset of ring head
+    head: u32,
+    /// offset of ring tail
+    tail: u32,
+    /// ring mask value
+    ring_mask: u32,
+    /// entries in ring
+    ring_entries: u32,
+    /// ring flags index
+    flags: u32,
+    /// number of sqes not submitted
+    dropped: u32,
+    /// sqe index array
+    array: u32,
+    resv1: u32,
+    user_addr: u64,
+};
+
+/// matches `io_cqring_offsets` in liburing
+pub const CqOffsets = extern struct {
+    head: u32,
+    tail: u32,
+    ring_mask: u32,
+    ring_entries: u32,
+    overflow: u32,
+    cqes: u32,
+    flags: u32, // flags index
+    resv: u32,
+    user_addr: u64,
+};
+
+/// Passed in for `io_uring_setup(2)`. Copied back with updated info on success.
+/// matches `io_uring_params` in liburing
+pub const Params = extern struct {
+    sq_entries: u32,
+    cq_entries: u32,
+    flags: uflags.Setup,
+    sq_thread_cpu: u32,
+    sq_thread_idle: u32,
+    features: uflags.Features,
+    wq_fd: u32,
+    resv: [3]u32,
+    sq_off: SqOffsets,
+    cq_off: CqOffsets,
+};
+
+/// matches `io_uring_region_desc` in liburing
+pub const RegionDesc = extern struct {
+    user_addr: u64,
+    size: u64,
+    flags: Flags,
+    id: u32,
+    mmap_offset: u64,
+    __resv: [4]u64,
+
+    pub const Flags = packed struct(u32) {
+        /// initialise with user-provided memory pointed to by user_addr
+        type_user: bool = false,
+        _: u31 = 0,
+    };
+};
+
+/// matches `io_uring_mem_region_reg` in liburing
+pub const MemRegionRegister = extern struct {
+    /// struct io_uring_region_desc (RegionDesc in Zig)
+    region_uptr: u64,
+    flags: Flags,
+    __resv: [2]u64,
+
+    pub const Flags = packed struct(u64) {
+        /// expose the region as registered wait arguments
+        reg_wait_arg: bool = false,
+        _: u63 = 0,
+    };
+};
+
+/// matches `io_uring_rsrc_register` in liburing
+pub const RsrcRegister = extern struct {
+    nr: u32,
+    flags: Flags,
+    resv2: u64,
+    data: u64,
+    tags: u64,
+
+    pub const Flags = packed struct(u32) {
+        /// Register a fully sparse file space, rather than pass in an array of
+        /// all -1 file descriptors.
+        register_sparse: bool = false,
+        _: u31 = 0,
+    };
+};
+
+/// matches `io_uring_rsrc_update` in liburing
+pub const RsrcUpdate = extern struct {
+    offset: u32,
+    resv: u32,
+    data: u64,
+};
+
+/// matches `io_uring_rsrc_update2` in liburing
+pub const RsrcUpdate2 = extern struct {
+    offset: u32,
+    resv: u32,
+    data: u64,
+    tags: u64,
+    nr: u32,
+    resv2: u32,
+};
+
+/// matches `io_uring_probe_op` in liburing
+pub const ProbeOp = extern struct {
+    op: Op,
+    resv: u8,
+    flags: Flags,
+    resv2: u32,
+
+    pub const Flags = packed struct(u16) {
+        op_supported: bool = false,
+        _: u15 = 0,
+    };
+
+    pub fn is_supported(self: ProbeOp) bool {
+        return self.flags.op_supported;
+    }
+};
+
+/// matches `io_uring_probe` in liburing
+pub const Probe = extern struct {
+    /// Last opcode supported
+    last_op: Op,
+    /// Length of ops[] array below
+    ops_len: u8,
+    resv: u16,
+    resv2: [3]u32,
+    ops: [256]ProbeOp,
+
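+    // Illustrative use (the probe-fetching helper is assumed, not defined
+    // in this diff): newer opcodes can be gated on a probe at runtime, e.g.
+    //
+    //     if (probe.is_supported(.pipe)) {
+    //         // safe to submit prep_pipe on this kernel
+    //     }
+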
+    /// Is the operation supported on the running kernel.
+    pub fn is_supported(self: *const Probe, op: Op) bool {
+        const i = @intFromEnum(op);
+        if (i > @intFromEnum(self.last_op) or i >= self.ops_len)
+            return false;
+        return self.ops[i].is_supported();
+    }
+};
+
+/// matches `io_uring_restriction` in liburing
+pub const Restriction = extern struct {
+    opcode: RestrictionOp,
+    arg: extern union {
+        /// IORING_RESTRICTION_REGISTER_OP (RegisterOp is backed by a u8)
+        register_op: RegisterOp,
+        /// IORING_RESTRICTION_SQE_OP
+        sqe_op: Op,
+        /// IORING_RESTRICTION_SQE_FLAGS_*
+        sqe_flags: Sqe.IoSqe,
+    },
+    resv: u8,
+    resv2: [3]u32,
+};
+
+/// matches `io_uring_clock_register` in liburing
+pub const ClockRegister = extern struct {
+    clockid: u32,
+    __resv: [3]u32,
+};
+
+/// matches `io_uring_clone_buffers` in liburing
+pub const CloneBuffers = extern struct {
+    src_fd: u32,
+    flags: Flags,
+    src_off: u32,
+    dst_off: u32,
+    nr: u32,
+    pad: [3]u32,
+
+    pub const Flags = packed struct(u32) {
+        register_src_registered: bool = false,
+        register_dst_replace: bool = false,
+        _: u30 = 0,
+    };
+};
+
+/// matches `io_uring_buf` in liburing
+pub const Buffer = extern struct {
+    addr: u64,
+    len: u32,
+    bid: u16,
+    resv: u16,
+};
+
+/// matches `io_uring_buf_ring` in liburing
+pub const BufferRing = extern struct {
+    resv1: u64,
+    resv2: u32,
+    resv3: u16,
+    tail: u16,
+
+    /// Initialises `br` so that it is ready to be used.
+    /// matches `io_uring_buf_ring_init` in liburing
+    fn init(br: *align(page_size_min) BufferRing) void {
+        br.tail = 0;
    }
-    // Prepare recv operation which will select buffer from this group.
-    pub fn recv(self: *BufferGroup, user_data: u64, fd: linux.fd_t, flags: u32) !*linux.io_uring_sqe {
-        var sqe = try self.ring.get_sqe();
-        sqe.prep_rw(.RECV, fd, 0, 0, 0);
-        sqe.rw_flags = flags;
-        sqe.flags |= linux.IOSQE_BUFFER_SELECT;
-        sqe.buf_index = self.group_id;
-        sqe.user_data = user_data;
-        return sqe;
+    /// Unregisters a previously registered shared buffer ring, returned from
+    /// `IoUring.init_buffer_ring`.
+    pub fn deinit(br: *align(page_size_min) BufferRing, uring: *IoUring, entries: u32, group_id: u16) void {
+        uring.unregister_buffer_ring(group_id) catch {};
+        var mmap: []align(page_size_min) u8 = undefined;
+        mmap.ptr = @ptrCast(br);
+        mmap.len = entries * @sizeOf(Buffer);
+        posix.munmap(mmap);
    }
-    // Prepare multishot recv operation which will select buffer from this group.
-    pub fn recv_multishot(self: *BufferGroup, user_data: u64, fd: linux.fd_t, flags: u32) !*linux.io_uring_sqe {
-        var sqe = try self.recv(user_data, fd, flags);
-        sqe.ioprio |= linux.IORING_RECV_MULTISHOT;
-        return sqe;
+    /// Calculates the appropriate size mask for a buffer ring.
+    /// `entries` is the ring entries as specified in io_uring_register_buf_ring.
+    pub fn mask(_: *align(page_size_min) BufferRing, entries: u16) u16 {
+        return entries - 1;
    }
-    // Get buffer by id.
-    fn get_by_id(self: *BufferGroup, buffer_id: u16) []u8 {
-        const pos = self.buffer_size * buffer_id;
-        return self.buffers[pos .. pos + self.buffer_size][self.heads[buffer_id]..];
+    /// Assigns `buffer` to the `br` buffer ring.
+    /// `buffer_id` is the identifier that will be returned in the CQE.
+    /// `buffer_offset` is the offset to insert at from the current tail.
+    /// If just one buffer is provided before the ring tail is committed with
+    /// `advance`, then the offset should be 0.
+    /// If buffers are provided in a loop before being committed, the offset must
+    /// be incremented by one for each buffer added. 
+ pub fn add( + br: *align(page_size_min) BufferRing, + buffer: []u8, + buffer_id: u16, + buffer_mask: u16, + buffer_offset: u16, + ) void { + const bufs: [*]Buffer = @ptrCast(br); + const buf: *Buffer = &bufs[(br.tail +% buffer_offset) & buffer_mask]; + + buf.addr = @intFromPtr(buffer.ptr); + buf.len = @intCast(buffer.len); + buf.bid = buffer_id; } - // Get buffer by CQE. - pub fn get(self: *BufferGroup, cqe: linux.io_uring_cqe) ![]u8 { - const buffer_id = try cqe.buffer_id(); - const used_len = @as(usize, @intCast(cqe.res)); - return self.get_by_id(buffer_id)[0..used_len]; + /// Make `count` new buffers visible to the kernel. Called after + /// `io_uring_buf_ring_add` has been called `count` times to fill in new + /// buffers. + pub fn advance(br: *align(page_size_min) BufferRing, count: u16) void { + const tail: u16 = br.tail +% count; + @atomicStore(u16, &br.tail, tail, .release); } +}; - // Release buffer from CQE to the kernel. - pub fn put(self: *BufferGroup, cqe: linux.io_uring_cqe) !void { - const buffer_id = try cqe.buffer_id(); - if (cqe.flags & linux.IORING_CQE_F_BUF_MORE == linux.IORING_CQE_F_BUF_MORE) { - // Incremental consumption active, kernel will write to the this buffer again - const used_len = @as(u32, @intCast(cqe.res)); - // Track what part of the buffer is used - self.heads[buffer_id] += used_len; - return; - } - self.heads[buffer_id] = 0; +/// argument for IORING_(UN)REGISTER_PBUF_RING +/// matches `io_uring_buf_reg` in liburing +pub const BufferRegister = extern struct { + ring_addr: u64, + ring_entries: u32, + bgid: u16, + flags: Flags, + resv: [3]u64, + + /// Flags for IORING_REGISTER_PBUF_RING. + pub const Flags = packed struct(u16) { + /// IOU_PBUF_RING_MMAP: + /// If set, kernel will allocate the memory for the ring. + /// The application must not set a ring_addr in struct io_uring_buf_reg + /// instead it must subsequently call mmap(2) with the offset set + /// as: IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) to get + /// a virtual mapping for the ring. + iou_pbuf_ring_mmap: bool = false, + /// IOU_PBUF_RING_INC: + /// If set, buffers consumed from this buffer ring can be + /// consumed incrementally. Normally one (or more) buffers + /// are fully consumed. With incremental consumptions, it's + /// feasible to register big ranges of buffers, and each + /// use of it will consume only as much as it needs. This + /// requires that both the kernel and application keep + /// track of where the current read/recv index is at. + iou_pbuf_ring_inc: bool = false, + _: u14 = 0, + }; +}; - // Release buffer to the kernel. const mask = buf_ring_mask(self.buffers_count); - const mask = buf_ring_mask(self.buffers_count); - buf_ring_add(self.br, self.get_by_id(buffer_id), buffer_id, mask, 0); - buf_ring_advance(self.br, 1); - } +/// argument for IORING_REGISTER_PBUF_STATUS +/// matches `io_uring_buf_status` in liburing +pub const BufferStatus = extern struct { + /// input + buf_group: u32, + /// output + head: u32, + resv: [8]u32, }; -/// Registers a shared buffer ring to be used with provided buffers. -/// `entries` number of `io_uring_buf` structures is mem mapped and shared by kernel. -/// `fd` is IO_Uring.fd for which the provided buffer ring is being registered. -/// `entries` is the number of entries requested in the buffer ring, must be power of 2. -/// `group_id` is the chosen buffer group ID, unique in IO_Uring. 
-pub fn setup_buf_ring( - fd: linux.fd_t, - entries: u16, - group_id: u16, - flags: linux.io_uring_buf_reg.Flags, -) !*align(page_size_min) linux.io_uring_buf_ring { - if (entries == 0 or entries > 1 << 15) return error.EntriesNotInRange; - if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo; +/// argument for IORING_(UN)REGISTER_NAPI +/// matches `io_uring_napi` in liburing +pub const Napi = extern struct { + busy_poll_to: u32, + prefer_busy_poll: u8, + pad: [3]u8, + resv: u64, +}; - const mmap_size = @as(usize, entries) * @sizeOf(linux.io_uring_buf); - const mmap = try posix.mmap( - null, - mmap_size, - posix.PROT.READ | posix.PROT.WRITE, - .{ .TYPE = .PRIVATE, .ANONYMOUS = true }, - -1, - 0, - ); - errdefer posix.munmap(mmap); - assert(mmap.len == mmap_size); +/// Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG +/// set, where the actual argument is an index into a previously registered +/// fixed wait region described by the below structure. +/// matches `io_uring_reg_wait` in liburing +pub const RegisterWait = extern struct { + ts: linux.kernel_timespec, + min_wait_usec: u32, + flags: Flags, + sigmask: u64, + sigmask_sz: u32, + pad: [3]u32, + pad2: [2]u64, + + pub const Flags = packed struct(u32) { + reg_wait_ts: bool = false, + _: u31 = 0, + }; +}; - const br: *align(page_size_min) linux.io_uring_buf_ring = @ptrCast(mmap.ptr); - try register_buf_ring(fd, @intFromPtr(br), entries, group_id, flags); - return br; -} +/// Argument for `io_uring_enter(2)` with IORING_GETEVENTS | +/// IORING_ENTER_EXT_ARG +/// matches `io_uring_getevents_arg` in liburing +pub const GetEventsArg = extern struct { + sigmask: u64, + sigmask_sz: u32, + pad: u32, + ts: u64, +}; -fn register_buf_ring( - fd: linux.fd_t, +/// Argument for IORING_REGISTER_SYNC_CANCEL +/// matches `io_uring_sync_cancel_reg` in liburing +pub const SyncCancelRegister = extern struct { addr: u64, - entries: u32, - group_id: u16, - flags: linux.io_uring_buf_reg.Flags, -) !void { - var reg = mem.zeroInit(linux.io_uring_buf_reg, .{ - .ring_addr = addr, - .ring_entries = entries, - .bgid = group_id, - .flags = flags, - }); - var res = linux.io_uring_register(fd, .REGISTER_PBUF_RING, @as(*const anyopaque, @ptrCast(®)), 1); - if (linux.E.init(res) == .INVAL and reg.flags.inc) { - // Retry without incremental buffer consumption. - // It is available since kernel 6.12. returns INVAL on older. 
- reg.flags.inc = false; - res = linux.io_uring_register(fd, .REGISTER_PBUF_RING, @as(*const anyopaque, @ptrCast(®)), 1); - } - try handle_register_buf_ring_result(res); -} + fd: i32, + flags: uflags.AsyncCancel, + timeout: linux.kernel_timespec, + opcode: Op, + pad: [7]u8, + pad2: [4]u64, +}; -fn unregister_buf_ring(fd: linux.fd_t, group_id: u16) !void { - var reg = mem.zeroInit(linux.io_uring_buf_reg, .{ - .bgid = group_id, - }); - const res = linux.io_uring_register( - fd, - .UNREGISTER_PBUF_RING, - @as(*const anyopaque, @ptrCast(®)), - 1, - ); - try handle_register_buf_ring_result(res); -} +/// Argument for IORING_REGISTER_FILE_ALLOC_RANGE +/// The range is specified as [off, off + len) +/// matches `io_uring_file_index_range` in liburing +pub const FileIndexRange = extern struct { + off: u32, + len: u32, + resv: u64, +}; -fn handle_register_buf_ring_result(res: usize) !void { - switch (linux.E.init(res)) { - .SUCCESS => {}, - .INVAL => return error.ArgumentsInvalid, - else => |errno| return posix.unexpectedErrno(errno), - } -} +/// matches `io_uring_recvmsg_out` in liburing +pub const RecvMsgOut = extern struct { + namelen: u32, + controllen: u32, + payloadlen: u32, + flags: linux.Msg, +}; -// Unregisters a previously registered shared buffer ring, returned from io_uring_setup_buf_ring. -pub fn free_buf_ring(fd: linux.fd_t, br: *align(page_size_min) linux.io_uring_buf_ring, entries: u32, group_id: u16) void { - unregister_buf_ring(fd, group_id) catch {}; - var mmap: []align(page_size_min) u8 = undefined; - mmap.ptr = @ptrCast(br); - mmap.len = entries * @sizeOf(linux.io_uring_buf); - posix.munmap(mmap); -} +/// Zero copy receive refill queue entry +/// matches `io_uring_zcrx_rqe` in liburing +pub const ZcrxRqe = extern struct { + off: u64, + len: u32, + __pad: u32, +}; -/// Initialises `br` so that it is ready to be used. -pub fn buf_ring_init(br: *linux.io_uring_buf_ring) void { - br.tail = 0; -} +/// matches `io_uring_zcrx_cqe` in liburing +pub const ZcrxCqe = extern struct { + off: u64, + __pad: u64, +}; -/// Calculates the appropriate size mask for a buffer ring. -/// `entries` is the ring entries as specified in io_uring_register_buf_ring. -pub fn buf_ring_mask(entries: u16) u16 { - return entries - 1; -} +/// matches `io_uring_zcrx_offsets` in liburing +pub const ZcrxOffsets = extern struct { + head: u32, + tail: u32, + rqes: u32, + __resv2: u32, + __resv: [2]u64, +}; -/// Assigns `buffer` with the `br` buffer ring. -/// `buffer_id` is identifier which will be returned in the CQE. -/// `buffer_offset` is the offset to insert at from the current tail. -/// If just one buffer is provided before the ring tail is committed with advance then offset should be 0. -/// If buffers are provided in a loop before being committed, the offset must be incremented by one for each buffer added. 
-pub fn buf_ring_add(
-    br: *linux.io_uring_buf_ring,
-    buffer: []u8,
-    buffer_id: u16,
-    mask: u16,
-    buffer_offset: u16,
-) void {
-    const bufs: [*]linux.io_uring_buf = @ptrCast(br);
-    const buf: *linux.io_uring_buf = &bufs[(br.tail +% buffer_offset) & mask];
+/// matches `io_uring_zcrx_area_reg` in liburing
+pub const ZcrxAreaRegister = extern struct {
+    addr: u64,
+    len: u64,
+    rq_area_token: u64,
+    flags: Flags,
+    dmabuf_fd: u32,
+    __resv2: [2]u64,
+
+    pub const Flags = packed struct(u32) {
+        dmabuf: bool = false,
+        _: u31 = 0,
+    };
+};
-    buf.addr = @intFromPtr(buffer.ptr);
-    buf.len = @intCast(buffer.len);
-    buf.bid = buffer_id;
-}
+/// Argument for IORING_REGISTER_ZCRX_IFQ
+/// matches `io_uring_zcrx_ifq_reg` in liburing
+pub const ZcrxIfqRegister = extern struct {
+    if_idx: u32,
+    if_rxq: u32,
+    rq_entries: u32,
+    // FIXME: the semantics of these flags are not yet determined
+    flags: u32,
+    /// pointer to struct io_uring_zcrx_area_reg
+    area_ptr: u64,
+    /// struct io_uring_region_desc
+    region_ptr: u64,
+    offsets: ZcrxOffsets, // the kernel fills in the offsets
+    zcrx_id: u32,
+    __resv2: u32,
+    __resv: [3]u64,
+};
-/// Make `count` new buffers visible to the kernel. Called after
-/// `io_uring_buf_ring_add` has been called `count` times to fill in new buffers.
-pub fn buf_ring_advance(br: *linux.io_uring_buf_ring, count: u16) void {
-    const tail: u16 = br.tail +% count;
-    @atomicStore(u16, &br.tail, tail, .release);
-}
+pub const constants = struct {
+    /// io_uring block file commands, see IORING_OP_URING_CMD.
+    /// It's a different number space from ioctl(); it reuses the block
+    /// layer's code 0x12. It is the value of ioctl.IO(0x12, 0) at runtime.
+    pub const BLOCK_URING_CMD_DISCARD = 0x1200;
+    /// If sqe.file_index (splice_fd_in in the Zig struct) is set to this for
+    /// opcodes that instantiate a new direct descriptor (like
+    /// openat/openat2/accept), then io_uring will allocate an available
+    /// direct descriptor instead of having the application pass one in. The
+    /// picked direct descriptor will be returned in cqe.res, or -ENFILE
+    /// if the space is full.
+    pub const FILE_INDEX_ALLOC = math.maxInt(u32);
+
+    pub const CMD_MASK = 1 << 0;
+
+    pub const TIMEOUT_CLOCK_MASK = ((1 << 2) | (1 << 3));
+    pub const TIMEOUT_UPDATE_MASK = ((1 << 1) | (1 << 4));
+
+    pub const CQE_BUFFER_SHIFT = 16;
+
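+    // Illustrative: with buffer selection, the kernel returns the chosen
+    // buffer ID in the upper bits of cqe.flags; CQE_BUFFER_SHIFT recovers it:
+    //
+    //     const id: u16 = @intCast(@as(u32, @bitCast(cqe.flags)) >> constants.CQE_BUFFER_SHIFT);
+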
+    /// cqe.res for IORING_CQE_F_NOTIF if IORING_SEND_ZC_REPORT_USAGE was
+    /// requested. It should be treated as a flag; all other bits of cqe.res
+    /// should be treated as reserved!
+    pub const NOTIF_USAGE_ZC_COPIED = (1 << 31);
+
+    // Magic offsets for the application to mmap the data it needs
+    pub const OFF_SQ_RING = 0;
+    pub const OFF_CQ_RING = 0x8000000;
+    pub const OFF_SQES = 0x10000000;
+    pub const OFF_PBUF_RING = 0x80000000;
+    pub const OFF_PBUF_SHIFT = 16;
+    pub const OFF_MMAP_MASK = 0xf8000000;
+
+    /// Skip updating fd indexes set to this value in the fd table
+    pub const REGISTER_FILES_SKIP = -2;
+
+    /// SOCKET_URING_OP_TX_TIMESTAMP definitions
+    pub const TIMESTAMP_HW_SHIFT = 16;
+    /// The cqe.flags bit from which the timestamp type is stored
+    pub const TIMESTAMP_TYPE_SHIFT = (TIMESTAMP_HW_SHIFT + 1);
+    /// The cqe.flags flag signifying whether it's a hardware timestamp
+    pub const CQE_F_TSTAMP_HW = (1 << TIMESTAMP_HW_SHIFT);
+
+    /// The bit from which area id is encoded into offsets
+    pub const ZCRX_AREA_SHIFT = 48;
+    pub const ZCRX_AREA_MASK = (~((1 << ZCRX_AREA_SHIFT) - 1));
+
+    // flag added to the opcode to use a registered ring fd
+    pub const REGISTER_USE_REGISTERED_RING = 1 << 31;
+};
+
+pub const uflags = struct {
+    /// io_uring_setup() flags
+    pub const Setup = packed struct(u32) {
+        /// io_context is polled
+        iopoll: bool = false,
+        /// SQ poll thread
+        sqpoll: bool = false,
+        /// sq_thread_cpu is valid
+        sq_aff: bool = false,
+        /// app defines CQ size
+        cqsize: bool = false,
+        /// clamp SQ/CQ ring sizes
+        clamp: bool = false,
+        /// attach to existing wq
+        attach_wq: bool = false,
+        /// start with ring disabled
+        r_disabled: bool = false,
+        /// continue submit on error
+        submit_all: bool = false,
+        /// Cooperative task running. When requests complete, they often require
+        /// forcing the submitter to transition to the kernel to complete. If
+        /// this flag is set, work will be done when the task transitions
+        /// anyway, rather than force an inter-processor interrupt reschedule.
+        /// This avoids interrupting a task running in userspace, and saves an
+        /// IPI.
+        coop_taskrun: bool = false,
+        /// If COOP_TASKRUN is set, get notified if task work is available for
+        /// running and a kernel transition would be needed to run it. This sets
+        /// IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN.
+        taskrun_flag: bool = false,
+        /// SQEs are 128 byte
+        sqe128: bool = false,
+        /// CQEs are 32 byte
+        cqe32: bool = false,
+        /// Only one task is allowed to submit requests
+        single_issuer: bool = false,
+        /// Defer running task work to get events.
+        /// Rather than running bits of task work whenever the task transitions
+        /// try to do it just before it is needed.
+        defer_taskrun: bool = false,
+        /// Application provides the memory for the rings
+        no_mmap: bool = false,
+        /// Register the ring fd in itself for use with
+        /// IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index
+        /// rather than an fd.
+        registered_fd_only: bool = false,
+        /// Removes indirection through the SQ index array.
+        no_sqarray: bool = false,
+        /// Use hybrid poll in iopoll process
+        hybrid_iopoll: bool = false,
+        /// Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have
+        /// IORING_CQE_F_32 set in cqe.flags.
+        cqe_mixed: bool = false,
+        _20: u13 = 0,
+    };
+
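+    // Illustrative: `Setup` is the typed replacement for the raw flags word
+    // previously passed to IoUring.init, e.g.
+    //
+    //     var ring = try IoUring.init(64, .{ .single_issuer = true });
+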
+    /// sqe.uring_cmd_flags (rw_flags in the Zig struct).
+    /// The top 8 bits aren't available for userspace.
+    pub const Cmd = packed struct(u32) {
+        /// Use a registered buffer; pass this flag along with setting
+        /// sqe.buf_index.
+        cmd_fixed: bool = false,
+        _2: u31 = 0,
+    };
+
+    /// sqe.fsync_flags (rw_flags in the Zig struct)
+    pub const Fsync = packed struct(u32) {
+        datasync: bool = false,
+        _2: u31 = 0,
+    };
+
+    /// sqe.timeout_flags
+    pub const Timeout = packed struct(u32) {
+        timeout_abs: bool = false,
+        /// Available since Linux 5.11
+        timeout_update: bool = false,
+        /// Available since Linux 5.15
+        timeout_boottime: bool = false,
+        /// Available since Linux 5.15
+        timeout_realtime: bool = false,
+        /// Available since Linux 5.15
+        link_timeout_update: bool = false,
+        /// Available since Linux 5.16
+        timeout_etime_success: bool = false,
+        /// Available since Linux 6.4
+        timeout_multishot: bool = false,
+        _8: u25 = 0,
+    };
+
+    /// sqe.splice_flags (rw_flags in Zig Struct)
+    /// extends splice(2) flags
+    pub const Splice = packed struct(u32) {
+        _1: u31 = 0,
+        /// the last bit of __u32
+        f_fd_in_fixed: bool = false,
+    };
+
+    /// POLL_ADD flags. Note that since sqe.poll_events (rw_flags in Zig Struct)
+    /// is the flag space, the command flags for POLL_ADD are stored in sqe.len.
+    pub const Poll = packed struct(u32) {
+        /// IORING_POLL_ADD_MULTI
+        /// Multishot poll. Sets IORING_CQE_F_MORE if the poll handler will
+        /// continue to report CQEs on behalf of the same SQE.
+        add_multi: bool = false,
+        /// IORING_POLL_UPDATE_EVENTS
+        /// Update the events of an existing poll request, matching sqe.addr
+        /// as the old user_data field.
+        update_events: bool = false,
+        /// IORING_POLL_UPDATE_USER_DATA
+        /// Update the user_data of an existing poll request, matching
+        /// sqe.addr as the old user_data field.
+        update_user_data: bool = false,
+        /// IORING_POLL_LEVEL
+        /// Level triggered poll.
+        add_level: bool = false,
+        _5: u28 = 0,
+    };
+
+    /// ASYNC_CANCEL flags.
+    pub const AsyncCancel = packed struct(u32) {
+        /// IORING_ASYNC_CANCEL_ALL
+        /// Cancel all requests that match the given key
+        cancel_all: bool = false,
+        /// IORING_ASYNC_CANCEL_FD
+        /// Key off 'fd' for cancelation rather than the request 'user_data'
+        cancel_fd: bool = false,
+        /// IORING_ASYNC_CANCEL_ANY
+        /// Match any request
+        cancel_any: bool = false,
+        /// IORING_ASYNC_CANCEL_FD_FIXED
+        /// 'fd' passed in is a fixed descriptor
+        cancel_fd_fixed: bool = false,
+        /// IORING_ASYNC_CANCEL_USERDATA
+        /// Match on user_data, default for no other key
+        cancel_userdata: bool = false,
+        /// IORING_ASYNC_CANCEL_OP
+        /// Match request based on opcode
+        cancel_op: bool = false,
+        _7: u26 = 0,
+    };
+
+    /// IORING_OP_MSG_RING flags (sqe.msg_ring_flags or sqe.rw_flags in Zig Struct)
+    pub const MsgRing = packed struct(u32) {
+        /// IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring.
+        /// Not applicable for IORING_MSG_DATA, obviously. 
+        cqe_skip: bool = false,
+        /// Pass through the flags from sqe.file_index to cqe.flags
+        flags_pass: bool = false,
+        _3: u30 = 0,
+    };
+
+    /// IORING_OP_FIXED_FD_INSTALL flags (sqe.install_fd_flags or sqe.rw_flags in Zig Struct)
+    pub const FixedFd = packed struct(u32) {
+        /// IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC
+        no_cloexec: bool = false,
+        _2: u31 = 0,
+    };
+
+    /// IORING_OP_NOP flags (sqe.nop_flags or sqe.rw_flags in Zig Struct)
+    pub const Nop = packed struct(u32) {
+        /// IORING_NOP_INJECT_RESULT Inject result from sqe.result
+        inject_result: bool = false,
+        _2: u4 = 0,
+        cqe32: bool = false,
+        _7: u26 = 0,
+    };
+
+    /// io_uring_enter(2) flags
+    pub const Enter = packed struct(u32) {
+        getevents: bool = false,
+        sq_wakeup: bool = false,
+        sq_wait: bool = false,
+        ext_arg: bool = false,
+        registered_ring: bool = false,
+        // Flags added by more recent kernels:
+        abs_timer: bool = false,
+        ext_arg_reg: bool = false,
+        no_iowait: bool = false,
+        _9: u24 = 0,
+
+        /// Returns true if only `Init` flags that are usable in `Enter` are set.
+        pub fn valid_init_flags(self: Enter) bool {
+            const valid_flags: u32 = @bitCast(Enter{ .registered_ring = true, .no_iowait = true });
+            const flags: u32 = @bitCast(self);
+            // check if any invalid flags are set
+            return (flags & ~valid_flags) == 0;
+        }
+
+        pub fn empty(flags: Enter) bool {
+            return @as(u32, @bitCast(flags)) == 0;
+        }
+    };
+
+    /// matches INT_FLAG_* in liburing
+    pub const Init = packed struct(u8) {
+        reg_reg_ring: bool = false,
+        app_mem: bool = false,
+        cq_enter: bool = false,
+        _4: u1 = 0,
+        /// matches `registered_ring` flag in `Enter`
+        reg_ring: bool = false,
+        _6: u2 = 0,
+        /// matches `no_iowait` flag in `Enter`
+        no_iowait: bool = false,
+
+        /// Return all valid `Enter` flags set in `Init`
+        pub fn enter_flags(self: Init) Enter {
+            const valid_flags: u8 = @bitCast(Init{ .reg_ring = true, .no_iowait = true });
+            const flags: u8 = @bitCast(self);
+            return @bitCast(@as(u32, @intCast(flags & valid_flags)));
+        }
+    };
+
+    /// io_uring_params.features flags
+    pub const Features = packed struct(u32) {
+        single_mmap: bool = false,
+        nodrop: bool = false,
+        submit_stable: bool = false,
+        rw_cur_pos: bool = false,
+        cur_personality: bool = false,
+        fast_poll: bool = false,
+        poll_32bits: bool = false,
+        sqpoll_nonfixed: bool = false,
+        ext_arg: bool = false,
+        native_workers: bool = false,
+        rsrc_tags: bool = false,
+        cqe_skip: bool = false,
+        linked_file: bool = false,
+        // Feature flags added by more recent kernels:
+        reg_reg_ring: bool = false,
+        recvsend_bundle: bool = false,
+        min_timeout: bool = false,
+        rw_attr: bool = false,
+        no_iowait: bool = false,
+        _19: u14 = 0,
+
+        pub fn empty(features: Features) bool {
+            return @as(u32, @bitCast(features)) == 0;
+        }
+    };
+};
+
+/// `io_uring_register(2)` opcodes and arguments
+/// matches `io_uring_register_op` in liburing
+pub const RegisterOp = enum(u8) {
+    register_buffers,
+    unregister_buffers,
+    register_files,
+    unregister_files,
+    register_eventfd,
+    unregister_eventfd,
+    register_files_update,
+    register_eventfd_async,
+    register_probe,
+    register_personality,
+    unregister_personality,
+    register_restrictions,
+    register_enable_rings,
+
+    // extended with tagging
+    register_files2,
+    register_files_update2,
+    register_buffers2,
+    register_buffers_update,
+
+    // set/clear io-wq thread affinities
+    register_iowq_aff,
+    unregister_iowq_aff,
+
+    // set/get max number of io-wq workers
+    register_iowq_max_workers,
+
+    // register/unregister io_uring fd with the ring
+    register_ring_fds,
+    unregister_ring_fds,
+
+    // register ring 
based provide buffer group + register_pbuf_ring, + unregister_pbuf_ring, + + // sync cancelation API + register_sync_cancel, + + // register a range of fixed file slots for automatic slot allocation + register_file_alloc_range, + + // return status information for a buffer group + register_pbuf_status, + + // set/clear busy poll settings + register_napi, + unregister_napi, + + register_clock, + + // clone registered buffers from source ring to current ring + register_clone_buffers, + + // send MSG_RING without having a ring + register_send_msg_ring, + + // register a netdev hw rx queue for zerocopy + register_zcrx_ifq, + + // resize CQ ring + register_resize_rings, + + register_mem_region, + + // query various aspects of io_uring, see linux/io_uring/query.h + register_query, + + _, +}; + +/// io-wq worker categories +/// matches `io_wq_type` in liburing +pub const IoWqCategory = enum(u8) { + bound, + unbound, + _, +}; + +/// matches `io_uring_socket_op` in liburing +pub const SocketOp = enum(u16) { + siocin, + siocoutq, + getsockopt, + setsockopt, + tx_timestamp, + _, +}; + +/// io_uring_restriction.opcode values +/// matches `io_uring_register_restriction_op` in liburing +pub const RestrictionOp = enum(u16) { + /// Allow an io_uring_register(2) opcode + register_op = 0, + /// Allow an sqe opcode + sqe_op = 1, + /// Allow sqe flags + sqe_flags_allowed = 2, + /// Require sqe flags (these flags must be set on each submission) + sqe_flags_required = 3, + + _, +}; + +/// IORING_OP_MSG_RING command types, stored in sqe.addr +/// matches `io_uring_msg_ring_flags` in liburing +pub const MsgRingCmd = enum { + /// pass sqe->len as 'res' and off as user_data + data, + /// send a registered fd to another ring + send_fd, +}; + +/// matches `io_uring_op` in liburing +pub const Op = enum(u8) { + nop, + readv, + writev, + fsync, + read_fixed, + write_fixed, + poll_add, + poll_remove, + sync_file_range, + sendmsg, + recvmsg, + timeout, + timeout_remove, + accept, + async_cancel, + link_timeout, + connect, + fallocate, + openat, + close, + files_update, + statx, + read, + write, + fadvise, + madvise, + send, + recv, + epoll_ctl, + openat2, + splice, + provide_buffers, + remove_buffers, + tee, + shutdown, + renameat, + unlinkat, + mkdirat, + symlinkat, + linkat, + msg_ring, + fsetxattr, + setxattr, + fgetxattr, + getxattr, + socket, + uring_cmd, + send_zc, + sendmsg_zc, + read_multishot, + waitid, + futex_wait, + futex_wake, + futex_waitv, + fixed_fd_install, + ftruncate, + bind, + listen, + recv_zc, + epoll_wait, + readv_fixed, + writev_fixed, + pipe, + + _, +}; test "structs/offsets/entries" { if (!is_linux) return error.SkipZigTest; - try testing.expectEqual(@as(usize, 120), @sizeOf(linux.io_uring_params)); - try testing.expectEqual(@as(usize, 64), @sizeOf(linux.io_uring_sqe)); - try testing.expectEqual(@as(usize, 16), @sizeOf(linux.io_uring_cqe)); + try testing.expectEqual(120, @sizeOf(Params)); + try testing.expectEqual(64, @sizeOf(Sqe)); + try testing.expectEqual(16, @sizeOf(Cqe)); - try testing.expectEqual(0, linux.IORING_OFF_SQ_RING); - try testing.expectEqual(0x8000000, linux.IORING_OFF_CQ_RING); - try testing.expectEqual(0x10000000, linux.IORING_OFF_SQES); + try testing.expectEqual(0, constants.OFF_SQ_RING); + try testing.expectEqual(0x8000000, constants.OFF_CQ_RING); + try testing.expectEqual(0x10000000, constants.OFF_SQES); - try testing.expectError(error.EntriesZero, IoUring.init(0, 0)); - try testing.expectError(error.EntriesNotPowerOfTwo, IoUring.init(3, 0)); + try 
testing.expectError(error.EntriesZero, IoUring.init(0, .{})); + try testing.expectError(error.EntriesNotPowerOfTwo, IoUring.init(3, .{})); } test "nop" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer { ring.deinit(); - testing.expectEqual(@as(linux.fd_t, -1), ring.fd) catch @panic("test failed"); + testing.expectEqual(-1, ring.fd) catch @panic("test failed"); } const sqe = try ring.nop(0xaaaaaaaa); - try testing.expectEqual(linux.io_uring_sqe{ - .opcode = .NOP, - .flags = 0, - .ioprio = 0, + try testing.expectEqual(Sqe{ + .opcode = .nop, + .flags = .{}, + .ioprio = .init_empty(), .fd = 0, .off = 0, .addr = 0, @@ -1894,46 +4811,46 @@ test "nop" { .resv = 0, }, sqe.*); - try testing.expectEqual(@as(u32, 0), ring.sq.sqe_head); - try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail); - try testing.expectEqual(@as(u32, 0), ring.sq.tail.*); - try testing.expectEqual(@as(u32, 0), ring.cq.head.*); - try testing.expectEqual(@as(u32, 1), ring.sq_ready()); - try testing.expectEqual(@as(u32, 0), ring.cq_ready()); - - try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(@as(u32, 1), ring.sq.sqe_head); - try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail); - try testing.expectEqual(@as(u32, 1), ring.sq.tail.*); - try testing.expectEqual(@as(u32, 0), ring.cq.head.*); - try testing.expectEqual(@as(u32, 0), ring.sq_ready()); - - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(0, ring.sq.sqe_head); + try testing.expectEqual(1, ring.sq.sqe_tail); + try testing.expectEqual(0, ring.sq.tail.*); + try testing.expectEqual(0, ring.cq.head.*); + try testing.expectEqual(1, ring.sq_ready()); + try testing.expectEqual(0, ring.cq_ready()); + + try testing.expectEqual(1, try ring.submit()); + try testing.expectEqual(1, ring.sq.sqe_head); + try testing.expectEqual(1, ring.sq.sqe_tail); + try testing.expectEqual(1, ring.sq.tail.*); + try testing.expectEqual(0, ring.cq.head.*); + try testing.expectEqual(0, ring.sq_ready()); + + try testing.expectEqual(Cqe{ .user_data = 0xaaaaaaaa, .res = 0, - .flags = 0, + .flags = .{}, }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 1), ring.cq.head.*); - try testing.expectEqual(@as(u32, 0), ring.cq_ready()); + try testing.expectEqual(1, ring.cq.head.*); + try testing.expectEqual(0, ring.cq_ready()); const sqe_barrier = try ring.nop(0xbbbbbbbb); - sqe_barrier.flags |= linux.IOSQE_IO_DRAIN; - try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(linux.io_uring_cqe{ + sqe_barrier.flags.io_drain = true; + try testing.expectEqual(1, try ring.submit()); + try testing.expectEqual(Cqe{ .user_data = 0xbbbbbbbb, .res = 0, - .flags = 0, + .flags = .{}, }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 2), ring.sq.sqe_head); - try testing.expectEqual(@as(u32, 2), ring.sq.sqe_tail); - try testing.expectEqual(@as(u32, 2), ring.sq.tail.*); - try testing.expectEqual(@as(u32, 2), ring.cq.head.*); + try testing.expectEqual(2, ring.sq.sqe_head); + try testing.expectEqual(2, ring.sq.sqe_tail); + try testing.expectEqual(2, ring.sq.tail.*); + try testing.expectEqual(2, ring.cq.head.*); } test "readv" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { 
error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -1949,25 +4866,28 @@ test "readv" { // https://github.com/torvalds/linux/blob/v5.4/fs/io_uring.c#L3119-L3124 vs // https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L6687-L6691 // We therefore avoid stressing sparse fd sets here: - var registered_fds = [_]linux.fd_t{0} ** 1; + var registered_fds: [1]linux.fd_t = @splat(0); const fd_index = 0; registered_fds[fd_index] = fd; try ring.register_files(registered_fds[0..]); - var buffer = [_]u8{42} ** 128; - var iovecs = [_]posix.iovec{posix.iovec{ .base = &buffer, .len = buffer.len }}; + var buffer: [128]u8 = @splat(42); + var iovecs: [1]posix.iovec = .{ + .{ .base = &buffer, .len = buffer.len }, + }; const sqe = try ring.read(0xcccccccc, fd_index, .{ .iovecs = iovecs[0..] }, 0); - try testing.expectEqual(linux.IORING_OP.READV, sqe.opcode); - sqe.flags |= linux.IOSQE_FIXED_FILE; + try testing.expectEqual(Op.readv, sqe.opcode); + sqe.flags.fixed_file = true; try testing.expectError(error.SubmissionQueueFull, ring.nop(0)); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(1, try ring.submit()); + try testing.expectEqual(Cqe{ .user_data = 0xcccccccc, .res = buffer.len, - .flags = 0, + .flags = .{}, }, try ring.copy_cqe()); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); + const empty: [buffer.len]u8 = @splat(0); + try testing.expectEqualSlices(u8, empty[0..], buffer[0..]); try ring.unregister_files(); } @@ -1975,7 +4895,7 @@ test "readv" { test "writev/fsync/readv" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(4, 0) catch |err| switch (err) { + var ring = IoUring.init(4, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -1990,54 +4910,55 @@ test "writev/fsync/readv" { defer file.close(); const fd = file.handle; - const buffer_write = [_]u8{42} ** 128; - const iovecs_write = [_]posix.iovec_const{ - posix.iovec_const{ .base = &buffer_write, .len = buffer_write.len }, + const buffer_write: [128]u8 = @splat(42); + const iovecs_write: [1]posix.iovec_const = .{ + .{ .base = &buffer_write, .len = buffer_write.len }, }; - var buffer_read = [_]u8{0} ** 128; - var iovecs_read = [_]posix.iovec{ - posix.iovec{ .base = &buffer_read, .len = buffer_read.len }, + + var buffer_read: [128]u8 = @splat(0); + var iovecs_read: [1]posix.iovec = .{ + .{ .base = &buffer_read, .len = buffer_read.len }, }; - const sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17); - try testing.expectEqual(linux.IORING_OP.WRITEV, sqe_writev.opcode); - try testing.expectEqual(@as(u64, 17), sqe_writev.off); - sqe_writev.flags |= linux.IOSQE_IO_LINK; + const sqe_writev = try ring.write(0xdddddddd, fd, .{ .iovecs = iovecs_write[0..] }, 17); + try testing.expectEqual(Op.writev, sqe_writev.opcode); + try testing.expectEqual(17, sqe_writev.off); + sqe_writev.link_next(); - const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, 0); - try testing.expectEqual(linux.IORING_OP.FSYNC, sqe_fsync.opcode); + const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, .{}); + try testing.expectEqual(.fsync, sqe_fsync.opcode); try testing.expectEqual(fd, sqe_fsync.fd); - sqe_fsync.flags |= linux.IOSQE_IO_LINK; + sqe_fsync.link_next(); const sqe_readv = try ring.read(0xffffffff, fd, .{ .iovecs = iovecs_read[0..] 
}, 17); - try testing.expectEqual(linux.IORING_OP.READV, sqe_readv.opcode); - try testing.expectEqual(@as(u64, 17), sqe_readv.off); + try testing.expectEqual(Op.readv, sqe_readv.opcode); + try testing.expectEqual(17, sqe_readv.off); - try testing.expectEqual(@as(u32, 3), ring.sq_ready()); - try testing.expectEqual(@as(u32, 3), try ring.submit_and_wait(3)); - try testing.expectEqual(@as(u32, 0), ring.sq_ready()); - try testing.expectEqual(@as(u32, 3), ring.cq_ready()); + try testing.expectEqual(3, ring.sq_ready()); + try testing.expectEqual(3, try ring.submit_and_wait(3)); + try testing.expectEqual(0, ring.sq_ready()); + try testing.expectEqual(3, ring.cq_ready()); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xdddddddd, .res = buffer_write.len, - .flags = 0, + .flags = .{}, }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 2), ring.cq_ready()); + try testing.expectEqual(2, ring.cq_ready()); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xeeeeeeee, .res = 0, - .flags = 0, + .flags = .{}, }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 1), ring.cq_ready()); + try testing.expectEqual(1, ring.cq_ready()); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xffffffff, .res = buffer_read.len, - .flags = 0, + .flags = .{}, }, try ring.copy_cqe()); - try testing.expectEqual(@as(u32, 0), ring.cq_ready()); + try testing.expectEqual(0, ring.cq_ready()); try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); } @@ -2045,7 +4966,7 @@ test "writev/fsync/readv" { test "write/read" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(2, 0) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -2059,16 +4980,16 @@ test "write/read" { defer file.close(); const fd = file.handle; - const buffer_write = [_]u8{97} ** 20; - var buffer_read = [_]u8{98} ** 20; - const sqe_write = try ring.write(0x11111111, fd, buffer_write[0..], 10); - try testing.expectEqual(linux.IORING_OP.WRITE, sqe_write.opcode); - try testing.expectEqual(@as(u64, 10), sqe_write.off); - sqe_write.flags |= linux.IOSQE_IO_LINK; + const buffer_write: [20]u8 = @splat(97); + var buffer_read: [20]u8 = @splat(98); + const sqe_write = try ring.write(0x11111111, fd, .{ .buffer = buffer_write[0..] }, 10); + try testing.expectEqual(Op.write, sqe_write.opcode); + try testing.expectEqual(10, sqe_write.off); + sqe_write.flags.io_link = true; const sqe_read = try ring.read(0x22222222, fd, .{ .buffer = buffer_read[0..] 
}, 10); - try testing.expectEqual(linux.IORING_OP.READ, sqe_read.opcode); - try testing.expectEqual(@as(u64, 10), sqe_read.off); - try testing.expectEqual(@as(u32, 2), try ring.submit()); + try testing.expectEqual(Op.read, sqe_read.opcode); + try testing.expectEqual(10, sqe_read.off); + try testing.expectEqual(2, try ring.submit()); const cqe_write = try ring.copy_cqe(); const cqe_read = try ring.copy_cqe(); @@ -2076,15 +4997,15 @@ test "write/read" { // https://lwn.net/Articles/809820/ if (cqe_write.err() == .INVAL) return error.SkipZigTest; if (cqe_read.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x11111111, .res = buffer_write.len, - .flags = 0, + .flags = .{}, }, cqe_write); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x22222222, .res = buffer_read.len, - .flags = 0, + .flags = .{}, }, cqe_read); try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); } @@ -2092,7 +5013,7 @@ test "write/read" { test "splice/read" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(4, 0) catch |err| switch (err) { + var ring = IoUring.init(4, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -2110,29 +5031,29 @@ test "splice/read" { defer file_dst.close(); const fd_dst = file_dst.handle; - const buffer_write = [_]u8{97} ** 20; - var buffer_read = [_]u8{98} ** 20; + const buffer_write: [20]u8 = @splat(97); + var buffer_read: [20]u8 = @splat(98); _ = try file_src.write(&buffer_write); const fds = try posix.pipe(); - const pipe_offset: u64 = std.math.maxInt(u64); + const pipe_offset: u64 = math.maxInt(u64); - const sqe_splice_to_pipe = try ring.splice(0x11111111, fd_src, 0, fds[1], pipe_offset, buffer_write.len); - try testing.expectEqual(linux.IORING_OP.SPLICE, sqe_splice_to_pipe.opcode); - try testing.expectEqual(@as(u64, 0), sqe_splice_to_pipe.addr); + const sqe_splice_to_pipe = try ring.splice(0x11111111, fd_src, 0, fds[1], pipe_offset, buffer_write.len, .{}); + try testing.expectEqual(Op.splice, sqe_splice_to_pipe.opcode); + try testing.expectEqual(0, sqe_splice_to_pipe.addr); try testing.expectEqual(pipe_offset, sqe_splice_to_pipe.off); - sqe_splice_to_pipe.flags |= linux.IOSQE_IO_LINK; + sqe_splice_to_pipe.link_next(); - const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len); - try testing.expectEqual(linux.IORING_OP.SPLICE, sqe_splice_from_pipe.opcode); + const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len, .{}); + try testing.expectEqual(Op.splice, sqe_splice_from_pipe.opcode); try testing.expectEqual(pipe_offset, sqe_splice_from_pipe.addr); - try testing.expectEqual(@as(u64, 10), sqe_splice_from_pipe.off); - sqe_splice_from_pipe.flags |= linux.IOSQE_IO_LINK; + try testing.expectEqual(10, sqe_splice_from_pipe.off); + sqe_splice_from_pipe.link_next(); const sqe_read = try ring.read(0x33333333, fd_dst, .{ .buffer = buffer_read[0..] 
}, 10); - try testing.expectEqual(linux.IORING_OP.READ, sqe_read.opcode); - try testing.expectEqual(@as(u64, 10), sqe_read.off); - try testing.expectEqual(@as(u32, 3), try ring.submit()); + try testing.expectEqual(Op.read, sqe_read.opcode); + try testing.expectEqual(10, sqe_read.off); + try testing.expectEqual(3, try ring.submit()); const cqe_splice_to_pipe = try ring.copy_cqe(); const cqe_splice_from_pipe = try ring.copy_cqe(); @@ -2142,20 +5063,20 @@ test "splice/read" { if (cqe_splice_to_pipe.err() == .INVAL) return error.SkipZigTest; if (cqe_splice_from_pipe.err() == .INVAL) return error.SkipZigTest; if (cqe_read.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x11111111, .res = buffer_write.len, - .flags = 0, + .flags = .{}, }, cqe_splice_to_pipe); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x22222222, .res = buffer_write.len, - .flags = 0, + .flags = .{}, }, cqe_splice_from_pipe); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x33333333, .res = buffer_read.len, - .flags = 0, + .flags = .{}, }, cqe_read); try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]); } @@ -2163,7 +5084,7 @@ test "splice/read" { test "write_fixed/read_fixed" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(2, 0) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -2183,7 +5104,7 @@ test "write_fixed/read_fixed" { @memset(&raw_buffers[0], 'z'); raw_buffers[0][0.."foobar".len].* = "foobar".*; - var buffers = [2]posix.iovec{ + var buffers: [2]posix.iovec = .{ .{ .base = &raw_buffers[0], .len = raw_buffers[0].len }, .{ .base = &raw_buffers[1], .len = raw_buffers[1].len }, }; @@ -2195,29 +5116,29 @@ test "write_fixed/read_fixed" { else => |e| return e, }; - const sqe_write = try ring.write_fixed(0x45454545, fd, &buffers[0], 3, 0); - try testing.expectEqual(linux.IORING_OP.WRITE_FIXED, sqe_write.opcode); - try testing.expectEqual(@as(u64, 3), sqe_write.off); - sqe_write.flags |= linux.IOSQE_IO_LINK; + const sqe_write = try ring.write_fixed(0x45454545, fd, .{ .buffer = raw_buffers[0][0..] }, 3, 0); + try testing.expectEqual(Op.write_fixed, sqe_write.opcode); + try testing.expectEqual(3, sqe_write.off); + sqe_write.link_next(); - const sqe_read = try ring.read_fixed(0x12121212, fd, &buffers[1], 0, 1); - try testing.expectEqual(linux.IORING_OP.READ_FIXED, sqe_read.opcode); - try testing.expectEqual(@as(u64, 0), sqe_read.off); + const sqe_read = try ring.read_fixed(0x12121212, fd, .{ .buffer = raw_buffers[1][0..] 
}, 0, 1); + try testing.expectEqual(Op.read_fixed, sqe_read.opcode); + try testing.expectEqual(0, sqe_read.off); - try testing.expectEqual(@as(u32, 2), try ring.submit()); + try testing.expectEqual(2, try ring.submit()); const cqe_write = try ring.copy_cqe(); const cqe_read = try ring.copy_cqe(); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x45454545, - .res = @as(i32, @intCast(buffers[0].len)), - .flags = 0, + .res = @intCast(buffers[0].len), + .flags = .{}, }, cqe_write); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x12121212, - .res = @as(i32, @intCast(buffers[1].len)), - .flags = 0, + .res = @intCast(buffers[1].len), + .flags = .{}, }, cqe_read); try testing.expectEqualSlices(u8, "\x00\x00\x00", buffers[1].base[0..3]); @@ -2228,7 +5149,7 @@ test "write_fixed/read_fixed" { test "openat" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -2248,12 +5169,12 @@ test "openat" { } else @intFromPtr(path); const flags: linux.O = .{ .CLOEXEC = true, .ACCMODE = .RDWR, .CREAT = true }; - const mode: posix.mode_t = 0o666; + const mode: linux.mode_t = 0o666; const sqe_openat = try ring.openat(0x33333333, tmp.dir.fd, path, flags, mode); - try testing.expectEqual(linux.io_uring_sqe{ - .opcode = .OPENAT, - .flags = 0, - .ioprio = 0, + try testing.expectEqual(Sqe{ + .opcode = .openat, + .flags = .{}, + .ioprio = .init_empty(), .fd = tmp.dir.fd, .off = 0, .addr = path_addr, @@ -2266,15 +5187,15 @@ test "openat" { .addr3 = 0, .resv = 0, }, sqe_openat.*); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe_openat = try ring.copy_cqe(); - try testing.expectEqual(@as(u64, 0x33333333), cqe_openat.user_data); + try testing.expectEqual(0x33333333, cqe_openat.user_data); if (cqe_openat.err() == .INVAL) return error.SkipZigTest; if (cqe_openat.err() == .BADF) return error.SkipZigTest; if (cqe_openat.res <= 0) std.debug.print("\ncqe_openat.res={}\n", .{cqe_openat.res}); try testing.expect(cqe_openat.res > 0); - try testing.expectEqual(@as(u32, 0), cqe_openat.flags); + try testing.expectEqual(@as(Cqe.Flags, .{}), cqe_openat.flags); posix.close(cqe_openat.res); } @@ -2282,7 +5203,7 @@ test "openat" { test "close" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -2297,23 +5218,23 @@ test "close" { errdefer file.close(); const sqe_close = try ring.close(0x44444444, file.handle); - try testing.expectEqual(linux.IORING_OP.CLOSE, sqe_close.opcode); + try testing.expectEqual(Op.close, sqe_close.opcode); try testing.expectEqual(file.handle, sqe_close.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe_close = try ring.copy_cqe(); if (cqe_close.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x44444444, .res = 0, - .flags = 0, + .flags = .{}, }, cqe_close); } test "accept/connect/send/recv" { if (!is_linux) return error.SkipZigTest; - var ring = 
IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -2326,26 +5247,28 @@ test "accept/connect/send/recv" { const buffer_send = [_]u8{ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 }; var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; - const sqe_send = try ring.send(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0); - sqe_send.flags |= linux.IOSQE_IO_LINK; - _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); - try testing.expectEqual(@as(u32, 2), try ring.submit()); + const sqe_send = try ring.send(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], .{}); + sqe_send.link_next(); + + _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, .{}); + try testing.expectEqual(2, try ring.submit()); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xeeeeeeee, .res = buffer_send.len, - .flags = 0, + .flags = .{}, }, cqe_send); const cqe_recv = try ring.copy_cqe(); if (cqe_recv.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xffffffff, .res = buffer_recv.len, - // ignore IORING_CQE_F_SOCK_NONEMPTY since it is only set on some systems - .flags = cqe_recv.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, + // Mirror IORING_CQE_F_SOCK_NONEMPTY, which is only set on some systems, + // so the comparison ignores it while every other flag must still be unset + .flags = .{ .f_sock_nonempty = cqe_recv.flags.f_sock_nonempty }, }, cqe_recv); try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); @@ -2354,7 +5277,7 @@ test "accept/connect/send/recv" { test "sendmsg/recvmsg" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(2, 0) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -2368,8 +5291,8 @@ test "sendmsg/recvmsg" { const server = try posix.socket(address_server.family, posix.SOCK.DGRAM, 0); defer posix.close(server); - try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEPORT, &mem.toBytes(@as(c_int, 1))); - try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); + try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEPORT, &mem.toBytes(@as(u32, 1))); + try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(u32, 1))); try posix.bind(server, addrAny(&address_server), @sizeOf(linux.sockaddr.in)); // set address_server to the OS-chosen IP/port.
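// The pattern running through all of these hunks: bare integer flag words
// become typed packed structs, so a literal `0` becomes `.{}` and a
// `flags |= CONSTANT` bit-OR becomes a named field write. A minimal,
// self-contained sketch of the new call style, assuming the `IoUring` API
// as declared earlier in this diff (a sketch, not part of the patch):
//
//     const std = @import("std");
//     const linux = std.os.linux;
//
//     test "typed flags sketch" {
//         var ring = try linux.IoUring.init(2, .{}); // was: IoUring.init(2, 0)
//         defer ring.deinit();
//         const sqe = try ring.nop(0x1);
//         sqe.flags.io_link = true; // was: sqe.flags |= linux.IOSQE_IO_LINK;
//         _ = try ring.nop(0x2);
//         _ = try ring.submit();
//         const cqe = try ring.copy_cqe(); // linked ops complete in order
//         try std.testing.expectEqual(0x1, cqe.user_data);
//     }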
@@ -2379,9 +5302,9 @@ test "sendmsg/recvmsg" { const client = try posix.socket(address_server.family, posix.SOCK.DGRAM, 0); defer posix.close(client); - const buffer_send = [_]u8{42} ** 128; - const iovecs_send = [_]posix.iovec_const{ - posix.iovec_const{ .base = &buffer_send, .len = buffer_send.len }, + const buffer_send: [128]u8 = @splat(42); + const iovecs_send: [1]posix.iovec_const = .{ + .{ .base = &buffer_send, .len = buffer_send.len }, }; const msg_send: linux.msghdr_const = .{ .name = addrAny(&address_server), @@ -2392,14 +5315,14 @@ test "sendmsg/recvmsg" { .controllen = 0, .flags = 0, }; - const sqe_sendmsg = try ring.sendmsg(0x11111111, client, &msg_send, 0); - sqe_sendmsg.flags |= linux.IOSQE_IO_LINK; - try testing.expectEqual(linux.IORING_OP.SENDMSG, sqe_sendmsg.opcode); + const sqe_sendmsg = try ring.sendmsg(0x11111111, client, &msg_send, .{}); + sqe_sendmsg.flags.io_link = true; + try testing.expectEqual(Op.sendmsg, sqe_sendmsg.opcode); try testing.expectEqual(client, sqe_sendmsg.fd); - var buffer_recv = [_]u8{0} ** 128; - var iovecs_recv = [_]posix.iovec{ - posix.iovec{ .base = &buffer_recv, .len = buffer_recv.len }, + var buffer_recv: [128]u8 = @splat(0); + var iovecs_recv: [1]posix.iovec = .{ + .{ .base = &buffer_recv, .len = buffer_recv.len }, }; var address_recv: linux.sockaddr.in = .{ .port = 0, @@ -2414,30 +5337,30 @@ test "sendmsg/recvmsg" { .controllen = 0, .flags = 0, }; - const sqe_recvmsg = try ring.recvmsg(0x22222222, server, &msg_recv, 0); - try testing.expectEqual(linux.IORING_OP.RECVMSG, sqe_recvmsg.opcode); + const sqe_recvmsg = try ring.recvmsg(0x22222222, server, &msg_recv, .{}); + try testing.expectEqual(Op.recvmsg, sqe_recvmsg.opcode); try testing.expectEqual(server, sqe_recvmsg.fd); - try testing.expectEqual(@as(u32, 2), ring.sq_ready()); - try testing.expectEqual(@as(u32, 2), try ring.submit_and_wait(2)); - try testing.expectEqual(@as(u32, 0), ring.sq_ready()); - try testing.expectEqual(@as(u32, 2), ring.cq_ready()); + try testing.expectEqual(2, ring.sq_ready()); + try testing.expectEqual(2, try ring.submit_and_wait(2)); + try testing.expectEqual(0, ring.sq_ready()); + try testing.expectEqual(2, ring.cq_ready()); const cqe_sendmsg = try ring.copy_cqe(); if (cqe_sendmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x11111111, .res = buffer_send.len, - .flags = 0, + .flags = .{}, }, cqe_sendmsg); const cqe_recvmsg = try ring.copy_cqe(); if (cqe_recvmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x22222222, .res = buffer_recv.len, // ignore IORING_CQE_F_SOCK_NONEMPTY since it is set non-deterministically - .flags = cqe_recvmsg.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, + .flags = .{ .f_sock_nonempty = cqe_recvmsg.flags.f_sock_nonempty }, }, cqe_recvmsg); try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); @@ -2445,42 +5368,41 @@ test "sendmsg/recvmsg" { test "timeout (after a relative time)" { if (!is_linux) return error.SkipZigTest; + const io = std.testing.io; - const io = testing.io; - - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - const ms = 10; - const margin = 5; - 
const ts: linux.kernel_timespec = .{ .sec = 0, .nsec = ms * 1000000 }; + const ms = 5; + const ts: linux.kernel_timespec = .{ .sec = 0, .nsec = ms * 1_000_000 }; const started = try std.Io.Clock.awake.now(io); - const sqe = try ring.timeout(0x55555555, &ts, 0, 0); - try testing.expectEqual(linux.IORING_OP.TIMEOUT, sqe.opcode); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + const sqe = try ring.timeout(0x55555555, &ts, 0, .{}); + try testing.expectEqual(Op.timeout, sqe.opcode); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); const stopped = try std.Io.Clock.awake.now(io); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x55555555, .res = -@as(i32, @intFromEnum(linux.E.TIME)), - .flags = 0, + .flags = .{}, }, cqe); // Tests should not depend on timings: skip test if outside margin. + const ms_margin = 5; const ms_elapsed = started.durationTo(stopped).toMilliseconds(); - if (ms_elapsed > margin) return error.SkipZigTest; + if (ms_elapsed > ms_margin) return error.SkipZigTest; } test "timeout (after a number of completions)" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(2, 0) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -2489,31 +5411,31 @@ test "timeout (after a number of completions)" { const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 }; const count_completions: u64 = 1; - const sqe_timeout = try ring.timeout(0x66666666, &ts, count_completions, 0); - try testing.expectEqual(linux.IORING_OP.TIMEOUT, sqe_timeout.opcode); + const sqe_timeout = try ring.timeout(0x66666666, &ts, count_completions, .{}); + try testing.expectEqual(Op.timeout, sqe_timeout.opcode); try testing.expectEqual(count_completions, sqe_timeout.off); _ = try ring.nop(0x77777777); - try testing.expectEqual(@as(u32, 2), try ring.submit()); + try testing.expectEqual(2, try ring.submit()); const cqe_nop = try ring.copy_cqe(); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x77777777, .res = 0, - .flags = 0, + .flags = .{}, }, cqe_nop); const cqe_timeout = try ring.copy_cqe(); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x66666666, .res = 0, - .flags = 0, + .flags = .{}, }, cqe_timeout); } test "timeout_remove" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(2, 0) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -2521,22 +5443,22 @@ test "timeout_remove" { defer ring.deinit(); const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 }; - const sqe_timeout = try ring.timeout(0x88888888, &ts, 0, 0); - try testing.expectEqual(linux.IORING_OP.TIMEOUT, sqe_timeout.opcode); - try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout.user_data); + const sqe_timeout = try ring.timeout(0x88888888, &ts, 0, .{}); + try testing.expectEqual(Op.timeout, sqe_timeout.opcode); + try testing.expectEqual(0x88888888, sqe_timeout.user_data); - const sqe_timeout_remove = try ring.timeout_remove(0x99999999, 0x88888888, 0); - try testing.expectEqual(linux.IORING_OP.TIMEOUT_REMOVE, sqe_timeout_remove.opcode); - try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout_remove.addr); - try 
testing.expectEqual(@as(u64, 0x99999999), sqe_timeout_remove.user_data); + const sqe_timeout_remove = try ring.timeout_remove(0x99999999, 0x88888888, .{}); + try testing.expectEqual(Op.timeout_remove, sqe_timeout_remove.opcode); + try testing.expectEqual(0x88888888, sqe_timeout_remove.addr); + try testing.expectEqual(0x99999999, sqe_timeout_remove.user_data); - try testing.expectEqual(@as(u32, 2), try ring.submit()); + try testing.expectEqual(2, try ring.submit()); // The order in which the CQEs arrive is not clearly documented and it changed with kernel 5.18: // * kernel 5.10 gives user data 0x88888888 first, 0x99999999 second // * kernel 5.18 gives user data 0x99999999 first, 0x88888888 second - var cqes: [2]linux.io_uring_cqe = undefined; + var cqes: [2]Cqe = undefined; cqes[0] = try ring.copy_cqe(); cqes[1] = try ring.copy_cqe(); @@ -2547,7 +5469,7 @@ test "timeout_remove" { // We don't want to skip this test for newer kernels. if (cqe.user_data == 0x99999999 and cqe.err() == .BADF and - (ring.features & linux.IORING_FEAT_RW_CUR_POS) == 0) + (!ring.features.rw_cur_pos)) { return error.SkipZigTest; } @@ -2555,16 +5477,16 @@ test "timeout_remove" { try testing.expect(cqe.user_data == 0x88888888 or cqe.user_data == 0x99999999); if (cqe.user_data == 0x88888888) { - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x88888888, .res = -@as(i32, @intFromEnum(linux.E.CANCELED)), - .flags = 0, + .flags = .{}, }, cqe); } else if (cqe.user_data == 0x99999999) { - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x99999999, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); } } @@ -2573,7 +5495,7 @@ test "timeout_remove" { test "accept/connect/recv/link_timeout" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -2585,14 +5507,14 @@ test "accept/connect/recv/link_timeout" { var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; - const sqe_recv = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); - sqe_recv.flags |= linux.IOSQE_IO_LINK; + const sqe_recv = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..]
}, .{}); + sqe_recv.link_next(); const ts = linux.kernel_timespec{ .sec = 0, .nsec = 1000000 }; - _ = try ring.link_timeout(0x22222222, &ts, 0); + _ = try ring.link_timeout(0x22222222, &ts, .{}); const nr_wait = try ring.submit(); - try testing.expectEqual(@as(u32, 2), nr_wait); + try testing.expectEqual(2, nr_wait); var i: usize = 0; while (i < nr_wait) : (i += 1) { @@ -2622,7 +5544,7 @@ test "accept/connect/recv/link_timeout" { test "fallocate" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -2636,13 +5558,13 @@ test "fallocate" { const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); defer file.close(); - try testing.expectEqual(@as(u64, 0), (try file.stat()).size); + try testing.expectEqual(0, (try file.stat()).size); const len: u64 = 65536; const sqe = try ring.fallocate(0xaaaaaaaa, file.handle, 0, 0, len); - try testing.expectEqual(linux.IORING_OP.FALLOCATE, sqe.opcode); + try testing.expectEqual(Op.fallocate, sqe.opcode); try testing.expectEqual(file.handle, sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -2656,10 +5578,10 @@ test "fallocate" { .OPNOTSUPP => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xaaaaaaaa, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); try testing.expectEqual(len, (try file.stat()).size); @@ -2668,7 +5590,7 @@ test "fallocate" { test "statx" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -2681,7 +5603,7 @@ test "statx" { const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 }); defer file.close(); - try testing.expectEqual(@as(u64, 0), (try file.stat()).size); + try testing.expectEqual(0, (try file.stat()).size); try file.writeAll("foobar"); @@ -2690,13 +5612,13 @@ test "statx" { 0xaaaaaaaa, tmp.dir.fd, path, - 0, - linux.STATX_SIZE, + .{}, + .{ .size = true }, &buf, ); - try testing.expectEqual(linux.IORING_OP.STATX, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(Op.statx, sqe.opcode); + try testing.expectEqual(tmp.dir.fd, sqe.fd); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -2712,20 +5634,20 @@ test "statx" { .BADF => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xaaaaaaaa, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); - try testing.expect(buf.mask & linux.STATX_SIZE == linux.STATX_SIZE); - try testing.expectEqual(@as(u64, 6), buf.size); + try testing.expect(buf.mask.size); + try testing.expectEqual(6, buf.size); } test "accept/connect/recv/cancel" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = 
IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -2737,14 +5659,14 @@ test "accept/connect/recv/cancel" { var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 }; - _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, .{}); + try testing.expectEqual(1, try ring.submit()); - const sqe_cancel = try ring.cancel(0x99999999, 0xffffffff, 0); - try testing.expectEqual(linux.IORING_OP.ASYNC_CANCEL, sqe_cancel.opcode); - try testing.expectEqual(@as(u64, 0xffffffff), sqe_cancel.addr); - try testing.expectEqual(@as(u64, 0x99999999), sqe_cancel.user_data); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + const sqe_cancel = try ring.cancel(0x99999999, 0xffffffff, .{}); + try testing.expectEqual(Op.async_cancel, sqe_cancel.opcode); + try testing.expectEqual(0xffffffff, sqe_cancel.addr); + try testing.expectEqual(0x99999999, sqe_cancel.user_data); + try testing.expectEqual(1, try ring.submit()); var cqe_recv = try ring.copy_cqe(); if (cqe_recv.err() == .INVAL) return error.SkipZigTest; @@ -2759,23 +5681,23 @@ test "accept/connect/recv/cancel" { cqe_cancel = a; } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xffffffff, .res = -@as(i32, @intFromEnum(linux.E.CANCELED)), - .flags = 0, + .flags = .{}, }, cqe_recv); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x99999999, .res = 0, - .flags = 0, + .flags = .{}, }, cqe_cancel); } test "register_files_update" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -2785,7 +5707,7 @@ test "register_files_update" { const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0); defer posix.close(fd); - var registered_fds = [_]linux.fd_t{0} ** 2; + var registered_fds: [2]linux.fd_t = @splat(0); const fd_index = 0; const fd_index2 = 1; registered_fds[fd_index] = fd; @@ -2807,19 +5729,20 @@ test "register_files_update" { registered_fds[fd_index2] = -1; try ring.register_files_update(0, registered_fds[0..]); - var buffer = [_]u8{42} ** 128; + var buffer: [128]u8 = @splat(42); { const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); - sqe.flags |= linux.IOSQE_FIXED_FILE; + try testing.expectEqual(Op.read, sqe.opcode); + sqe.set_flags(.{ .fixed_file = true }); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(1, try ring.submit()); + try testing.expectEqual(Cqe{ .user_data = 0xcccccccc, .res = buffer.len, - .flags = 0, + .flags = .{}, }, try ring.copy_cqe()); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); + const empty: [buffer.len]u8 = @splat(0); + try testing.expectEqualSlices(u8, empty[0..], buffer[0..]); } // Test with a non-zero offset @@ -2831,16 +5754,18 @@ test "register_files_update" { { // Next read should still work since fd_index in the registered file descriptors hasn't been updated yet. 
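// For reference, `register_files_update` rewrites a sub-range of the
// registered fd table in place, and a slot set to -1 becomes sparse again.
// A minimal sketch of the call shape this test exercises, with `some_fd`
// standing in for any open descriptor (hypothetical name, not from the patch):
//
//     var fds: [2]linux.fd_t = .{ some_fd, -1 }; // slot 1 starts out sparse
//     try ring.register_files(fds[0..]);
//     fds[0] = -1; // now retire slot 0 as well
//     try ring.register_files_update(0, fds[0..]); // update both slots, starting at offset 0
//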
const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); - sqe.flags |= linux.IOSQE_FIXED_FILE; + try testing.expectEqual(Op.read, sqe.opcode); + sqe.set_flags(.{ .fixed_file = true }); - try testing.expectEqual(@as(u32, 1), try ring.submit()); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(1, try ring.submit()); + try testing.expectEqual(Cqe{ .user_data = 0xcccccccc, .res = buffer.len, - .flags = 0, + .flags = .{}, }, try ring.copy_cqe()); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]); + const empty: [buffer.len]u8 = @splat(0); + + try testing.expectEqualSlices(u8, empty[0..], buffer[0..]); } try ring.register_files_update(0, registered_fds[0..]); @@ -2848,10 +5773,10 @@ test "register_files_update" { { // Now this should fail since both fds are sparse (-1) const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); - sqe.flags |= linux.IOSQE_FIXED_FILE; + try testing.expectEqual(Op.read, sqe.opcode); + sqe.set_flags(.{ .fixed_file = true }); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); try testing.expectEqual(linux.E.BADF, cqe.err()); } @@ -2862,7 +5787,7 @@ test "register_files_update" { test "shutdown" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -2876,9 +5801,10 @@ test "shutdown" { // Socket bound, expect shutdown to work { - const server = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); + // TODO: update posix later to use Typed Flags + const server = try posix.socket(address.family, @as(u32, @bitCast(linux.Sock{ .type = .stream, .flags = .{ .cloexec = true } })), 0); defer posix.close(server); - try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); + try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(u32, 1))); try posix.bind(server, addrAny(&address), @sizeOf(linux.sockaddr.in)); try posix.listen(server, 1); @@ -2886,11 +5812,11 @@ test "shutdown" { var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in); try posix.getsockname(server, addrAny(&address), &slen); - const shutdown_sqe = try ring.shutdown(0x445445445, server, linux.SHUT.RD); - try testing.expectEqual(linux.IORING_OP.SHUTDOWN, shutdown_sqe.opcode); - try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); + const shutdown_sqe = try ring.shutdown(0x445445445, server, .rd); + try testing.expectEqual(Op.shutdown, shutdown_sqe.opcode); + try testing.expectEqual(server, shutdown_sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -2900,10 +5826,10 @@ test "shutdown" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x445445445, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); } @@ -2912,16 +5838,16 @@ test "shutdown" { const server = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); defer posix.close(server); - const 
shutdown_sqe = ring.shutdown(0x445445445, server, linux.SHUT.RD) catch |err| switch (err) { + const shutdown_sqe = ring.shutdown(0x445445445, server, .rd) catch |err| switch (err) { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), }; - try testing.expectEqual(linux.IORING_OP.SHUTDOWN, shutdown_sqe.opcode); - try testing.expectEqual(@as(i32, server), shutdown_sqe.fd); + try testing.expectEqual(Op.shutdown, shutdown_sqe.opcode); + try testing.expectEqual(server, shutdown_sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); - try testing.expectEqual(@as(u64, 0x445445445), cqe.user_data); + try testing.expectEqual(0x445445445, cqe.user_data); try testing.expectEqual(linux.E.NOTCONN, cqe.err()); } } @@ -2929,7 +5855,7 @@ test "shutdown" { test "renameat" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -2956,12 +5882,12 @@ test "renameat" { old_path, tmp.dir.fd, new_path, - 0, + .{}, ); - try testing.expectEqual(linux.IORING_OP.RENAMEAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(Op.renameat, sqe.opcode); + try testing.expectEqual(tmp.dir.fd, sqe.fd); + try testing.expectEqual(tmp.dir.fd, @as(i32, @intCast(sqe.len))); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -2970,10 +5896,10 @@ test "renameat" { .BADF, .INVAL => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); // Validate that the old file doesn't exist anymore @@ -2987,7 +5913,7 @@ test "renameat" { test "unlinkat" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3010,11 +5936,11 @@ test "unlinkat" { 0x12121212, tmp.dir.fd, path, - 0, + .{}, ); - try testing.expectEqual(linux.IORING_OP.UNLINKAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(Op.unlinkat, sqe.opcode); + try testing.expectEqual(tmp.dir.fd, sqe.fd); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3023,10 +5949,10 @@ test "unlinkat" { .BADF, .INVAL => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); // Validate that the file doesn't exist anymore @@ -3039,7 +5965,7 @@ test "unlinkat" { test "mkdirat" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return 
error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3059,9 +5985,9 @@ test "mkdirat" { path, 0o0755, ); - try testing.expectEqual(linux.IORING_OP.MKDIRAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(Op.mkdirat, sqe.opcode); + try testing.expectEqual(tmp.dir.fd, sqe.fd); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3070,10 +5996,10 @@ test "mkdirat" { .BADF, .INVAL => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); // Validate that the directory exist @@ -3083,7 +6009,7 @@ test "mkdirat" { test "symlinkat" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3107,9 +6033,9 @@ test "symlinkat" { tmp.dir.fd, link_path, ); - try testing.expectEqual(linux.IORING_OP.SYMLINKAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(Op.symlinkat, sqe.opcode); + try testing.expectEqual(tmp.dir.fd, sqe.fd); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3118,10 +6044,10 @@ test "symlinkat" { .BADF, .INVAL => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); // Validate that the symlink exist @@ -3131,7 +6057,7 @@ test "symlinkat" { test "linkat" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3158,12 +6084,12 @@ test "linkat" { first_path, tmp.dir.fd, second_path, - 0, + .{}, ); - try testing.expectEqual(linux.IORING_OP.LINKAT, sqe.opcode); - try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd); - try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len))); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(Op.linkat, sqe.opcode); + try testing.expectEqual(tmp.dir.fd, sqe.fd); + try testing.expectEqual(tmp.dir.fd, @as(i32, @intCast(sqe.len))); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3172,10 +6098,10 @@ test "linkat" { .BADF, .INVAL => return error.SkipZigTest, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0x12121212, .res = 0, - .flags = 0, + .flags = .{}, }, cqe); // Validate the second file @@ -3186,7 +6112,7 @@ test "linkat" { test "provide_buffers: read" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return 
error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3206,12 +6132,12 @@ test "provide_buffers: read" { // Provide 4 buffers { - const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(linux.IORING_OP.PROVIDE_BUFFERS, sqe.opcode); + const sqe = try ring.provide_buffers(0xcccccccc, @ptrCast(&buffers), buffer_len, buffers.len, group_id, buffer_id); + try testing.expectEqual(Op.provide_buffers, sqe.opcode); try testing.expectEqual(@as(i32, buffers.len), sqe.fd); - try testing.expectEqual(@as(u32, buffers[0].len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(buffers[0].len, sqe.len); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3220,7 +6146,7 @@ test "provide_buffers: read" { .SUCCESS => {}, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); + try testing.expectEqual(0xcccccccc, cqe.user_data); } // Do 4 reads which should consume all buffers @@ -3228,12 +6154,12 @@ test "provide_buffers: read" { var i: usize = 0; while (i < buffers.len) : (i += 1) { const sqe = try ring.read(0xdededede, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); - try testing.expectEqual(@as(i32, fd), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(Op.read, sqe.opcode); + try testing.expectEqual(fd, sqe.fd); + try testing.expectEqual(0, sqe.addr); + try testing.expectEqual(buffer_len, sqe.len); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3241,25 +6167,26 @@ test "provide_buffers: read" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); - const used_buffer_id = cqe.flags >> 16; + try testing.expect(cqe.flags.f_buffer); + const used_buffer_id = try cqe.buffer_id(); try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); + try testing.expectEqual(buffer_len, cqe.res); - try testing.expectEqual(@as(u64, 0xdededede), cqe.user_data); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); + try testing.expectEqual(0xdededede, cqe.user_data); + const empty: [buffer_len]u8 = @splat(0); + try testing.expectEqualSlices(u8, empty[0..], buffers[used_buffer_id][0..@intCast(cqe.res)]); } // This read should fail { const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); - try testing.expectEqual(@as(i32, fd), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), 
try ring.submit()); + try testing.expectEqual(Op.read, sqe.opcode); + try testing.expectEqual(fd, sqe.fd); + try testing.expectEqual(0, sqe.addr); + try testing.expectEqual(buffer_len, sqe.len); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3268,7 +6195,7 @@ test "provide_buffers: read" { .SUCCESS => std.debug.panic("unexpected success", .{}), else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); + try testing.expectEqual(0xdfdfdfdf, cqe.user_data); } // Provide 1 buffer again @@ -3279,8 +6206,8 @@ test "provide_buffers: read" { const reprovided_buffer_id = 2; { - _ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.provide_buffers(0xabababab, @ptrCast(&buffers[reprovided_buffer_id]), buffer_len, 1, group_id, reprovided_buffer_id); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3293,12 +6220,12 @@ test "provide_buffers: read" { { const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.READ, sqe.opcode); - try testing.expectEqual(@as(i32, fd), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(Op.read, sqe.opcode); + try testing.expectEqual(fd, sqe.fd); + try testing.expectEqual(0, sqe.addr); + try testing.expectEqual(buffer_len, sqe.len); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3306,19 +6233,20 @@ test "provide_buffers: read" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); - const used_buffer_id = cqe.flags >> 16; + try testing.expect(cqe.flags.f_buffer); + const used_buffer_id = try cqe.buffer_id(); try testing.expectEqual(used_buffer_id, reprovided_buffer_id); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); + try testing.expectEqual(buffer_len, cqe.res); + try testing.expectEqual(0xdfdfdfdf, cqe.user_data); + const empty: [buffer_len]u8 = @splat(0); + try testing.expectEqualSlices(u8, empty[0..], buffers[used_buffer_id][0..@intCast(cqe.res)]); } } test "remove_buffers" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3338,8 +6266,8 @@ test "remove_buffers" { // Provide 4 buffers { - _ = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.provide_buffers(0xcccccccc, 
@ptrCast(&buffers), buffer_len, buffers.len, group_id, buffer_id); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3347,32 +6275,32 @@ test "remove_buffers" { .SUCCESS => {}, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); + try testing.expectEqual(0xcccccccc, cqe.user_data); } // Remove 3 buffers { const sqe = try ring.remove_buffers(0xbababababa, 3, group_id); - try testing.expectEqual(linux.IORING_OP.REMOVE_BUFFERS, sqe.opcode); - try testing.expectEqual(@as(i32, 3), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(Op.remove_buffers, sqe.opcode); + try testing.expectEqual(3, sqe.fd); + try testing.expectEqual(0, sqe.addr); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { .SUCCESS => {}, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(@as(u64, 0xbababababa), cqe.user_data); + try testing.expectEqual(0xbababababa, cqe.user_data); } // This read should work { _ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3380,19 +6308,20 @@ test "remove_buffers" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); - const used_buffer_id = cqe.flags >> 16; + try testing.expect(cqe.flags.f_buffer); + const used_buffer_id = try cqe.buffer_id(); try testing.expect(used_buffer_id >= 0 and used_buffer_id < 4); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]); + try testing.expectEqual(buffer_len, cqe.res); + try testing.expectEqual(0xdfdfdfdf, cqe.user_data); + const empty: [buffer_len]u8 = @splat(0); + try testing.expectEqualSlices(u8, empty[0..], buffers[used_buffer_id][0..@intCast(cqe.res)]); } // Final read should _not_ work { _ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3407,7 +6336,7 @@ test "remove_buffers" { test "provide_buffers: accept/connect/send/recv" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3423,12 +6352,12 @@ test "provide_buffers: accept/connect/send/recv" { // Provide 4 buffers { - const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id); - try testing.expectEqual(linux.IORING_OP.PROVIDE_BUFFERS, sqe.opcode); + const sqe = try ring.provide_buffers(0xcccccccc, @ptrCast(&buffers), buffer_len, 
buffers.len, group_id, buffer_id); + try testing.expectEqual(Op.provide_buffers, sqe.opcode); try testing.expectEqual(@as(i32, buffers.len), sqe.fd); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(buffer_len, sqe.len); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3439,7 +6368,7 @@ test "provide_buffers: accept/connect/send/recv" { .SUCCESS => {}, else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data); + try testing.expectEqual(0xcccccccc, cqe.user_data); } const socket_test_harness = try createSocketTestHarness(&ring); @@ -3450,12 +6379,13 @@ test "provide_buffers: accept/connect/send/recv" { { var i: usize = 0; while (i < buffers.len) : (i += 1) { - _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'z'} ** buffer_len), 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + const zz_buffer: [buffer_len]u8 = @splat('z'); + _ = try ring.send(0xdeaddead, socket_test_harness.server, zz_buffer[0..], .{}); + try testing.expectEqual(1, try ring.submit()); } - var cqes: [4]linux.io_uring_cqe = undefined; - try testing.expectEqual(@as(u32, 4), try ring.copy_cqes(&cqes, 4)); + var cqes: [4]Cqe = undefined; + try testing.expectEqual(4, try ring.copy_cqes(&cqes, 4)); } // Do 4 recv which should consume all buffers @@ -3465,15 +6395,15 @@ test "provide_buffers: accept/connect/send/recv" { var i: usize = 0; while (i < buffers.len) : (i += 1) { - const sqe = try ring.recv(0xdededede, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.RECV, sqe.opcode); + const sqe = try ring.recv(0xdededede, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, .{}); + try testing.expectEqual(Op.recv, sqe.opcode); try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(@as(u32, linux.IOSQE_BUFFER_SELECT), sqe.flags); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(0, sqe.addr); + try testing.expectEqual(buffer_len, sqe.len); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(0, sqe.rw_flags); + try testing.expectEqual(Sqe.IoSqe{ .buffer_select = true }, sqe.flags); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3481,28 +6411,28 @@ test "provide_buffers: accept/connect/send/recv" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); - const used_buffer_id = cqe.flags >> 16; + try testing.expect(cqe.flags.f_buffer); + const used_buffer_id = try cqe.buffer_id(); try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); + try testing.expectEqual(buffer_len, cqe.res); - try testing.expectEqual(@as(u64, 0xdededede), cqe.user_data); - const buffer = 
buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]; - try testing.expectEqualSlices(u8, &([_]u8{'z'} ** buffer_len), buffer); + try testing.expectEqual(0xdededede, cqe.user_data); + const zzz: [buffer_len]u8 = @splat('z'); + try testing.expectEqualSlices(u8, zzz[0..], buffers[used_buffer_id][0..@intCast(cqe.res)]); } // This recv should fail { - const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.RECV, sqe.opcode); + const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, .{}); + try testing.expectEqual(Op.recv, sqe.opcode); try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(@as(u32, linux.IOSQE_BUFFER_SELECT), sqe.flags); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(0, sqe.addr); + try testing.expectEqual(buffer_len, sqe.len); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(0, sqe.rw_flags); + try testing.expectEqual(Sqe.IoSqe{ .buffer_select = true }, sqe.flags); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3511,7 +6441,7 @@ test "provide_buffers: accept/connect/send/recv" { .SUCCESS => std.debug.panic("unexpected success", .{}), else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); + try testing.expectEqual(0xdfdfdfdf, cqe.user_data); } // Provide 1 buffer again @@ -3519,8 +6449,8 @@ test "provide_buffers: accept/connect/send/recv" { const reprovided_buffer_id = 2; { - _ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.provide_buffers(0xabababab, @ptrCast(&buffers[reprovided_buffer_id]), buffer_len, 1, group_id, reprovided_buffer_id); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3532,8 +6462,9 @@ test "provide_buffers: accept/connect/send/recv" { // Redo 1 send on the server socket { - _ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'w'} ** buffer_len), 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + const ww_buffer: [buffer_len]u8 = @splat('w'); + _ = try ring.send(0xdeaddead, socket_test_harness.server, ww_buffer[0..], .{}); + try testing.expectEqual(1, try ring.submit()); _ = try ring.copy_cqe(); } @@ -3544,15 +6475,15 @@ test "provide_buffers: accept/connect/send/recv" { @memset(mem.sliceAsBytes(&buffers), 1); { - const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0); - try testing.expectEqual(linux.IORING_OP.RECV, sqe.opcode); + const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, .{}); + try testing.expectEqual(Op.recv, sqe.opcode); try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd); - try testing.expectEqual(@as(u64, 0), sqe.addr); - try 
testing.expectEqual(@as(u32, buffer_len), sqe.len); - try testing.expectEqual(@as(u16, group_id), sqe.buf_index); - try testing.expectEqual(@as(u32, 0), sqe.rw_flags); - try testing.expectEqual(@as(u32, linux.IOSQE_BUFFER_SELECT), sqe.flags); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(0, sqe.addr); + try testing.expectEqual(buffer_len, sqe.len); + try testing.expectEqual(group_id, sqe.buf_index); + try testing.expectEqual(0, sqe.rw_flags); + try testing.expectEqual(Sqe.IoSqe{ .buffer_select = true }, sqe.flags); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); switch (cqe.err()) { @@ -3560,13 +6491,13 @@ test "provide_buffers: accept/connect/send/recv" { else => |errno| std.debug.panic("unhandled errno: {}", .{errno}), } - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); - const used_buffer_id = cqe.flags >> 16; + try testing.expect(cqe.flags.f_buffer); + const used_buffer_id = try cqe.buffer_id(); try testing.expectEqual(used_buffer_id, reprovided_buffer_id); - try testing.expectEqual(@as(i32, buffer_len), cqe.res); - try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data); - const buffer = buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]; - try testing.expectEqualSlices(u8, &([_]u8{'w'} ** buffer_len), buffer); + try testing.expectEqual(buffer_len, cqe.res); + try testing.expectEqual(0xdfdfdfdf, cqe.user_data); + const www: [buffer_len]u8 = @splat('w'); + try testing.expectEqualSlices(u8, www[0..], buffers[used_buffer_id][0..@intCast(cqe.res)]); } } @@ -3592,16 +6523,16 @@ fn createSocketTestHarness(ring: *IoUring) !SocketTestHarness { errdefer posix.close(listener_socket); // Submit 1 accept - var accept_addr: posix.sockaddr = undefined; - var accept_addr_len: posix.socklen_t = @sizeOf(@TypeOf(accept_addr)); - _ = try ring.accept(0xaaaaaaaa, listener_socket, &accept_addr, &accept_addr_len, 0); + var accept_addr: linux.sockaddr = undefined; + var accept_addr_len: linux.socklen_t = @sizeOf(@TypeOf(accept_addr)); + _ = try ring.accept(0xaaaaaaaa, listener_socket, &accept_addr, &accept_addr_len, .{}); // Create a TCP client socket const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); errdefer posix.close(client); _ = try ring.connect(0xcccccccc, client, addrAny(&address), @sizeOf(linux.sockaddr.in)); - try testing.expectEqual(@as(u32, 2), try ring.submit()); + try testing.expectEqual(2, try ring.submit()); var cqe_accept = try ring.copy_cqe(); if (cqe_accept.err() == .INVAL) return error.SkipZigTest; @@ -3616,19 +6547,19 @@ fn createSocketTestHarness(ring: *IoUring) !SocketTestHarness { cqe_connect = a; } - try testing.expectEqual(@as(u64, 0xaaaaaaaa), cqe_accept.user_data); + try testing.expectEqual(0xaaaaaaaa, cqe_accept.user_data); if (cqe_accept.res <= 0) std.debug.print("\ncqe_accept.res={}\n", .{cqe_accept.res}); try testing.expect(cqe_accept.res > 0); - try testing.expectEqual(@as(u32, 0), cqe_accept.flags); - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(@as(Cqe.Flags, .{}), cqe_accept.flags); + try testing.expectEqual(Cqe{ .user_data = 0xcccccccc, .res = 0, - .flags = 0, + .flags = .{}, }, cqe_connect); // All good - return SocketTestHarness{ + return .{ .listener = listener_socket, .server = cqe_accept.res, .client = client, @@ -3640,7 +6571,7 @@ fn createListenerSocket(address: *linux.sockaddr.in) !posix.socket_t { const listener_socket = try posix.socket(address.family, posix.SOCK.STREAM | 
posix.SOCK.CLOEXEC, 0); errdefer posix.close(listener_socket); - try posix.setsockopt(listener_socket, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1))); + try posix.setsockopt(listener_socket, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(u32, 1))); try posix.bind(listener_socket, addrAny(address), @sizeOf(linux.sockaddr.in)); try posix.listen(listener_socket, kernel_backlog); @@ -3654,7 +6585,7 @@ fn createListenerSocket(address: *linux.sockaddr.in) !posix.socket_t { test "accept multishot" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3669,11 +6600,11 @@ test "accept multishot" { defer posix.close(listener_socket); // submit multishot accept operation - var addr: posix.sockaddr = undefined; - var addr_len: posix.socklen_t = @sizeOf(@TypeOf(addr)); + var addr: linux.sockaddr = undefined; + var addr_len: linux.socklen_t = @sizeOf(@TypeOf(addr)); const userdata: u64 = 0xaaaaaaaa; - _ = try ring.accept_multishot(userdata, listener_socket, &addr, &addr_len, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.accept_multishot(userdata, listener_socket, &addr, &addr_len, .{}); + try testing.expectEqual(1, try ring.submit()); var nr: usize = 4; // number of clients to connect while (nr > 0) : (nr -= 1) { @@ -3687,7 +6618,7 @@ test "accept multishot" { if (cqe.err() == .INVAL) return error.SkipZigTest; try testing.expect(cqe.res > 0); try testing.expect(cqe.user_data == userdata); - try testing.expect(cqe.flags & linux.IORING_CQE_F_MORE > 0); // more flag is set + try testing.expect(cqe.flags.f_more); // more flag is set posix.close(client); } @@ -3696,7 +6627,7 @@ test "accept multishot" { test "accept/connect/send_zc/recv" { try skipKernelLessThan(.{ .major = 6, .minor = 0, .patch = 0 }); - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3706,14 +6637,14 @@ test "accept/connect/send_zc/recv" { const socket_test_harness = try createSocketTestHarness(&ring); defer socket_test_harness.close(); - const buffer_send = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; - var buffer_recv = [_]u8{0} ** 10; + const buffer_send: [15]u8 = .{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; + var buffer_recv: [10]u8 = @splat(0); // zero-copy send - const sqe_send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0, 0); - sqe_send.flags |= linux.IOSQE_IO_LINK; - _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0); - try testing.expectEqual(@as(u32, 2), try ring.submit()); + const sqe_send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], .{}, .{}); + sqe_send.link_next(); + _ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, .{}); + try testing.expectEqual(2, try ring.submit()); var cqe_send = try ring.copy_cqe(); // First completion of zero-copy send. @@ -3721,10 +6652,10 @@ test "accept/connect/send_zc/recv" { // will be a second completion event / notification for the // request, with the user_data field set to the same value. 
// buffer_send must be kept alive until the second cqe. - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xeeeeeeee, .res = buffer_send.len, - .flags = linux.IORING_CQE_F_MORE, + .flags = .{ .f_more = true }, }, cqe_send); cqe_send, const cqe_recv = brk: { @@ -3733,26 +6664,26 @@ break :brk if (cqe1.user_data == 0xeeeeeeee) .{ cqe1, cqe2 } else .{ cqe2, cqe1 }; }; - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xffffffff, .res = buffer_recv.len, - .flags = cqe_recv.flags & linux.IORING_CQE_F_SOCK_NONEMPTY, + .flags = .{ .f_sock_nonempty = cqe_recv.flags.f_sock_nonempty }, }, cqe_recv); try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]); // Second completion of zero-copy send. // IORING_CQE_F_NOTIF in flags signals that kernel is done with send_buffer - try testing.expectEqual(linux.io_uring_cqe{ + try testing.expectEqual(Cqe{ .user_data = 0xeeeeeeee, .res = 0, - .flags = linux.IORING_CQE_F_NOTIF, + .flags = .{ .f_notif = true }, }, cqe_send); } test "accept_direct" { try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3764,7 +6695,7 @@ }; // register direct file descriptors - var registered_fds = [_]linux.fd_t{-1} ** 2; + var registered_fds: [2]linux.fd_t = @splat(-1); try ring.register_files(registered_fds[0..]); const listener_socket = try createListenerSocket(&address); @@ -3776,12 +6707,12 @@ for (0..2) |_| { for (registered_fds, 0..) |_, i| { - var buffer_recv = [_]u8{0} ** 16; + var buffer_recv: [16]u8 = @splat(0); const buffer_send: []const u8 = data[0 .. data.len - i]; // make it different at each loop // submit accept, will choose a registered fd and return its index in the cqe - _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, .{}); + try testing.expectEqual(1, try ring.submit()); // connect const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); @@ -3790,7 +6721,7 @@ // accept completion const cqe_accept = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_accept.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe_accept.err()); const fd_index = cqe_accept.res; try testing.expect(fd_index < registered_fds.len); try testing.expect(cqe_accept.user_data == accept_userdata); @@ -3802,9 +6733,10 @@ // Submit receive to fixed file returned by accept (fd_index). // Fd field is set to registered file index, returned by accept. // Flag linux.IOSQE_FIXED_FILE must be set.
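// A rough sketch of the full direct-descriptor round trip, under the same
// assumptions as this test (`ud`, `listener` and `buf` are hypothetical
// placeholders; the calls themselves all appear in this file):
//
//   _ = try ring.accept_direct(ud, listener, null, null, .{});
//   const idx = (try ring.copy_cqe()).res; // index into the registered table, not a real fd
//   const sqe = try ring.recv(ud, idx, .{ .buffer = &buf }, .{});
//   sqe.set_flags(.{ .fixed_file = true }); // tell the kernel sqe.fd is that index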
- const recv_sqe = try ring.recv(read_userdata, fd_index, .{ .buffer = &buffer_recv }, 0); - recv_sqe.flags |= linux.IOSQE_FIXED_FILE; - try testing.expectEqual(@as(u32, 1), try ring.submit()); + const recv_sqe = try ring.recv(read_userdata, fd_index, .{ .buffer = &buffer_recv }, .{}); + recv_sqe.set_flags(.{ .fixed_file = true }); + + try testing.expectEqual(1, try ring.submit()); // accept receive const recv_cqe = try ring.copy_cqe(); @@ -3815,8 +6747,8 @@ // no more available fds, accept will get NFILE error { // submit accept - _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.accept_direct(accept_userdata, listener_socket, null, null, .{}); + try testing.expectEqual(1, try ring.submit()); // connect const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0); try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in)); @@ -3824,7 +6756,7 @@ // completion with error const cqe_accept = try ring.copy_cqe(); try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expectEqual(posix.E.NFILE, cqe_accept.err()); + try testing.expectEqual(linux.E.NFILE, cqe_accept.err()); } // return file descriptors to kernel try ring.register_files_update(0, registered_fds[0..]); @@ -3840,7 +6772,7 @@ test "accept_multishot_direct" { return error.SkipZigTest; } - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3852,7 +6784,7 @@ .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), }; - var registered_fds = [_]linux.fd_t{-1} ** 2; + var registered_fds: [2]linux.fd_t = @splat(-1); try ring.register_files(registered_fds[0..]); const listener_socket = try createListenerSocket(&address); @@ -3863,8 +6795,8 @@ for (0..2) |_| { // submit multishot accept // Will choose a registered fd and return the index of the selected registered file in the cqe. - _ = try ring.accept_multishot_direct(accept_userdata, listener_socket, null, null, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.accept_multishot_direct(accept_userdata, listener_socket, null, null, .{}); + try testing.expectEqual(1, try ring.submit()); for (registered_fds) |_| { // connect @@ -3877,7 +6809,7 @@ const fd_index = cqe_accept.res; try testing.expect(fd_index < registered_fds.len); try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expect(cqe_accept.flags & linux.IORING_CQE_F_MORE > 0); // has more is set + try testing.expect(cqe_accept.flags.f_more); // more flag is set } // No more available fds, accept will get NFILE error. // Multishot is terminated (more flag is not set).
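// A sketch of the drain loop this implies for long-lived multishot requests
// (assumed usage; `handle` is a hypothetical callback): keep reaping CQEs for
// the same user_data until one arrives without the more flag.
//
//   while (true) {
//       const cqe = try ring.copy_cqe();
//       handle(cqe);
//       if (!cqe.flags.f_more) break; // multishot terminated, resubmit if still needed
//   }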
@@ -3889,8 +6821,8 @@ test "accept_multishot_direct" { // completion with error const cqe_accept = try ring.copy_cqe(); try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expectEqual(posix.E.NFILE, cqe_accept.err()); - try testing.expect(cqe_accept.flags & linux.IORING_CQE_F_MORE == 0); // has more is not set + try testing.expectEqual(linux.E.NFILE, cqe_accept.err()); + try testing.expect(!cqe_accept.flags.f_more); // more flag is not set } // return file descriptors to kernel try ring.register_files_update(0, registered_fds[0..]); @@ -3901,7 +6833,7 @@ test "accept_multishot_direct" { test "socket" { try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - var ring = IoUring.init(1, 0) catch |err| switch (err) { + var ring = IoUring.init(1, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -3909,12 +6841,12 @@ defer ring.deinit(); // prepare, submit socket operation - _ = try ring.socket(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.socket(0, .inet, .{ .type = .stream }, .default, 0); + try testing.expectEqual(1, try ring.submit()); // test completion var cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); const fd: linux.fd_t = @intCast(cqe.res); try testing.expect(fd > 2); @@ -3924,36 +6856,36 @@ test "socket_direct/socket_direct_alloc/close_direct" { try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - var ring = IoUring.init(2, 0) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - var registered_fds = [_]linux.fd_t{-1} ** 3; + var registered_fds: [3]linux.fd_t = @splat(-1); try ring.register_files(registered_fds[0..]); // create socket in registered file descriptor at index 0 (last param) - _ = try ring.socket_direct(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.socket_direct(0, .inet, .{ .type = .stream }, .default, 0, 0); + try testing.expectEqual(1, try ring.submit()); var cqe_socket = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe_socket.err()); try testing.expect(cqe_socket.res == 0); // create socket in registered file descriptor at index 1 (last param) - _ = try ring.socket_direct(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0, 1); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.socket_direct(0, .inet, .{ .type = .stream }, .default, 0, 1); + try testing.expectEqual(1, try ring.submit()); cqe_socket = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe_socket.err()); try testing.expect(cqe_socket.res == 0); // res is 0 when index is specified // create socket in kernel chosen file descriptor index (_alloc version) // completion res has index from registered files - _ = try ring.socket_direct_alloc(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.socket_direct_alloc(0, .inet, .{ .type = .stream }, .default, 0); + try testing.expectEqual(1,
try ring.submit()); cqe_socket = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe_socket.err()); try testing.expect(cqe_socket.res == 2); // returns registered file index // use sockets from registered_fds in connect operation @@ -3968,12 +6900,13 @@ test "socket_direct/socket_direct_alloc/close_direct" { const close_userdata: u64 = 0xcccccccc; for (registered_fds, 0..) |_, fd_index| { // prepare accept - _ = try ring.accept(accept_userdata, listener_socket, null, null, 0); + _ = try ring.accept(accept_userdata, listener_socket, null, null, .{}); // prepare connect with fixed socket const connect_sqe = try ring.connect(connect_userdata, @intCast(fd_index), addrAny(&address), @sizeOf(linux.sockaddr.in)); - connect_sqe.flags |= linux.IOSQE_FIXED_FILE; // fd is fixed file index + // fd is fixed file index + connect_sqe.set_flags(.{ .fixed_file = true }); // submit both - try testing.expectEqual(@as(u32, 2), try ring.submit()); + try testing.expectEqual(2, try ring.submit()); // get completions var cqe_connect = try ring.copy_cqe(); var cqe_accept = try ring.copy_cqe(); @@ -3986,17 +6919,17 @@ test "socket_direct/socket_direct_alloc/close_direct" { } // test connect completion try testing.expect(cqe_connect.user_data == connect_userdata); - try testing.expectEqual(posix.E.SUCCESS, cqe_connect.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe_connect.err()); // test accept completion try testing.expect(cqe_accept.user_data == accept_userdata); - try testing.expectEqual(posix.E.SUCCESS, cqe_accept.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe_accept.err()); // submit and test close_direct _ = try ring.close_direct(close_userdata, @intCast(fd_index)); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); var cqe_close = try ring.copy_cqe(); try testing.expect(cqe_close.user_data == close_userdata); - try testing.expectEqual(posix.E.SUCCESS, cqe_close.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe_close.err()); } try ring.unregister_files(); @@ -4005,50 +6938,50 @@ test "socket_direct/socket_direct_alloc/close_direct" { test "openat_direct/close_direct" { try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 }); - var ring = IoUring.init(2, 0) catch |err| switch (err) { + var ring = IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, }; defer ring.deinit(); - var registered_fds = [_]linux.fd_t{-1} ** 3; + var registered_fds: [3]linux.fd_t = @splat(-1); try ring.register_files(registered_fds[0..]); var tmp = std.testing.tmpDir(.{}); defer tmp.cleanup(); const path = "test_io_uring_close_direct"; const flags: linux.O = .{ .ACCMODE = .RDWR, .CREAT = true }; - const mode: posix.mode_t = 0o666; + const mode: linux.mode_t = 0o666; const user_data: u64 = 0; // use registered file at index 0 (last param) _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); var cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); try testing.expect(cqe.res == 0); // use registered file at index 1 _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 1); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try 
testing.expectEqual(1, try ring.submit()); cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); try testing.expect(cqe.res == 0); // res is 0 when we specify index // let kernel choose registered file index - _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, linux.IORING_FILE_INDEX_ALLOC); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, constants.FILE_INDEX_ALLOC); + try testing.expectEqual(1, try ring.submit()); cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); try testing.expect(cqe.res == 2); // chosen index is in res // close all open file descriptors for (registered_fds, 0..) |_, fd_index| { _ = try ring.close_direct(user_data, @intCast(fd_index)); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + try testing.expectEqual(1, try ring.submit()); var cqe_close = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe_close.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe_close.err()); } try ring.unregister_files(); } @@ -4056,7 +6989,7 @@ test "openat_direct/close_direct" { test "waitid" { try skipKernelLessThan(.{ .major = 6, .minor = 7, .patch = 0 }); - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4068,8 +7001,8 @@ posix.exit(7); } - var siginfo: posix.siginfo_t = undefined; - _ = try ring.waitid(0, .PID, pid, &siginfo, posix.W.EXITED, 0); + var siginfo: linux.siginfo_t = undefined; + _ = try ring.waitid(0, .PID, pid, &siginfo, .{ .exited = true }, 0); try testing.expectEqual(1, try ring.submit()); @@ -4091,14 +7024,26 @@ inline fn skipKernelLessThan(required: std.SemanticVersion) !void { } const release = mem.sliceTo(&uts.release, 0); - // Strips potential extra, as kernel version might not be semver compliant, example "6.8.9-300.fc40.x86_64" - const extra_index = std.mem.indexOfAny(u8, release, "-+"); - const stripped = release[0..(extra_index orelse release.len)]; - // Make sure the input don't rely on the extra we just stripped + // Make sure the input doesn't rely on the extra we are about to strip try testing.expect(required.pre == null and required.build == null); + const stripped = blk: { + // Strips potential extra, as kernel version might not be semver compliant, example "6.8.9-300.fc40.x86_64" + const extra_index = std.mem.findAny(u8, release, "-+"); + const stripped = release[0..(extra_index orelse release.len)]; + + // The WSL kernel isn't semver compliant, + // e.g. 6.6.87.2-microsoft-standard-WSL2: strip the extra .2 after 87 + const wsl = "WSL2"; + if (std.mem.eql(u8, release[release.len - wsl.len ..][0..wsl.len], wsl)) { + const wsl_stripped, _ = std.mem.cutScalarLast(u8, stripped, '.') orelse unreachable; + break :blk wsl_stripped; + } + break :blk stripped; + }; var current = try std.SemanticVersion.parse(stripped); current.pre = null; // don't check pre field + if (required.order(current) == .gt) return error.SkipZigTest; } @@ -4106,7 +7051,7 @@ test BufferGroup { if (!is_linux) return error.SkipZigTest; // Init IoUring - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated =>
return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4137,18 +7082,18 @@ // Client sends data { - _ = try ring.send(1, fds.client, data[0..], 0); + _ = try ring.send(1, fds.client, data[0..], .{}); const submitted = try ring.submit(); try testing.expectEqual(1, submitted); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ .user_data = 1, .res = data.len, .flags = 0 }, cqe_send); + try testing.expectEqual(Cqe{ .user_data = 1, .res = data.len, .flags = .{} }, cqe_send); } // Server uses buffer group receive { // Submit recv operation, buffer will be chosen from buffer group - _ = try buf_grp.recv(2, fds.server, 0); + _ = try buf_grp.recv(2, fds.server, .{}); const submitted = try ring.submit(); try testing.expectEqual(1, submitted); @@ -4156,8 +7101,8 @@ const cqe = try ring.copy_cqe(); try testing.expectEqual(2, cqe.user_data); // matches submitted user_data try testing.expect(cqe.res >= 0); // success - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expectEqual(data.len, @as(usize, @intCast(cqe.res))); // cqe.res holds received data len + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); + try testing.expectEqual(@as(i32, data.len), cqe.res); // cqe.res holds received data len // Get buffer from pool const buf = try buf_grp.get(cqe); @@ -4170,7 +7115,7 @@ test "ring mapped buffers recv" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4208,11 +7153,11 @@ const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe }; { const user_data = rnd.int(u64); - _ = try ring.send(user_data, fds.client, data[0..], 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.send(user_data, fds.client, data[0..], .{}); + try testing.expectEqual(1, try ring.submit()); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ .user_data = user_data, .res = data.len, .flags = 0 }, cqe_send); + try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = .{} }, cqe_send); } var pos: usize = 0; @@ -4231,13 +7176,13 @@ // 'no more buffers', until we put buffers to the kernel { const user_data = rnd.int(u64); - _ = try buf_grp.recv(user_data, fds.server, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try buf_grp.recv(user_data, fds.server, .{}); + try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); try testing.expectEqual(user_data, cqe.user_data); try testing.expect(cqe.res < 0); // fail - try testing.expectEqual(posix.E.NOBUFS, cqe.err()); - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == 0); // IORING_CQE_F_BUFFER flags is set on success only + try testing.expectEqual(linux.E.NOBUFS, cqe.err()); + try testing.expect(!cqe.flags.f_buffer); // IORING_CQE_F_BUFFER flag is set on success only try testing.expectError(error.NoBufferSelected, cqe.buffer_id()); } @@ -4259,7 +7204,7 @@ test "ring mapped buffers multishot recv" { if (!is_linux) return error.SkipZigTest; -
var ring = IoUring.init(16, 0) catch |err| switch (err) { + var ring = IoUring.init(16, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4297,28 +7242,28 @@ const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf }; { const user_data = rnd.int(u64); - _ = try ring.send(user_data, fds.client, data[0..], 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.send(user_data, fds.client, data[0..], .{}); + try testing.expectEqual(1, try ring.submit()); const cqe_send = try ring.copy_cqe(); if (cqe_send.err() == .INVAL) return error.SkipZigTest; - try testing.expectEqual(linux.io_uring_cqe{ .user_data = user_data, .res = data.len, .flags = 0 }, cqe_send); + try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = .{} }, cqe_send); } // start multishot recv var recv_user_data = rnd.int(u64); - _ = try buf_grp.recv_multishot(recv_user_data, fds.server, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit + _ = try buf_grp.recv_multishot(recv_user_data, fds.server, .{}); + try testing.expectEqual(1, try ring.submit()); // submit // server reads data into provided buffers // there are 2 buffers of size 4, so each read gets only a chunk of the data // we read four chunks of 4, 4, 4, 4 bytes each var chunk: []const u8 = data[0..buffer_size]; // first chunk const cqe1 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); - try testing.expect(cqe1.flags & linux.IORING_CQE_F_MORE > 0); + try testing.expect(cqe1.flags.f_more); chunk = data[buffer_size .. buffer_size * 2]; // second chunk const cqe2 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); - try testing.expect(cqe2.flags & linux.IORING_CQE_F_MORE > 0); + try testing.expect(cqe2.flags.f_more); // both buffers provided to the kernel are used, so we get the error // 'no more buffers', until we put buffers to the kernel { const cqe = try ring.copy_cqe(); try testing.expectEqual(recv_user_data, cqe.user_data); try testing.expect(cqe.res < 0); // fail - try testing.expectEqual(posix.E.NOBUFS, cqe.err()); - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == 0); // IORING_CQE_F_BUFFER flags is set on success only + try testing.expectEqual(linux.E.NOBUFS, cqe.err()); + // IORING_CQE_F_BUFFER flag is set on success only + try testing.expect(!cqe.flags.f_buffer); // more flag is not set // indicates that multishot is finished - try testing.expect(cqe.flags & linux.IORING_CQE_F_MORE == 0); + try testing.expect(!cqe.flags.f_more); try testing.expectError(error.NoBufferSelected, cqe.buffer_id()); } @@ -4340,24 +7286,24 @@ // restart multishot recv_user_data = rnd.int(u64); - _ = try buf_grp.recv_multishot(recv_user_data, fds.server, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit + _ = try buf_grp.recv_multishot(recv_user_data, fds.server, .{}); + try testing.expectEqual(1, try ring.submit()); // submit chunk = data[buffer_size * 2 ..
buffer_size * 3]; // third chunk const cqe3 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); - try testing.expect(cqe3.flags & linux.IORING_CQE_F_MORE > 0); + try testing.expect(cqe3.flags.f_more); try buf_grp.put(cqe3); chunk = data[buffer_size * 3 ..]; // last chunk const cqe4 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk); - try testing.expect(cqe4.flags & linux.IORING_CQE_F_MORE > 0); + try testing.expect(cqe4.flags.f_more); try buf_grp.put(cqe4); // cancel pending multishot recv operation { const cancel_user_data = rnd.int(u64); - _ = try ring.cancel(cancel_user_data, recv_user_data, 0); - try testing.expectEqual(@as(u32, 1), try ring.submit()); + _ = try ring.cancel(cancel_user_data, recv_user_data, .{}); + try testing.expectEqual(1, try ring.submit()); // expect completion of cancel operation and completion of recv operation var cqe_cancel = try ring.copy_cqe(); @@ -4390,7 +7336,7 @@ test "ring mapped buffers multishot recv" { try testing.expectEqual(recv_user_data, cqe_recv.user_data); try testing.expect(cqe_recv.res < 0); try testing.expect(cqe_recv.err() == .NOBUFS or cqe_recv.err() == .CANCELED); - try testing.expect(cqe_recv.flags & linux.IORING_CQE_F_MORE == 0); + try testing.expect(!cqe_recv.flags.f_more); } } } @@ -4401,18 +7347,18 @@ fn buf_grp_recv_submit_get_cqe( buf_grp: *BufferGroup, fd: linux.fd_t, user_data: u64, -) !linux.io_uring_cqe { +) !Cqe { // prepare and submit recv - const sqe = try buf_grp.recv(user_data, fd, 0); - try testing.expect(sqe.flags & linux.IOSQE_BUFFER_SELECT == linux.IOSQE_BUFFER_SELECT); + const sqe = try buf_grp.recv(user_data, fd, .{}); + try testing.expect(sqe.flags.buffer_select); try testing.expect(sqe.buf_index == buf_grp.group_id); - try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit + try testing.expectEqual(1, try ring.submit()); // submit // get cqe, expect success const cqe = try ring.copy_cqe(); try testing.expectEqual(user_data, cqe.user_data); try testing.expect(cqe.res >= 0); // success - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); // IORING_CQE_F_BUFFER flag is set + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); + try testing.expect(cqe.flags.f_buffer); // IORING_CQE_F_BUFFER flag is set return cqe; } @@ -4422,18 +7368,18 @@ fn expect_buf_grp_cqe( buf_grp: *BufferGroup, user_data: u64, expected: []const u8, -) !linux.io_uring_cqe { +) !Cqe { // get cqe const cqe = try ring.copy_cqe(); try testing.expectEqual(user_data, cqe.user_data); try testing.expect(cqe.res >= 0); // success - try testing.expect(cqe.flags & linux.IORING_CQE_F_BUFFER == linux.IORING_CQE_F_BUFFER); // IORING_CQE_F_BUFFER flag is set - try testing.expectEqual(expected.len, @as(usize, @intCast(cqe.res))); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expect(cqe.flags.f_buffer); // IORING_CQE_F_BUFFER flag is set + try testing.expectEqual(@as(i32, @intCast(expected.len)), cqe.res); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); // get buffer from pool const buffer_id = try cqe.buffer_id(); - const len = @as(usize, @intCast(cqe.res)); + const len: usize = @intCast(cqe.res); const buf = buf_grp.get_by_id(buffer_id)[0..len]; try testing.expectEqualSlices(u8, expected, buf); @@ -4443,7 +7389,7 @@ fn expect_buf_grp_cqe( test "copy_cqes with wrapping sq.cqes buffer" { if (!is_linux) return error.SkipZigTest; - var ring = IoUring.init(2, 0) catch |err| switch (err) { + var ring = 
IoUring.init(2, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4454,11 +7400,11 @@ try testing.expectEqual(4, ring.cq.cqes.len); // submit 2 entries, receive 2 completions - var cqes: [8]linux.io_uring_cqe = undefined; + var cqes: [8]Cqe = undefined; { for (0..2) |_| { const sqe = try ring.get_sqe(); - sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, 0); + sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, .{}); try testing.expect(try ring.submit() == 1); } var cqe_count: u32 = 0; @@ -4475,7 +7421,7 @@ for (1..1024) |i| { for (0..4) |_| { const sqe = try ring.get_sqe(); - sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, 0); + sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, .{}); try testing.expect(try ring.submit() == 1); } var cqe_count: u32 = 0; @@ -4488,7 +7434,7 @@ } test "bind/listen/connect" { - var ring = IoUring.init(4, 0) catch |err| switch (err) { + var ring = IoUring.init(4, .{}) catch |err| switch (err) { error.SystemOutdated => return error.SkipZigTest, error.PermissionDenied => return error.SkipZigTest, else => return err, @@ -4497,46 +7443,51 @@ const probe = ring.get_probe() catch return error.SkipZigTest; // LISTEN is the operation with the highest kernel version requirement - if (!probe.is_supported(.LISTEN)) return error.SkipZigTest; + if (!probe.is_supported(.listen)) return error.SkipZigTest; var addr: linux.sockaddr.in = .{ .port = 0, .addr = @bitCast([4]u8{ 127, 0, 0, 1 }), }; + // TODO: switch family to IpProto type + // const proto: linux.IpProto = switch (addr.any.family) { + // .unix => .default, + // else => .tcp, + // }; const proto: u32 = if (addr.family == linux.AF.UNIX) 0 else linux.IPPROTO.TCP; const listen_fd = brk: { // Create socket - _ = try ring.socket(1, addr.family, linux.SOCK.STREAM | linux.SOCK.CLOEXEC, proto, 0); + _ = try ring.socket(1, @enumFromInt(addr.family), .{ .type = .stream, .flags = .{ .cloexec = true } }, @enumFromInt(proto), 0); try testing.expectEqual(1, try ring.submit()); var cqe = try ring.copy_cqe(); try testing.expectEqual(1, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); const listen_fd: linux.fd_t = @intCast(cqe.res); try testing.expect(listen_fd > 2); // Prepare: set socket option * 2, bind, listen var optval: u32 = 1; - (try ring.setsockopt(2, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEADDR, mem.asBytes(&optval))).link_next(); - (try ring.setsockopt(3, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEPORT, mem.asBytes(&optval))).link_next(); - (try ring.bind(4, listen_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in), 0)).link_next(); - _ = try ring.listen(5, listen_fd, 1, 0); + (try ring.setsockopt(2, listen_fd, .socket, .reuseaddr, mem.asBytes(&optval))).link_next(); + (try ring.setsockopt(3, listen_fd, .socket, .reuseport, mem.asBytes(&optval))).link_next(); + (try ring.bind(4, listen_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in))).link_next(); + _ = try ring.listen(5, listen_fd, 1); // Submit 4 operations try testing.expectEqual(4, try ring.submit()); // Expect all to succeed for (2..6) |user_data| { cqe = try ring.copy_cqe(); try testing.expectEqual(user_data, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); } // Check
that socket option is set optval = 0; - _ = try ring.getsockopt(5, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEADDR, mem.asBytes(&optval)); + _ = try ring.getsockopt(5, listen_fd, .socket, .reuseaddr, mem.asBytes(&optval)); try testing.expectEqual(1, try ring.submit()); cqe = try ring.copy_cqe(); try testing.expectEqual(5, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); try testing.expectEqual(1, optval); // Read system assigned port into addr @@ -4548,11 +7499,11 @@ test "bind/listen/connect" { const connect_fd = brk: { // Create connect socket - _ = try ring.socket(6, addr.family, linux.SOCK.STREAM | linux.SOCK.CLOEXEC, proto, 0); + _ = try ring.socket(6, @enumFromInt(addr.family), .{ .type = .stream, .flags = .{ .cloexec = true } }, @enumFromInt(proto), 0); try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); try testing.expectEqual(6, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); // Get connect socket fd const connect_fd: linux.fd_t = @intCast(cqe.res); try testing.expect(connect_fd > 2 and connect_fd != listen_fd); @@ -4560,14 +7511,14 @@ test "bind/listen/connect" { }; // Prepare accept/connect operations - _ = try ring.accept(7, listen_fd, null, null, 0); + _ = try ring.accept(7, listen_fd, null, null, .{}); _ = try ring.connect(8, connect_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in)); try testing.expectEqual(2, try ring.submit()); // Get listener accepted socket - var accept_fd: posix.socket_t = 0; + var accept_fd: linux.socket_t = 0; for (0..2) |_| { const cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); if (cqe.user_data == 7) { accept_fd = @intCast(cqe.res); } else { @@ -4581,41 +7532,41 @@ test "bind/listen/connect" { try testSendRecv(&ring, accept_fd, connect_fd); // Shutdown and close all sockets - for ([_]posix.socket_t{ connect_fd, accept_fd, listen_fd }) |fd| { - (try ring.shutdown(9, fd, posix.SHUT.RDWR)).link_next(); + for ([_]linux.socket_t{ connect_fd, accept_fd, listen_fd }) |fd| { + (try ring.shutdown(9, fd, .rdwr)).link_next(); _ = try ring.close(10, fd); try testing.expectEqual(2, try ring.submit()); for (0..2) |i| { const cqe = try ring.copy_cqe(); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); try testing.expectEqual(9 + i, cqe.user_data); } } } -fn testSendRecv(ring: *IoUring, send_fd: posix.socket_t, recv_fd: posix.socket_t) !void { +fn testSendRecv(ring: *IoUring, send_fd: linux.socket_t, recv_fd: linux.socket_t) !void { const buffer_send = "0123456789abcdf" ** 10; var buffer_recv: [buffer_send.len * 2]u8 = undefined; // 2 sends - _ = try ring.send(1, send_fd, buffer_send, linux.MSG.WAITALL); - _ = try ring.send(2, send_fd, buffer_send, linux.MSG.WAITALL); + _ = try ring.send(1, send_fd, buffer_send, .{ .waitall = true }); + _ = try ring.send(2, send_fd, buffer_send, .{ .waitall = true }); try testing.expectEqual(2, try ring.submit()); for (0..2) |i| { const cqe = try ring.copy_cqe(); try testing.expectEqual(1 + i, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); - try testing.expectEqual(buffer_send.len, @as(usize, @intCast(cqe.res))); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); + try testing.expectEqual(@as(i32, buffer_send.len), cqe.res); } // receive var recv_len: usize = 0; while (recv_len < 
buffer_send.len * 2) { - _ = try ring.recv(3, recv_fd, .{ .buffer = buffer_recv[recv_len..] }, 0); + _ = try ring.recv(3, recv_fd, .{ .buffer = buffer_recv[recv_len..] }, .{}); try testing.expectEqual(1, try ring.submit()); const cqe = try ring.copy_cqe(); try testing.expectEqual(3, cqe.user_data); - try testing.expectEqual(posix.E.SUCCESS, cqe.err()); + try testing.expectEqual(linux.E.SUCCESS, cqe.err()); recv_len += @intCast(cqe.res); } diff --git a/lib/std/os/linux/io_uring_sqe.zig b/lib/std/os/linux/io_uring_sqe.zig deleted file mode 100644 index 5658206a66a8..000000000000 --- a/lib/std/os/linux/io_uring_sqe.zig +++ /dev/null @@ -1,679 +0,0 @@ -//! Contains only the definition of `io_uring_sqe`. -//! Split into its own file to compartmentalize the initialization methods. - -const std = @import("../../std.zig"); -const linux = std.os.linux; - -pub const io_uring_sqe = extern struct { - opcode: linux.IORING_OP, - flags: u8, - ioprio: u16, - fd: i32, - off: u64, - addr: u64, - len: u32, - rw_flags: u32, - user_data: u64, - buf_index: u16, - personality: u16, - splice_fd_in: i32, - addr3: u64, - resv: u64, - - pub fn prep_nop(sqe: *linux.io_uring_sqe) void { - sqe.* = .{ - .opcode = .NOP, - .flags = 0, - .ioprio = 0, - .fd = 0, - .off = 0, - .addr = 0, - .len = 0, - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_fsync(sqe: *linux.io_uring_sqe, fd: linux.fd_t, flags: u32) void { - sqe.* = .{ - .opcode = .FSYNC, - .flags = 0, - .ioprio = 0, - .fd = fd, - .off = 0, - .addr = 0, - .len = 0, - .rw_flags = flags, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_rw( - sqe: *linux.io_uring_sqe, - op: linux.IORING_OP, - fd: linux.fd_t, - addr: u64, - len: usize, - offset: u64, - ) void { - sqe.* = .{ - .opcode = op, - .flags = 0, - .ioprio = 0, - .fd = fd, - .off = offset, - .addr = addr, - .len = @intCast(len), - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_read(sqe: *linux.io_uring_sqe, fd: linux.fd_t, buffer: []u8, offset: u64) void { - sqe.prep_rw(.READ, fd, @intFromPtr(buffer.ptr), buffer.len, offset); - } - - pub fn prep_write(sqe: *linux.io_uring_sqe, fd: linux.fd_t, buffer: []const u8, offset: u64) void { - sqe.prep_rw(.WRITE, fd, @intFromPtr(buffer.ptr), buffer.len, offset); - } - - pub fn prep_splice(sqe: *linux.io_uring_sqe, fd_in: linux.fd_t, off_in: u64, fd_out: linux.fd_t, off_out: u64, len: usize) void { - sqe.prep_rw(.SPLICE, fd_out, undefined, len, off_out); - sqe.addr = off_in; - sqe.splice_fd_in = fd_in; - } - - pub fn prep_readv( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - iovecs: []const std.posix.iovec, - offset: u64, - ) void { - sqe.prep_rw(.READV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); - } - - pub fn prep_writev( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - iovecs: []const std.posix.iovec_const, - offset: u64, - ) void { - sqe.prep_rw(.WRITEV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset); - } - - pub fn prep_read_fixed(sqe: *linux.io_uring_sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { - sqe.prep_rw(.READ_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset); - sqe.buf_index = buffer_index; - } - - pub fn prep_write_fixed(sqe: *linux.io_uring_sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void { - 
sqe.prep_rw(.WRITE_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset); - sqe.buf_index = buffer_index; - } - - pub fn prep_accept( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - addr: ?*linux.sockaddr, - addrlen: ?*linux.socklen_t, - flags: u32, - ) void { - // `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to socklen_t`. - // `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). - sqe.prep_rw(.ACCEPT, fd, @intFromPtr(addr), 0, @intFromPtr(addrlen)); - sqe.rw_flags = flags; - } - - pub fn prep_accept_direct( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - addr: ?*linux.sockaddr, - addrlen: ?*linux.socklen_t, - flags: u32, - file_index: u32, - ) void { - prep_accept(sqe, fd, addr, addrlen, flags); - __io_uring_set_target_fixed_file(sqe, file_index); - } - - pub fn prep_multishot_accept_direct( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - addr: ?*linux.sockaddr, - addrlen: ?*linux.socklen_t, - flags: u32, - ) void { - prep_multishot_accept(sqe, fd, addr, addrlen, flags); - __io_uring_set_target_fixed_file(sqe, linux.IORING_FILE_INDEX_ALLOC); - } - - fn __io_uring_set_target_fixed_file(sqe: *linux.io_uring_sqe, file_index: u32) void { - const sqe_file_index: u32 = if (file_index == linux.IORING_FILE_INDEX_ALLOC) - linux.IORING_FILE_INDEX_ALLOC - else - // 0 means no fixed files, indexes should be encoded as "index + 1" - file_index + 1; - // This filed is overloaded in liburing: - // splice_fd_in: i32 - // sqe_file_index: u32 - sqe.splice_fd_in = @bitCast(sqe_file_index); - } - - pub fn prep_connect( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - addr: *const linux.sockaddr, - addrlen: linux.socklen_t, - ) void { - // `addrlen` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32). - sqe.prep_rw(.CONNECT, fd, @intFromPtr(addr), 0, addrlen); - } - - pub fn prep_epoll_ctl( - sqe: *linux.io_uring_sqe, - epfd: linux.fd_t, - fd: linux.fd_t, - op: u32, - ev: ?*linux.epoll_event, - ) void { - sqe.prep_rw(.EPOLL_CTL, epfd, @intFromPtr(ev), op, @intCast(fd)); - } - - pub fn prep_recv(sqe: *linux.io_uring_sqe, fd: linux.fd_t, buffer: []u8, flags: u32) void { - sqe.prep_rw(.RECV, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; - } - - pub fn prep_recv_multishot( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - buffer: []u8, - flags: u32, - ) void { - sqe.prep_recv(fd, buffer, flags); - sqe.ioprio |= linux.IORING_RECV_MULTISHOT; - } - - pub fn prep_recvmsg( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - msg: *linux.msghdr, - flags: u32, - ) void { - sqe.prep_rw(.RECVMSG, fd, @intFromPtr(msg), 1, 0); - sqe.rw_flags = flags; - } - - pub fn prep_recvmsg_multishot( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - msg: *linux.msghdr, - flags: u32, - ) void { - sqe.prep_recvmsg(fd, msg, flags); - sqe.ioprio |= linux.IORING_RECV_MULTISHOT; - } - - pub fn prep_send(sqe: *linux.io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: u32) void { - sqe.prep_rw(.SEND, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; - } - - pub fn prep_send_zc(sqe: *linux.io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: u32, zc_flags: u16) void { - sqe.prep_rw(.SEND_ZC, fd, @intFromPtr(buffer.ptr), buffer.len, 0); - sqe.rw_flags = flags; - sqe.ioprio = zc_flags; - } - - pub fn prep_send_zc_fixed(sqe: *linux.io_uring_sqe, fd: linux.fd_t, buffer: []const u8, flags: u32, zc_flags: u16, buf_index: u16) void { - prep_send_zc(sqe, fd, buffer, flags, zc_flags); - sqe.ioprio |= linux.IORING_RECVSEND_FIXED_BUF; - sqe.buf_index = 
buf_index; - } - - pub fn prep_sendmsg_zc( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - msg: *const linux.msghdr_const, - flags: u32, - ) void { - prep_sendmsg(sqe, fd, msg, flags); - sqe.opcode = .SENDMSG_ZC; - } - - pub fn prep_sendmsg( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - msg: *const linux.msghdr_const, - flags: u32, - ) void { - sqe.prep_rw(.SENDMSG, fd, @intFromPtr(msg), 1, 0); - sqe.rw_flags = flags; - } - - pub fn prep_openat( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - path: [*:0]const u8, - flags: linux.O, - mode: linux.mode_t, - ) void { - sqe.prep_rw(.OPENAT, fd, @intFromPtr(path), mode, 0); - sqe.rw_flags = @bitCast(flags); - } - - pub fn prep_openat_direct( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - path: [*:0]const u8, - flags: linux.O, - mode: linux.mode_t, - file_index: u32, - ) void { - prep_openat(sqe, fd, path, flags, mode); - __io_uring_set_target_fixed_file(sqe, file_index); - } - - pub fn prep_close(sqe: *linux.io_uring_sqe, fd: linux.fd_t) void { - sqe.* = .{ - .opcode = .CLOSE, - .flags = 0, - .ioprio = 0, - .fd = fd, - .off = 0, - .addr = 0, - .len = 0, - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_close_direct(sqe: *linux.io_uring_sqe, file_index: u32) void { - prep_close(sqe, 0); - __io_uring_set_target_fixed_file(sqe, file_index); - } - - pub fn prep_timeout( - sqe: *linux.io_uring_sqe, - ts: *const linux.kernel_timespec, - count: u32, - flags: u32, - ) void { - sqe.prep_rw(.TIMEOUT, -1, @intFromPtr(ts), 1, count); - sqe.rw_flags = flags; - } - - pub fn prep_timeout_remove(sqe: *linux.io_uring_sqe, timeout_user_data: u64, flags: u32) void { - sqe.* = .{ - .opcode = .TIMEOUT_REMOVE, - .flags = 0, - .ioprio = 0, - .fd = -1, - .off = 0, - .addr = timeout_user_data, - .len = 0, - .rw_flags = flags, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_link_timeout( - sqe: *linux.io_uring_sqe, - ts: *const linux.kernel_timespec, - flags: u32, - ) void { - sqe.prep_rw(.LINK_TIMEOUT, -1, @intFromPtr(ts), 1, 0); - sqe.rw_flags = flags; - } - - pub fn prep_poll_add( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - poll_mask: u32, - ) void { - sqe.prep_rw(.POLL_ADD, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0); - // Poll masks previously used to comprise of 16 bits in the flags union of - // a SQE, but were then extended to comprise of 32 bits in order to make - // room for additional option flags. To ensure that the correct bits of - // poll masks are consistently and properly read across multiple kernel - // versions, poll masks are enforced to be little-endian. - // https://www.spinics.net/lists/io-uring/msg02848.html - sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); - } - - pub fn prep_poll_remove( - sqe: *linux.io_uring_sqe, - target_user_data: u64, - ) void { - sqe.prep_rw(.POLL_REMOVE, -1, target_user_data, 0, 0); - } - - pub fn prep_poll_update( - sqe: *linux.io_uring_sqe, - old_user_data: u64, - new_user_data: u64, - poll_mask: u32, - flags: u32, - ) void { - sqe.prep_rw(.POLL_REMOVE, -1, old_user_data, flags, new_user_data); - // Poll masks previously used to comprise of 16 bits in the flags union of - // a SQE, but were then extended to comprise of 32 bits in order to make - // room for additional option flags. 
To ensure that the correct bits of - // poll masks are consistently and properly read across multiple kernel - // versions, poll masks are enforced to be little-endian. - // https://www.spinics.net/lists/io-uring/msg02848.html - sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask); - } - - pub fn prep_fallocate( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - mode: i32, - offset: u64, - len: u64, - ) void { - sqe.* = .{ - .opcode = .FALLOCATE, - .flags = 0, - .ioprio = 0, - .fd = fd, - .off = offset, - .addr = len, - .len = @intCast(mode), - .rw_flags = 0, - .user_data = 0, - .buf_index = 0, - .personality = 0, - .splice_fd_in = 0, - .addr3 = 0, - .resv = 0, - }; - } - - pub fn prep_statx( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - path: [*:0]const u8, - flags: u32, - mask: u32, - buf: *linux.Statx, - ) void { - sqe.prep_rw(.STATX, fd, @intFromPtr(path), mask, @intFromPtr(buf)); - sqe.rw_flags = flags; - } - - pub fn prep_cancel( - sqe: *linux.io_uring_sqe, - cancel_user_data: u64, - flags: u32, - ) void { - sqe.prep_rw(.ASYNC_CANCEL, -1, cancel_user_data, 0, 0); - sqe.rw_flags = flags; - } - - pub fn prep_cancel_fd( - sqe: *linux.io_uring_sqe, - fd: linux.fd_t, - flags: u32, - ) void { - sqe.prep_rw(.ASYNC_CANCEL, fd, 0, 0, 0); - sqe.rw_flags = flags | linux.IORING_ASYNC_CANCEL_FD; - } - - pub fn prep_shutdown( - sqe: *linux.io_uring_sqe, - sockfd: linux.socket_t, - how: u32, - ) void { - sqe.prep_rw(.SHUTDOWN, sockfd, 0, how, 0); - } - - pub fn prep_renameat( - sqe: *linux.io_uring_sqe, - old_dir_fd: linux.fd_t, - old_path: [*:0]const u8, - new_dir_fd: linux.fd_t, - new_path: [*:0]const u8, - flags: u32, - ) void { - sqe.prep_rw( - .RENAMEAT, - old_dir_fd, - @intFromPtr(old_path), - 0, - @intFromPtr(new_path), - ); - sqe.len = @bitCast(new_dir_fd); - sqe.rw_flags = flags; - } - - pub fn prep_unlinkat( - sqe: *linux.io_uring_sqe, - dir_fd: linux.fd_t, - path: [*:0]const u8, - flags: u32, - ) void { - sqe.prep_rw(.UNLINKAT, dir_fd, @intFromPtr(path), 0, 0); - sqe.rw_flags = flags; - } - - pub fn prep_mkdirat( - sqe: *linux.io_uring_sqe, - dir_fd: linux.fd_t, - path: [*:0]const u8, - mode: linux.mode_t, - ) void { - sqe.prep_rw(.MKDIRAT, dir_fd, @intFromPtr(path), mode, 0); - } - - pub fn prep_symlinkat( - sqe: *linux.io_uring_sqe, - target: [*:0]const u8, - new_dir_fd: linux.fd_t, - link_path: [*:0]const u8, - ) void { - sqe.prep_rw( - .SYMLINKAT, - new_dir_fd, - @intFromPtr(target), - 0, - @intFromPtr(link_path), - ); - } - - pub fn prep_linkat( - sqe: *linux.io_uring_sqe, - old_dir_fd: linux.fd_t, - old_path: [*:0]const u8, - new_dir_fd: linux.fd_t, - new_path: [*:0]const u8, - flags: u32, - ) void { - sqe.prep_rw( - .LINKAT, - old_dir_fd, - @intFromPtr(old_path), - 0, - @intFromPtr(new_path), - ); - sqe.len = @bitCast(new_dir_fd); - sqe.rw_flags = flags; - } - - pub fn prep_files_update( - sqe: *linux.io_uring_sqe, - fds: []const linux.fd_t, - offset: u32, - ) void { - sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, @intCast(offset)); - } - - pub fn prep_files_update_alloc( - sqe: *linux.io_uring_sqe, - fds: []linux.fd_t, - ) void { - sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, linux.IORING_FILE_INDEX_ALLOC); - } - - pub fn prep_provide_buffers( - sqe: *linux.io_uring_sqe, - buffers: [*]u8, - buffer_len: usize, - num: usize, - group_id: usize, - buffer_id: usize, - ) void { - const ptr = @intFromPtr(buffers); - sqe.prep_rw(.PROVIDE_BUFFERS, @intCast(num), ptr, buffer_len, buffer_id); - sqe.buf_index = @intCast(group_id); - } - - pub fn 
-    pub fn prep_remove_buffers(
-        sqe: *linux.io_uring_sqe,
-        num: usize,
-        group_id: usize,
-    ) void {
-        sqe.prep_rw(.REMOVE_BUFFERS, @intCast(num), 0, 0, 0);
-        sqe.buf_index = @intCast(group_id);
-    }
-
-    pub fn prep_multishot_accept(
-        sqe: *linux.io_uring_sqe,
-        fd: linux.fd_t,
-        addr: ?*linux.sockaddr,
-        addrlen: ?*linux.socklen_t,
-        flags: u32,
-    ) void {
-        prep_accept(sqe, fd, addr, addrlen, flags);
-        sqe.ioprio |= linux.IORING_ACCEPT_MULTISHOT;
-    }
-
-    pub fn prep_socket(
-        sqe: *linux.io_uring_sqe,
-        domain: u32,
-        socket_type: u32,
-        protocol: u32,
-        flags: u32,
-    ) void {
-        sqe.prep_rw(.SOCKET, @intCast(domain), 0, protocol, socket_type);
-        sqe.rw_flags = flags;
-    }
-
-    pub fn prep_socket_direct(
-        sqe: *linux.io_uring_sqe,
-        domain: u32,
-        socket_type: u32,
-        protocol: u32,
-        flags: u32,
-        file_index: u32,
-    ) void {
-        prep_socket(sqe, domain, socket_type, protocol, flags);
-        __io_uring_set_target_fixed_file(sqe, file_index);
-    }
-
-    pub fn prep_socket_direct_alloc(
-        sqe: *linux.io_uring_sqe,
-        domain: u32,
-        socket_type: u32,
-        protocol: u32,
-        flags: u32,
-    ) void {
-        prep_socket(sqe, domain, socket_type, protocol, flags);
-        __io_uring_set_target_fixed_file(sqe, linux.IORING_FILE_INDEX_ALLOC);
-    }
-
-    pub fn prep_waitid(
-        sqe: *linux.io_uring_sqe,
-        id_type: linux.P,
-        id: i32,
-        infop: *linux.siginfo_t,
-        options: u32,
-        flags: u32,
-    ) void {
-        sqe.prep_rw(.WAITID, id, 0, @intFromEnum(id_type), @intFromPtr(infop));
-        sqe.rw_flags = flags;
-        sqe.splice_fd_in = @bitCast(options);
-    }
-
-    pub fn prep_bind(
-        sqe: *linux.io_uring_sqe,
-        fd: linux.fd_t,
-        addr: *const linux.sockaddr,
-        addrlen: linux.socklen_t,
-        flags: u32,
-    ) void {
-        sqe.prep_rw(.BIND, fd, @intFromPtr(addr), 0, addrlen);
-        sqe.rw_flags = flags;
-    }
-
-    pub fn prep_listen(
-        sqe: *linux.io_uring_sqe,
-        fd: linux.fd_t,
-        backlog: usize,
-        flags: u32,
-    ) void {
-        sqe.prep_rw(.LISTEN, fd, 0, backlog, 0);
-        sqe.rw_flags = flags;
-    }
-
-    pub fn prep_cmd_sock(
-        sqe: *linux.io_uring_sqe,
-        cmd_op: linux.IO_URING_SOCKET_OP,
-        fd: linux.fd_t,
-        level: u32,
-        optname: u32,
-        optval: u64,
-        optlen: u32,
-    ) void {
-        sqe.prep_rw(.URING_CMD, fd, 0, 0, 0);
-        // off is overloaded with cmd_op, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L39
-        sqe.off = @intFromEnum(cmd_op);
-        // addr is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L46
-        sqe.addr = @bitCast(packed struct {
-            level: u32,
-            optname: u32,
-        }{
-            .level = level,
-            .optname = optname,
-        });
-        // splice_fd_in is an overloaded u32 -> i32
-        sqe.splice_fd_in = @bitCast(optlen);
-        // addr3 is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L102
-        sqe.addr3 = optval;
-    }
-
-    pub fn set_flags(sqe: *linux.io_uring_sqe, flags: u8) void {
-        sqe.flags |= flags;
-    }
-
-    /// This SQE forms a link with the next SQE in the submission ring. Next SQE
-    /// will not be started before this one completes. Forms a chain of SQEs.
-    pub fn link_next(sqe: *linux.io_uring_sqe) void {
-        sqe.flags |= linux.IOSQE_IO_LINK;
-    }
-};
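Before the next file: the deleted prep_cmd_sock above demonstrates the SQE field-overloading idiom, where two 32-bit values travel in a single overloaded 64-bit field via an anonymous packed struct. A hedged, standalone Zig sketch of that packing pattern (local demo type, not the std API):

const std = @import("std");

// packed struct(u64) places the first field in the least significant
// bits, so `level` ends up in the low half of the word.
const SockCmdArg = packed struct(u64) {
    level: u32,
    optname: u32,
};

test "pack two u32 values into one overloaded u64 field" {
    const addr: u64 = @bitCast(SockCmdArg{ .level = 1, .optname = 2 });
    try std.testing.expectEqual(@as(u64, 1) | (@as(u64, 2) << 32), addr);

    // The receiver can round-trip the overloaded field back into its parts.
    const parts: SockCmdArg = @bitCast(addr);
    try std.testing.expectEqual(@as(u32, 2), parts.optname);
}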
diff --git a/lib/std/os/linux/test.zig b/lib/std/os/linux/test.zig
index 6b658b87c0c0..50b3658d671c 100644
--- a/lib/std/os/linux/test.zig
+++ b/lib/std/os/linux/test.zig
@@ -45,34 +45,33 @@ test "timer" {
     var err: linux.E = linux.E.init(epoll_fd);
     try expect(err == .SUCCESS);
 
-    const timer_fd = linux.timerfd_create(linux.TIMERFD_CLOCK.MONOTONIC, .{});
+    const timer_fd = linux.timerfd_create(.MONOTONIC, .{});
     try expect(linux.E.init(timer_fd) == .SUCCESS);
 
-    const time_interval = linux.timespec{
+    const time_interval: linux.timespec = .{
         .sec = 0,
         .nsec = 2000000,
     };
 
-    const new_time = linux.itimerspec{
+    const new_time: linux.itimerspec = .{
         .it_interval = time_interval,
         .it_value = time_interval,
     };
 
-    err = linux.E.init(linux.timerfd_settime(@as(i32, @intCast(timer_fd)), .{}, &new_time, null));
+    err = linux.E.init(linux.timerfd_settime(@intCast(timer_fd), .{}, &new_time, null));
     try expect(err == .SUCCESS);
 
-    var event = linux.epoll_event{
+    var event: linux.epoll_event = .{
         .events = linux.EPOLL.IN | linux.EPOLL.OUT | linux.EPOLL.ET,
-        .data = linux.epoll_data{ .ptr = 0 },
+        .data = .{ .ptr = 0 },
     };
 
-    err = linux.E.init(linux.epoll_ctl(@as(i32, @intCast(epoll_fd)), linux.EPOLL.CTL_ADD, @as(i32, @intCast(timer_fd)), &event));
+    err = linux.E.init(linux.epoll_ctl(@intCast(epoll_fd), .ctl_add, @intCast(timer_fd), &event));
     try expect(err == .SUCCESS);
 
-    const events_one: linux.epoll_event = undefined;
-    var events = [_]linux.epoll_event{events_one} ** 8;
+    var events: [8]linux.epoll_event = @splat(undefined);
 
-    err = linux.E.init(linux.epoll_wait(@as(i32, @intCast(epoll_fd)), &events, 8, -1));
+    err = linux.E.init(linux.epoll_wait(@intCast(epoll_fd), &events, 8, -1));
     try expect(err == .SUCCESS);
 }
 
@@ -85,7 +84,7 @@ test "statx" {
     defer file.close();
 
     var statx_buf: linux.Statx = undefined;
-    switch (linux.E.init(linux.statx(file.handle, "", linux.AT.EMPTY_PATH, linux.STATX_BASIC_STATS, &statx_buf))) {
+    switch (linux.E.init(linux.statx(file.handle, "", .{ .empty_path = true }, linux.Statx.Mask.basic_stats, &statx_buf))) {
         .SUCCESS => {},
         else => unreachable,
    }
@@ -93,17 +92,17 @@
     if (builtin.cpu.arch == .riscv32 or builtin.cpu.arch.isLoongArch()) return error.SkipZigTest; // No fstatat, so the rest of the test is meaningless.
 
     var stat_buf: linux.Stat = undefined;
-    switch (linux.E.init(linux.fstatat(file.handle, "", &stat_buf, linux.AT.EMPTY_PATH))) {
+    switch (linux.E.init(linux.fstatat(file.handle, "", &stat_buf, .{ .empty_path = true }))) {
         .SUCCESS => {},
         else => unreachable,
     }
 
     try expect(stat_buf.mode == statx_buf.mode);
-    try expect(@as(u32, @bitCast(stat_buf.uid)) == statx_buf.uid);
-    try expect(@as(u32, @bitCast(stat_buf.gid)) == statx_buf.gid);
-    try expect(@as(u64, @bitCast(@as(i64, stat_buf.size))) == statx_buf.size);
-    try expect(@as(u64, @bitCast(@as(i64, stat_buf.blksize))) == statx_buf.blksize);
-    try expect(@as(u64, @bitCast(@as(i64, stat_buf.blocks))) == statx_buf.blocks);
+    try expect(stat_buf.uid == statx_buf.uid);
+    try expect(stat_buf.gid == statx_buf.gid);
+    try expect(stat_buf.size == statx_buf.size);
+    try expect(stat_buf.blksize == statx_buf.blksize);
+    try expect(stat_buf.blocks == statx_buf.blocks);
 }
 
 test "user and group ids" {
@@ -190,39 +189,39 @@ comptime {
     assert(256 == @as(u32, @bitCast(linux.FUTEX_OP{ .cmd = @enumFromInt(0), .private = false, .realtime = true })));
 
     // Check futex_param4 union is packed correctly
-    const param_union = linux.futex_param4{
+    const param_union: linux.futex_param4 = .{
         .val2 = 0xaabbcc,
     };
     assert(@intFromPtr(param_union.timeout) == 0xaabbcc);
 }
 
 test "futex v1" {
-    var lock: std.atomic.Value(u32) = std.atomic.Value(u32).init(1);
+    var lock: std.atomic.Value(u32) = .init(1);
     var rc: usize = 0;
 
     // No-op wait, lock value is not expected value
-    rc = linux.futex(&lock.raw, .{ .cmd = .WAIT, .private = true }, 2, .{ .timeout = null }, null, 0);
+    rc = linux.futex(&lock, .{ .cmd = .WAIT, .private = true }, 2, .{ .timeout = null }, null, 0);
     try expectEqual(.AGAIN, linux.E.init(rc));
-    rc = linux.futex_4arg(&lock.raw, .{ .cmd = .WAIT, .private = true }, 2, null);
+    rc = linux.futex_4arg(&lock, .{ .cmd = .WAIT, .private = true }, 2, null);
     try expectEqual(.AGAIN, linux.E.init(rc));
 
     // Short-fuse wait, timeout kicks in
-    rc = linux.futex(&lock.raw, .{ .cmd = .WAIT, .private = true }, 1, .{ .timeout = &.{ .sec = 0, .nsec = 2 } }, null, 0);
+    rc = linux.futex(&lock, .{ .cmd = .WAIT, .private = true }, 1, .{ .timeout = &.{ .sec = 0, .nsec = 2 } }, null, 0);
     try expectEqual(.TIMEDOUT, linux.E.init(rc));
-    rc = linux.futex_4arg(&lock.raw, .{ .cmd = .WAIT, .private = true }, 1, &.{ .sec = 0, .nsec = 2 });
+    rc = linux.futex_4arg(&lock, .{ .cmd = .WAIT, .private = true }, 1, &.{ .sec = 0, .nsec = 2 });
     try expectEqual(.TIMEDOUT, linux.E.init(rc));
 
     // Wakeup (no waiters)
-    rc = linux.futex(&lock.raw, .{ .cmd = .WAKE, .private = true }, 2, .{ .timeout = null }, null, 0);
+    rc = linux.futex(&lock, .{ .cmd = .WAKE, .private = true }, 2, .{ .timeout = null }, null, 0);
     try expectEqual(0, rc);
-    rc = linux.futex_3arg(&lock.raw, .{ .cmd = .WAKE, .private = true }, 2);
+    rc = linux.futex_3arg(&lock, .{ .cmd = .WAKE, .private = true }, 2);
     try expectEqual(0, rc);
 
     // CMP_REQUEUE - val3 mismatch
-    rc = linux.futex(&lock.raw, .{ .cmd = .CMP_REQUEUE, .private = true }, 2, .{ .val2 = 0 }, null, 99);
+    rc = linux.futex(&lock, .{ .cmd = .CMP_REQUEUE, .private = true }, 2, .{ .val2 = 0 }, null, 99);
     try expectEqual(.AGAIN, linux.E.init(rc));
 
     // CMP_REQUEUE - requeue (but no waiters, so ... not much)
@@ -230,14 +229,14 @@ test "futex v1" {
         const val3 = 1;
         const wake_nr = 3;
         const requeue_max = std.math.maxInt(u31);
-        var target_lock: std.atomic.Value(u32) = std.atomic.Value(u32).init(1);
-        rc = linux.futex(&lock.raw, .{ .cmd = .CMP_REQUEUE, .private = true }, wake_nr, .{ .val2 = requeue_max }, &target_lock.raw, val3);
+        const target_lock: std.atomic.Value(u32) = .init(1);
+        rc = linux.futex(&lock, .{ .cmd = .CMP_REQUEUE, .private = true }, wake_nr, .{ .val2 = requeue_max }, &target_lock, val3);
         try expectEqual(0, rc);
     }
 
     // WAKE_OP - just to see if we can construct the arguments ...
     {
-        var lock2: std.atomic.Value(u32) = std.atomic.Value(u32).init(1);
+        const lock2: std.atomic.Value(u32) = .init(1);
         const wake1_nr = 2;
         const wake2_nr = 3;
         const wake_op = linux.FUTEX_WAKE_OP{
@@ -248,65 +247,66 @@ test "futex v1" {
             .cmdarg = 5,
         };
 
-        rc = linux.futex(&lock.raw, .{ .cmd = .WAKE_OP, .private = true }, wake1_nr, .{ .val2 = wake2_nr }, &lock2.raw, @bitCast(wake_op));
+        rc = linux.futex(&lock, .{ .cmd = .WAKE_OP, .private = true }, wake1_nr, .{ .val2 = wake2_nr }, &lock2, @bitCast(wake_op));
         try expectEqual(0, rc);
     }
 
     // WAIT_BITSET
     {
         // val1 return early
-        rc = linux.futex(&lock.raw, .{ .cmd = .WAIT_BITSET, .private = true }, 2, .{ .timeout = null }, null, 0xfff);
+        rc = linux.futex(&lock, .{ .cmd = .WAIT_BITSET, .private = true }, 2, .{ .timeout = null }, null, 0xfff);
        try expectEqual(.AGAIN, linux.E.init(rc));
 
         // timeout wait
         const timeout: linux.timespec = .{ .sec = 0, .nsec = 2 };
-        rc = linux.futex(&lock.raw, .{ .cmd = .WAIT_BITSET, .private = true }, 1, .{ .timeout = &timeout }, null, 0xfff);
+        rc = linux.futex(&lock, .{ .cmd = .WAIT_BITSET, .private = true }, 1, .{ .timeout = &timeout }, null, 0xfff);
         try expectEqual(.TIMEDOUT, linux.E.init(rc));
     }
 
     // WAKE_BITSET
     {
-        rc = linux.futex(&lock.raw, .{ .cmd = .WAKE_BITSET, .private = true }, 2, .{ .timeout = null }, null, 0xfff000);
+        rc = linux.futex(&lock, .{ .cmd = .WAKE_BITSET, .private = true }, 2, .{ .timeout = null }, null, 0xfff000);
         try expectEqual(0, rc);
 
         // bitmask must have at least 1 bit set:
-        rc = linux.futex(&lock.raw, .{ .cmd = .WAKE_BITSET, .private = true }, 2, .{ .timeout = null }, null, 0);
+        rc = linux.futex(&lock, .{ .cmd = .WAKE_BITSET, .private = true }, 2, .{ .timeout = null }, null, 0);
         try expectEqual(.INVAL, linux.E.init(rc));
     }
 }
 
 comptime {
-    assert(2 == @as(u32, @bitCast(linux.FUTEX2_FLAGS{ .size = .U32, .private = false })));
-    assert(128 == @as(u32, @bitCast(linux.FUTEX2_FLAGS{ .size = @enumFromInt(0), .private = true })));
+    std.debug.assert(2 == @as(u32, @bitCast(linux.Futex2.Wait{ .size = .U32, .private = false })));
+    std.debug.assert(128 == @as(u32, @bitCast(linux.Futex2.Wait{ .size = @enumFromInt(0), .private = true })));
 }
 
 test "futex2_waitv" {
-    const locks = [_]std.atomic.Value(u32){
-        std.atomic.Value(u32).init(1),
-        std.atomic.Value(u32).init(1),
-        std.atomic.Value(u32).init(1),
+    const locks: [3]std.atomic.Value(u32) = .{
+        .init(1),
+        .init(1),
+        .init(1),
     };
 
-    const futexes = [_]linux.futex2_waitone{
+    const futexes: [3]linux.Futex2.WaitOne = .{
         .{
             .val = 1,
-            .uaddr = @intFromPtr(&locks[0].raw),
+            .uaddr = @intFromPtr(&locks[0]),
             .flags = .{ .size = .U32, .private = true },
         },
         .{
             .val = 1,
-            .uaddr = @intFromPtr(&locks[1].raw),
+            .uaddr = @intFromPtr(&locks[1]),
             .flags = .{ .size = .U32, .private = true },
         },
         .{
             .val = 1,
-            .uaddr = @intFromPtr(&locks[2].raw),
+            .uaddr = @intFromPtr(&locks[2]),
             .flags = .{ .size = .U32, .private = true },
         },
     };
-    const timeout = linux.kernel_timespec{ .sec = 0, .nsec = 2 }; // absolute timeout, so this is 1970...
-    const rc = linux.futex2_waitv(&futexes, futexes.len, .{}, &timeout, .MONOTONIC);
+    // absolute timeout, so this is 1970...
+    const timeout: linux.kernel_timespec = .{ .sec = 0, .nsec = 2 };
+    const rc = linux.futex2_waitv(futexes[0..], .{}, &timeout, .MONOTONIC);
     switch (linux.E.init(rc)) {
         .NOSYS => return error.SkipZigTest, // futex2_waitv added in kernel v5.16
         else => |err| try expectEqual(.TIMEDOUT, err),
@@ -316,40 +316,40 @@
 // Futex v2 API is only supported on recent kernels (v6.7), so skip tests if the syscalls
 // return ENOSYS.
 fn futex2_skip_if_unsupported() !void {
-    const lock: u32 = 0;
-    const rc = linux.futex2_wake(&lock, 0, 1, .{ .size = .U32, .private = true });
+    const lock: std.atomic.Value(u32) = .init(0);
+    const rc = linux.futex2_wake(&lock, .empty, 1, .{ .size = .U32, .private = true });
     if (linux.E.init(rc) == .NOSYS) {
         return error.SkipZigTest;
     }
 }
 
 test "futex2_wait" {
-    var lock: std.atomic.Value(u32) = std.atomic.Value(u32).init(1);
+    const lock: std.atomic.Value(u32) = .init(1);
     var rc: usize = 0;
-    const mask = 0x1;
+    const mask: linux.Futex2.Bitset = .{ .waiter1 = true };
 
     try futex2_skip_if_unsupported();
 
     // The API for 8,16,64 bit futexes is defined, but as of kernel v6.14
     // (at least) they're not implemented.
     if (false) {
-        rc = linux.futex2_wait(&lock.raw, 1, mask, .{ .size = .U8, .private = true }, null, .MONOTONIC);
+        rc = linux.futex2_wait(&lock, 1, mask, .{ .size = .U8, .private = true }, null, .MONOTONIC);
         try expectEqual(.INVAL, linux.E.init(rc));
-        rc = linux.futex2_wait(&lock.raw, 1, mask, .{ .size = .U16, .private = true }, null, .MONOTONIC);
+        rc = linux.futex2_wait(&lock, 1, mask, .{ .size = .U16, .private = true }, null, .MONOTONIC);
         try expectEqual(.INVAL, linux.E.init(rc));
-        rc = linux.futex2_wait(&lock.raw, 1, mask, .{ .size = .U64, .private = true }, null, .MONOTONIC);
+        rc = linux.futex2_wait(&lock, 1, mask, .{ .size = .U64, .private = true }, null, .MONOTONIC);
         try expectEqual(.INVAL, linux.E.init(rc));
     }
 
-    const flags = linux.FUTEX2_FLAGS{ .size = .U32, .private = true };
+    const flags: linux.Futex2.Wait = .{ .size = .U32, .private = true };
 
     // no-wait, lock state mismatch
-    rc = linux.futex2_wait(&lock.raw, 2, mask, flags, null, .MONOTONIC);
+    rc = linux.futex2_wait(&lock, 2, mask, flags, null, .MONOTONIC);
     try expectEqual(.AGAIN, linux.E.init(rc));
 
     // hit timeout on wait
-    rc = linux.futex2_wait(&lock.raw, 1, mask, flags, &.{ .sec = 0, .nsec = 2 }, .MONOTONIC);
+    rc = linux.futex2_wait(&lock, 1, mask, flags, &.{ .sec = 0, .nsec = 2 }, .MONOTONIC);
     try expectEqual(.TIMEDOUT, linux.E.init(rc));
 
     // timeout is absolute
@@ -363,40 +363,40 @@ test "futex2_wait" {
             .sec = curr.sec,
             .nsec = curr.nsec + 2,
        };
-        rc = linux.futex2_wait(&lock.raw, 1, mask, flags, &timeout, .MONOTONIC);
+        rc = linux.futex2_wait(&lock, 1, mask, flags, &timeout, .MONOTONIC);
         try expectEqual(.TIMEDOUT, linux.E.init(rc));
     }
 
-    rc = linux.futex2_wait(&lock.raw, 1, mask, flags, &.{ .sec = 0, .nsec = 2 }, .REALTIME);
+    rc = linux.futex2_wait(&lock, 1, mask, flags, &.{ .sec = 0, .nsec = 2 }, .REALTIME);
     try expectEqual(.TIMEDOUT, linux.E.init(rc));
 }
 
 test "futex2_wake" {
-    var lock: std.atomic.Value(u32) = std.atomic.Value(u32).init(1);
+    const lock: std.atomic.Value(u32) = .init(1);
 
     try futex2_skip_if_unsupported();
 
-    const rc = linux.futex2_wake(&lock.raw, 0xFF, 1, .{ .size = .U32, .private = true });
+    const rc = linux.futex2_wake(&lock, .fromInt(0xFF), 1, .{ .size = .U32, .private = true });
     try expectEqual(0, rc);
 }
 
 test "futex2_requeue" {
     try futex2_skip_if_unsupported();
 
-    const locks = [_]std.atomic.Value(u32){
-        std.atomic.Value(u32).init(1),
-        std.atomic.Value(u32).init(1),
+    const locks: [2]std.atomic.Value(u32) = .{
+        .init(1),
+        .init(1),
     };
 
-    const futexes = [_]linux.futex2_waitone{
+    const futexes: [2]linux.Futex2.WaitOne = .{
         .{
             .val = 1,
-            .uaddr = @intFromPtr(&locks[0].raw),
+            .uaddr = @intFromPtr(&locks[0]),
             .flags = .{ .size = .U32, .private = true },
         },
         .{
             .val = 1,
-            .uaddr = @intFromPtr(&locks[1].raw),
+            .uaddr = @intFromPtr(&locks[1]),
             .flags = .{ .size = .U32, .private = true },
         },
     };
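Before the next file: the rewritten futex tests pin down packed-flag layouts with comptime @bitCast asserts, which is a useful pattern on its own. A hedged sketch with a local stand-in type (the bit positions mirror the 2-and-128 checks above, but DemoFlags is a demo type, not linux.Futex2.Wait):

const std = @import("std");

const DemoFlags = packed struct(u32) {
    size: enum(u2) { U8 = 0, U16 = 1, U32 = 2, U64 = 3 } = .U32,
    _pad: u5 = 0,
    private: bool = false,
    _rest: u24 = 0,
};

comptime {
    // .U32 occupies the low two bits, so the default encodes as 2.
    std.debug.assert(2 == @as(u32, @bitCast(DemoFlags{})));
    // `private` sits at bit 7, so it alone encodes as 128.
    std.debug.assert(128 == @as(u32, @bitCast(DemoFlags{ .size = .U8, .private = true })));
}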
diff --git a/lib/std/posix.zig b/lib/std/posix.zig
index 1a969210aa78..b51be9f6513d 100644
--- a/lib/std/posix.zig
+++ b/lib/std/posix.zig
@@ -3953,7 +3953,7 @@ pub fn fstatatZ(dirfd: fd_t, pathname: [*:0]const u8, flags: u32) FStatAtError!Stat {
     const fstatat_sym = if (lfs64_abi) system.fstatat64 else system.fstatat;
 
     var stat = mem.zeroes(Stat);
-    switch (errno(fstatat_sym(dirfd, pathname, &stat, flags))) {
+    switch (errno(fstatat_sym(dirfd, pathname, &stat, @bitCast(flags)))) {
         .SUCCESS => return stat,
         .INVAL => unreachable,
         .BADF => unreachable, // Always a race condition.
diff --git a/lib/std/process/Child.zig b/lib/std/process/Child.zig
index c84c87897277..66961fbb7ed3 100644
--- a/lib/std/process/Child.zig
+++ b/lib/std/process/Child.zig
@@ -523,14 +523,15 @@ fn cleanupStreams(self: *ChildProcess) void {
 }
 
 fn statusToTerm(status: u32) Term {
-    return if (posix.W.IFEXITED(status))
-        Term{ .Exited = posix.W.EXITSTATUS(status) }
-    else if (posix.W.IFSIGNALED(status))
-        Term{ .Signal = posix.W.TERMSIG(status) }
-    else if (posix.W.IFSTOPPED(status))
-        Term{ .Stopped = posix.W.STOPSIG(status) }
+    const w: posix.W = @bitCast(status);
+    return if (w.ifExited())
+        .{ .Exited = w.exitStatus() }
+    else if (w.ifSignaled())
+        .{ .Signal = w.termSig() }
+    else if (w.ifStopped())
+        .{ .Stopped = w.stopSig() }
     else
-        Term{ .Unknown = status };
+        .{ .Unknown = status };
 }
 
 fn spawnPosix(self: *ChildProcess) SpawnError!void {
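As a closing aside, the posix.W packed struct that statusToTerm now bit-casts into replaces the classic macro-style decoding of a wait status. A hedged sketch of that traditional bit logic (standalone helpers following the usual Linux encoding, not the new std methods):

const std = @import("std");

// Classic Linux wait-status layout: low 7 bits carry the terminating
// signal (0 for a normal exit, with 0x7f in the low byte meaning
// stopped), and bits 8..15 carry the exit code or stop signal.
fn ifExited(status: u32) bool {
    return status & 0x7f == 0;
}
fn exitStatus(status: u32) u8 {
    return @truncate(status >> 8);
}
fn ifStopped(status: u32) bool {
    return status & 0xff == 0x7f;
}
fn stopSig(status: u32) u32 {
    return (status >> 8) & 0xff;
}
fn ifSignaled(status: u32) bool {
    return status & 0x7f != 0 and !ifStopped(status);
}
fn termSig(status: u32) u32 {
    return status & 0x7f;
}

test "decode a normal exit with code 3" {
    const status: u32 = 3 << 8; // what wait() reports for exit code 3
    try std.testing.expect(ifExited(status));
    try std.testing.expectEqual(@as(u8, 3), exitStatus(status));
    try std.testing.expect(!ifSignaled(status) and !ifStopped(status));
}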