diff --git a/Docker/Dockerfile.e2e b/Docker/Dockerfile.e2e index 51759bb52..910283d66 100644 --- a/Docker/Dockerfile.e2e +++ b/Docker/Dockerfile.e2e @@ -106,19 +106,21 @@ RUN rustup component add rust-src --toolchain nightly-2026-02-11-x86_64-unknown- # Build lind-boot FROM base as build-lind-boot # NOTE: Using 'make' risks cache invalidation on unrelated Makefile changes -COPY --parents src/lind-boot src/wasmtime src/rawposix src/cage src/threei src/typemap src/fdtables src/sysdefs Makefile rust-toolchain.toml . +COPY --parents src/lind-boot src/wasmtime src/rawposix src/cage src/threei src/typemap src/fdtables src/sysdefs src/lind-perf Makefile rust-toolchain.toml . RUN rm -f src/wasmtime/crates/cage \ src/wasmtime/crates/threei \ src/wasmtime/crates/fdtables \ src/wasmtime/crates/typemap \ src/wasmtime/crates/sysdefs \ src/wasmtime/crates/rawposix \ + src/wasmtime/crates/lind-perf \ && ln -s ../../cage src/wasmtime/crates/cage \ && ln -s ../../threei src/wasmtime/crates/threei \ && ln -s ../../fdtables src/wasmtime/crates/fdtables \ && ln -s ../../typemap src/wasmtime/crates/typemap \ && ln -s ../../sysdefs src/wasmtime/crates/sysdefs \ - && ln -s ../../rawposix src/wasmtime/crates/rawposix + && ln -s ../../rawposix src/wasmtime/crates/rawposix \ + && ln -s ../../lind-perf src/wasmtime/crates/lind-perf RUN make lind-boot diff --git a/src/lind-boot/Cargo.toml b/src/lind-boot/Cargo.toml index 3105f8e94..3ae7105e0 100644 --- a/src/lind-boot/Cargo.toml +++ b/src/lind-boot/Cargo.toml @@ -7,6 +7,12 @@ edition = "2024" disable_signals = ["cage/disable_signals", "wasmtime-lind-multi-process/disable_signals"] secure = ["typemap/secure"] lind_debug = ["wasmtime-lind-common/lind_debug"] +lind_perf = [ + "lind-perf/enabled", + "threei/lind_perf", + "rawposix/lind_perf", + "wasmtime-lind-common/lind_perf", +] debug-dylink = ["wasmtime-lind-dylink/debug-dylink", "wasmtime-lind-multi-process/debug-dylink", "wasmtime-lind-utils/debug-dylink", "wasmtime/debug-dylink"] 
debug-grate-calls = ["wasmtime-lind-3i/debug-grate-calls"] @@ -23,6 +29,7 @@ typemap = { path = "../wasmtime/crates/typemap" } wasmtime-lind-3i = { path = "../wasmtime/crates/lind-3i" } wasmtime = { path = "../wasmtime/crates/wasmtime", features = ["cranelift", "pooling-allocator", "gc", "threads", "demangle", "addr2line", "cache"], default-features = false } wasmtime-wasi = { version = "23.0.0", features = ["preview1"] , default-features = false } +lind-perf = { path = "../wasmtime/crates/lind-perf" } anyhow = { version = "1.0.66", default-features = false } clap = { version = "4", features = ["derive"] } diff --git a/src/lind-boot/src/README.md b/src/lind-boot/src/README.md index 030b09ca9..3e0281f24 100644 --- a/src/lind-boot/src/README.md +++ b/src/lind-boot/src/README.md @@ -10,7 +10,8 @@ At a high level, lind-boot sits at the boundary between the command-line interfa src/ ├── main.rs ├── cli.rs -└── lind-wasmtime/ +├── perf.rs +└── lind_wasmtime/ ├── mod.rs ├── execute.rs ├── host.rs @@ -32,7 +33,10 @@ Supported flags: ```sh --verbose --debug + --precompile + --wasmtime-backtrace --env NAME[=VAL] + --perf[=clock|tsc] ``` ## Design Overview @@ -43,6 +47,11 @@ From the user’s perspective, lind-boot behaves like a conventional process lau Execution begins in main.rs, where command-line arguments are parsed and passed to the core execution logic. The entry point accepts a WebAssembly binary followed by program arguments. +The control flow is: +1. Handle `--precompile` early and exit. +2. If `--perf` is set, try to run in benchmark mode. +3. Otherwise run the normal single-execution path. + ### host.rs Host-side runtime state is encapsulated in `HostCtx`, defined in host.rs. This structure holds the WASI Preview1 context, the WASI threads context, and the Lind multi-process context. @@ -59,9 +68,34 @@ Before execution begins, lind-boot attaches all required host-side APIs to the W Module instantiation occurs in `load_main_module`. 
The WebAssembly module is instantiated inside a Lind cage, after which the runtime checks for and invokes the `main` function because of our glibc modification. The main entry point is then resolved, stack bounds are initialized, and signal and epoch-related state is set up for the main thread of the cage. At this point, the WebAssembly program is fully initialized and starts running code logic. -One responsibility of lind-boot is capturing and managing Wasmtime’s internal `VMContext` pointers. After instantiation, lind-boot extracts the `VMContext` associated with the running instance and stores it in a global table indexed by cage ID. Additional backup instances are created to populate a pool of `VMContext`s that can be reused during grate calls and syscall re-entry. (See more comments on lind-wasm/src/wasmtime/crates/lind-3i) +One responsibility of lind-boot is capturing and managing Wasmtime’s internal `VMContext` pointers. After instantiation, lind-boot extracts the `VMContext` associated with the running instance and stores it in a global table indexed by cage ID. Additional backup instances are created to populate a pool of `VMContext`s that can be reused during grate calls and syscall re-entry. (See more comments on `src/wasmtime/crates/lind-3i`) ### trampoline.rs The re-entry mechanism is implemented in trampoline.rs. When 3i routes a syscall to a grate, it invokes a unified callback function registered by lind-boot. This trampoline retrieves the appropriate `VMContext` for the target cage, re-enters the Wasmtime runtime using `Caller::with`, and invokes a unified entry function inside the WebAssembly module. Control is then dispatched to the appropriate syscall implementation based on the function pointer originally registered with 3i. Once execution completes, the VMContext is returned to the global pool for future use. 
+### perf.rs + +`perf.rs` defines `lind-perf` counters for lind-boot and dependency crates which are used by `main.rs` to run benchmarks. + +`lind-boot` supports performance benchmarking via `--perf`, with optional timer selection: + +```sh +# Default timer backend is CLOCK_MONOTONIC_RAW +lind-boot --perf=clock program.wasm + +# `=clock` is optional +lind-boot --perf program.wasm + +# Cycle-counter backend (RDTSC/RDTSCP on x86_64) +lind-boot --perf=tsc program.wasm +``` + +Perf mode runs the same workload multiple times, enabling one counter per run, then prints a final report. + +Important behavior: + +1. `--perf` is accepted by the CLI regardless of build mode. +2. If lind-boot is compiled without the crate feature `lind_perf`, `--perf` exits early with an explicit error. +3. The `lind_perf` feature in lind-boot enables `lind-perf/enabled`, which turns timing/reporting on. Without it, `lind-perf` stays linked but behaves as no-op. + diff --git a/src/lind-boot/src/cli.rs b/src/lind-boot/src/cli.rs index e0dc954d2..0c93e25d9 100644 --- a/src/lind-boot/src/cli.rs +++ b/src/lind-boot/src/cli.rs @@ -3,6 +3,14 @@ use std::path::PathBuf; use anyhow::{Result, bail}; use clap::*; +#[derive(Debug, Clone, Copy, ValueEnum)] +pub enum PerfTimer { + /// Use `clock_gettime(CLOCK_MONOTONIC_RAW)` based timing. + Clock, + /// Use RDTSC/RDTSCP cycle counter timing. + Tsc, +} + fn parse_preloads(s: &str) -> Result<(String, PathBuf)> { let parts: Vec<&str> = s.splitn(2, '=').collect(); if parts.len() != 2 { @@ -56,6 +64,23 @@ pub struct CliOptions { #[arg(long = "env", number_of_values = 1, value_name = "NAME[=VAL]", value_parser = parse_env_var)] pub vars: Vec<(String, Option)>, + /// Get performance information for the running module. + /// + /// `--perf` defaults to `clock`; pass `--perf=tsc` for cycle-based timing. 
+ /// + /// `--perf` is always accepted by the CLI, but execution only proceeds when + /// lind-boot is compiled with the crate feature `lind_perf` (which wires + /// `lind-perf/enabled`). + #[arg( + long, + value_enum, + default_missing_value = "clock", + value_name = "clock|tsc", + num_args = 0..=1, + require_equals = true, + )] + pub perf: Option, + /// Load the given WebAssembly module before the main module #[arg( long = "preload", @@ -78,4 +103,25 @@ impl CliOptions { pub fn wasm_file(&self) -> &str { &self.args[0] } + + pub fn perf_timer_kind(&self) -> Option { + // Runtime gate for the perf CLI path: + // - if lind-boot was compiled without `lind_perf`, reject `--perf` early + // with a clear error. + // - otherwise map the CLI timer selection to lind-perf's timer backend. + match lind_perf::ENABLED { + false => match self.perf { + Some(_) => { + eprintln!("--perf needs compilation with the feature `lind_perf` enabled."); + std::process::exit(1); + } + None => None, + }, + true => match self.perf { + Some(PerfTimer::Clock) => Some(lind_perf::TimerKind::Clock), + Some(PerfTimer::Tsc) => Some(lind_perf::TimerKind::Rdtsc), + None => None, + }, + } + } } diff --git a/src/lind-boot/src/lind_wasmtime/trampoline.rs b/src/lind-boot/src/lind_wasmtime/trampoline.rs index 683f8ee7e..ebef761fc 100644 --- a/src/lind-boot/src/lind_wasmtime/trampoline.rs +++ b/src/lind-boot/src/lind_wasmtime/trampoline.rs @@ -4,6 +4,8 @@ use threei::threei_const; use wasmtime_lind_3i::*; use wasmtime_lind_multi_process; +use crate::perf; + /// The callback function registered with 3i uses a unified Wasm entry /// function as the single re-entry point into the Wasm executable. /// @@ -27,6 +29,10 @@ pub extern "C" fn grate_callback_trampoline( arg6: u64, arg6cageid: u64, ) -> i32 { + // This timer measures the entire function since it is never explicitly dropped. The timer + // therefore ends only when the function exits. 
+ let _grate_callback_timer = lind_perf::get_timer!(perf::GRATE_CALLBACK_TRAMPOLINE); + // Form the grate request with the provided arguments and the handler address let req = GrateRequest { handler_addr: in_grate_fn_ptr_u64, diff --git a/src/lind-boot/src/main.rs b/src/lind-boot/src/main.rs index 70720d130..1a8cd6db2 100644 --- a/src/lind-boot/src/main.rs +++ b/src/lind-boot/src/main.rs @@ -1,5 +1,6 @@ mod cli; mod lind_wasmtime; +mod perf; use crate::{ cli::CliOptions, @@ -64,10 +65,35 @@ fn main() -> Result<(), Box> { precompile_module(&lindboot_cli)?; return Ok(()); } - // Not a precompile command, chroot to lindfs chroot_to_lindfs(); + // Check if --perf is enabled and available to decide whether to run in benchmarking mode. + if let Some(kind) = lindboot_cli.perf_timer_kind() { + // Initialize all counters. + perf::perf_init(kind); + + let counters = perf::all_counter_names(); + + // Iterate over all counters: + // - Exclusively enable the counter + // - Run the program to gather timing data. + for counter in counters { + perf::enable_one_counter(counter); + + // Each sample run gets a fresh RawPOSIX lifecycle boundary to imitate actual + // behaviour. + rawposix_start(0); + let _ = execute_wasmtime(lindboot_cli.clone()); + rawposix_shutdown(); + } + + // Output final numbers to stdout. + perf::perf_report(); + + return Ok(()); + } + // Initialize RawPOSIX and register RawPOSIX syscalls with 3i rawposix_start(0); diff --git a/src/lind-boot/src/perf.rs b/src/lind-boot/src/perf.rs new file mode 100644 index 000000000..42b0038c4 --- /dev/null +++ b/src/lind-boot/src/perf.rs @@ -0,0 +1,49 @@ +/// lind-boot's perf file binds together every other module's perf file. +/// +/// This involves: +/// - Reading their COUNTERS +/// - Initializing them +/// - Combining all the COUNTERS into one list to iterate over and sequentially enable +/// - Printing a combined lind-perf report.
+use crate::cli::CliOptions; +use lind_perf::{Counter, TimerKind}; + +// These are counters defined within lind-boot. +pub static GRATE_CALLBACK_TRAMPOLINE: Counter = + Counter::new("lind_boot::grate_callback_trampoline"); +pub static TYPED_FUNC_CALL: Counter = Counter::new("lind_boot::typed_func_call"); + +// Counter list used by the perf runner in `main.rs`. Each benchmark iteration +// enables exactly one counter name from this list. +pub static LIND_BOOT_COUNTERS: &[&Counter] = &[&GRATE_CALLBACK_TRAMPOLINE, &TYPED_FUNC_CALL]; + +/// Initialize counters for all modules, involves setting the TimerKind and resetting the +/// counts. +pub fn perf_init(kind: TimerKind) { + // Configure timer backend (Clock or TSC) for all local counters. + lind_perf::set_timer(all_counters(), kind); + // Reset all accumulated measurements before benchmark runs begin. + lind_perf::reset_all_counters(all_counters()); +} + +/// Finds a counter by it's name and searches for it across modules to enable it. Disables all +/// other counters. +pub fn enable_one_counter(name: &str) { + lind_perf::enable_counter_by_name(all_counters(), name); +} + +fn all_counters() -> impl Iterator { + LIND_BOOT_COUNTERS.iter().copied() +} + +/// Get a list of all counter names. +pub fn all_counter_names() -> Vec<&'static str> { + all_counters().filter_map(|c| c.get_name()).collect() +} + +/// Print a report for every module. +pub fn perf_report() { + // Note: `lind_perf::report*` are no-ops when lind-perf is built without + // its internal `enabled` feature. 
+ lind_perf::report(LIND_BOOT_COUNTERS, format!("LIND-BOOT")); +} diff --git a/src/lind-perf/Cargo.toml b/src/lind-perf/Cargo.toml new file mode 100644 index 000000000..6d0147707 --- /dev/null +++ b/src/lind-perf/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "lind-perf" +version = "0.1.0" +edition = "2024" + +[features] +default = [] +enabled = [] + +[dependencies] +libc = "0.2" diff --git a/src/lind-perf/README.md b/src/lind-perf/README.md new file mode 100644 index 000000000..9140f59b9 --- /dev/null +++ b/src/lind-perf/README.md @@ -0,0 +1,88 @@ +# lind-perf + +`lind-perf` is the instrumentation crate used by Lind crates (for example `lind-boot`) to measure hot paths. + +This crate is defined in a manner where the callsites remain clean i.e. without needing conditional flags. This is implemented using the `enabled` feature. + +The public APIs used remain the same, but if the crate is compiled without the `enabled` feature, each operation is a no-op ensuring that the final binary is not polluted with unused codepaths. + +## Build Modes + +`lind-perf` supports two compile-time modes via Cargo feature `enabled`: + +1. `enabled` on: real counter accumulation + real reporting. +2. `enabled` off (default): API-compatible no-op behavior. + +`lind-boot` maps its crate feature `lind_perf` to `lind-perf/enabled`. + +## Public API + +Main exports: + +- `struct Counter` : Responsible for recording information such as cycles spent, calls made for a benchmarking site. +- `TimerKind::{Clock, Rdtsc}` : Clock uses `CLOCK_MONOTONIC_RAW`, Rdtsc: Time Stamp Counter. +- `fn set_timer(...)` : Set timer kind for list of Counters. +- `fn reset_all_counters(...)` : Reset Counters. +- `fn enable_counter_by_name(...)` : Enable Counter that matches the input name, disable the rest. +- `fn report(...)` : Print results from a set of counters. +- `static ENABLED: bool` : Check if `lind-perf` uses the `enabled` feature. 
+ +Macro: + +- `lind_perf::get_timer!(COUNTER_PATH)` : Used to introduce a timer to a scope and start it. + +## Typical Usage + +Define a counter: + +```rust +pub static MY_COUNTER: lind_perf::Counter = lind_perf::Counter::new("my_crate::my_counter"); +``` + +Use timer to time a scope: + +```rust +(|| { + let _timer = lind_perf::get_timer!(crate::perf::MY_COUNTER); // Starts the timer + // measured work +})(); // Timer stops when dropped. +``` + +Timers can also be dropped manually for timing non-scope snippets: + +```rust +let _timer = lind_perf::get_timer!(crate::perf::MY_COUNTER); // Starts the timer +// measured work +drop(_timer); // Explicit drop +``` + +Counters can be enabled or disabled during runtime. The most common use-case for this is to enable counters one at a time, so that the other timers add no overhead to the measurement. + +```rust +lind_perf::set_timer(ALL_COUNTERS, lind_perf::TimerKind::Clock); +lind_perf::reset_all_counters(ALL_COUNTERS); +lind_perf::enable_counter_by_name(ALL_COUNTERS, "my_crate::my_counter"); +``` + +Print report: + +```rust +let header = "MY-CRATE".to_string(); +lind_perf::report(ALL_COUNTERS, header); +``` + +## Disabled Mode Semantics + +When `enabled` is not set: + +- `Counter` is a lightweight no-op type. +- `get_timer!` returns a no-op scope guard. +- `set_timer/reset_all_counters/enable_counter_by_name/report` are no-ops. +- `read_start/read_end` return `0`. + +This allows instrumentation to remain in code without `cfg` guards at callsites. + +## Timer Backends + +- `TimerKind::Clock`: uses `clock_gettime(CLOCK_MONOTONIC_RAW)` in enabled mode. +- `TimerKind::Rdtsc`: uses RDTSC/RDTSCP on `x86_64` (falls back to clock timing on non-`x86_64`). diff --git a/src/lind-perf/src/disabled.rs b/src/lind-perf/src/disabled.rs new file mode 100644 index 000000000..83b3aeefc --- /dev/null +++ b/src/lind-perf/src/disabled.rs @@ -0,0 +1,67 @@ +use crate::timers::TimerKind; + +/// Lightweight no-op counter representation for disabled builds.
+pub struct Counter; + +impl Counter { + pub const fn new(_name: &'static str) -> Self { + Self + } + + pub fn get_name(&self) -> Option<&'static str> { + None + } + + #[inline(always)] + pub fn start(&self) -> u64 { + let _ = self; + 0 + } + + #[inline(always)] + pub fn record(&self, _start: u64) { + let _ = self; + } + + #[inline(always)] + pub fn get_timer(&self) -> Scope { + let _ = self; + Scope + } + + pub fn enable(&self) { + let _ = self; + } + + pub fn disable(&self) { + let _ = self; + } + + pub fn reset(&self) { + let _ = self; + } + + pub fn set_timer_kind(&self, _kind: TimerKind) { + let _ = self; + } + + pub fn timer_kind(&self) -> TimerKind { + TimerKind::Clock + } +} + +/// No-op RAII guard for disabled builds. +pub struct Scope; + +impl Drop for Scope { + fn drop(&mut self) {} +} + +// No-op implementations for the rest. +pub fn reset_all_counters(_counters: impl IntoIterator) {} + +pub fn set_timer(_counters: impl IntoIterator, _kind: TimerKind) {} + +pub fn enable_counter_by_name(_counters: impl IntoIterator, _name: &str) {} + +pub fn report(_counters: &[&Counter], _header: String) {} diff --git a/src/lind-perf/src/enabled/counter.rs b/src/lind-perf/src/enabled/counter.rs new file mode 100644 index 000000000..0d83d74d7 --- /dev/null +++ b/src/lind-perf/src/enabled/counter.rs @@ -0,0 +1,140 @@ +use crate::timers::{TimerKind, default_timer_kind, read_end, read_start}; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU64, Ordering}; + +/// Counter stores information pertaining to a specific benchmarking site. +/// +/// Timing starts when the counter is introduced to the scope using get_timer() and ends once it +/// gets dropped. +pub struct Counter { + /// Counts the total number of CPU cycles or Nanoseconds spent. + pub cycles: AtomicU64, + /// Counts the total number of invocations. + pub calls: AtomicU64, + /// Globally unique name for the counter. 
+ pub name: &'static str, + /// Depending on the workflow being timed, having multiple timers enabled at the same time can + /// lead to unacceptable timing overheads. To avoid this, each Counter can be dynamically + /// enabled or disabled. + /// + /// When disabled, operations such as start() or end() are no-ops. + pub enabled: AtomicBool, + /// Stores TimerKind. + timer: AtomicU8, +} + +impl Counter { + /// Create a counter with the default timer. + pub const fn new(name: &'static str) -> Self { + Self { + cycles: AtomicU64::new(0), + calls: AtomicU64::new(0), + name, + enabled: AtomicBool::new(false), + timer: AtomicU8::new(default_timer_kind() as u8), + } + } + + /// Get the name of the Counter + pub fn get_name(&self) -> Option<&'static str> { + Some(self.name) + } + + #[inline(always)] + /// Start a measurement for this counter. + /// + /// Returns `0` if the counter is disabled. + pub fn start(&self) -> u64 { + if self.enabled.load(Ordering::Relaxed) { + read_start(self.timer_kind()) + } else { + 0 + } + } + + #[inline(always)] + /// Record a measurement using the start timestamp. + /// + /// This is a no-op when the counter is disabled. + pub fn record(&self, start: u64) { + if self.enabled.load(Ordering::Relaxed) { + let elapsed = read_end(self.timer_kind()).saturating_sub(start); + self.cycles.fetch_add(elapsed, Ordering::Relaxed); + self.calls.fetch_add(1, Ordering::Relaxed); + } + } + + #[inline(always)] + /// Create an RAII scope guard that records on drop. + pub fn get_timer(&self) -> Scope<'_> { + Scope { + counter: self, + start: self.start(), + } + } + + /// Enable this counter. + pub fn enable(&self) { + self.enabled.store(true, Ordering::Relaxed); + } + + /// Disable this counter. + pub fn disable(&self) { + self.enabled.store(false, Ordering::Relaxed); + } + + /// Reset totals for this counter. 
+ pub fn reset(&self) { + self.cycles.store(0, Ordering::Relaxed); + self.calls.store(0, Ordering::Relaxed); + } + + /// Set the timer backend for this counter. + pub fn set_timer_kind(&self, kind: TimerKind) { + self.timer.store(kind as u8, Ordering::Relaxed); + } + + /// Read the current timer backend. + pub fn timer_kind(&self) -> TimerKind { + match self.timer.load(Ordering::Relaxed) { + 0 => TimerKind::Rdtsc, + _ => TimerKind::Clock, + } + } +} + +/// Scope is the RAII guard that records elapsed time on drop. +pub struct Scope<'a> { + counter: &'a Counter, + start: u64, +} + +impl Drop for Scope<'_> { + fn drop(&mut self) { + self.counter.record(self.start); + } +} + +/// Reset all counters in a group. +pub fn reset_all_counters(counters: impl IntoIterator) { + for c in counters { + c.reset(); + } +} + +/// Set a timer for a counter group. +pub fn set_timer(counters: impl IntoIterator, kind: TimerKind) { + for c in counters { + c.set_timer_kind(kind); + } +} + +/// Enable only the named counter in a group. +pub fn enable_counter_by_name(counters: impl IntoIterator, name: &str) { + for c in counters { + if c.name == name { + c.enable(); + } else { + c.disable(); + } + } +} diff --git a/src/lind-perf/src/enabled/mod.rs b/src/lind-perf/src/enabled/mod.rs new file mode 100644 index 000000000..5fb15b0bd --- /dev/null +++ b/src/lind-perf/src/enabled/mod.rs @@ -0,0 +1,5 @@ +mod counter; +mod report; + +pub use counter::*; +pub use report::*; diff --git a/src/lind-perf/src/enabled/report.rs b/src/lind-perf/src/enabled/report.rs new file mode 100644 index 000000000..035c07638 --- /dev/null +++ b/src/lind-perf/src/enabled/report.rs @@ -0,0 +1,86 @@ +use crate::{Counter, TimerKind}; +use std::sync::atomic::Ordering; +use std::time::Duration; + +/// Formats nanosecond totals for reports. Converts nanosecond input to larger units where +/// appropriate and truncates to 3 decimal points. 
+pub struct PrettyDuration(pub Duration); + +impl std::fmt::Display for PrettyDuration { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let ns_f = self.0.as_nanos() as f64; + + let format = if ns_f < 1_000.0 { + format!("{:.3}ns", ns_f) + } else if ns_f < 1_000_000.0 { + format!("{:.3}us", ns_f / 1_000.0) + } else if ns_f < 1_000_000_000.0 { + format!("{:.3}ms", ns_f / 1_000_000.0) + } else { + format!("{:.3}s", ns_f / 1_000_000_000.0) + }; + + write!(f, "{}", format) + } +} + +/// Print a report for a counter group. +pub fn report(counters: &[&Counter], header: String) { + const NAME_W: usize = 60; + const CALLS_W: usize = 10; + const NUM_W: usize = 12; + + let pad = "-"; + let total = 97 - header.len(); + let left = total / 2; + let right = total - left; + + println!("\n{}{}{}", pad.repeat(left), header, pad.repeat(right),); + let mut rows: Vec = Vec::new(); + + for c in counters { + let calls = c.calls.load(Ordering::Relaxed); + if calls == 0 { + continue; + } + + let cycles = match c.timer_kind() { + TimerKind::Rdtsc => format!("{:#?}", c.cycles.load(Ordering::Relaxed)), + TimerKind::Clock => format!( + "{}", + PrettyDuration(Duration::from_nanos(c.cycles.load(Ordering::Relaxed))) + ), + }; + + let avg = match c.timer_kind() { + TimerKind::Rdtsc => format!("{:#?}", c.cycles.load(Ordering::Relaxed) / calls), + TimerKind::Clock => format!( + "{}", + PrettyDuration(Duration::from_nanos( + c.cycles.load(Ordering::Relaxed) / calls + )) + ), + }; + + rows.push(format!( + "{:CALLS_W$} {:>NUM_W$} {:>NUM_W$}", + c.name, calls, cycles, avg, + )); + } + + if rows.is_empty() { + return; + } + + println!( + "{:CALLS_W$} {:>NUM_W$} {:>NUM_W$}", + "name", "calls", "total", "avg", + ); + println!("{}", "-".repeat(NAME_W + CALLS_W + NUM_W * 2 + 3)); + + for row in rows { + println!("{}", row); + } + + println!(""); +} diff --git a/src/lind-perf/src/lib.rs b/src/lind-perf/src/lib.rs new file mode 100644 index 000000000..17b465bb4 --- /dev/null +++ 
b/src/lind-perf/src/lib.rs @@ -0,0 +1,27 @@ +mod timers; + +#[cfg(not(feature = "enabled"))] +mod disabled; +#[cfg(feature = "enabled")] +mod enabled; + +pub use timers::*; + +#[cfg(not(feature = "enabled"))] +pub use disabled::*; +#[cfg(feature = "enabled")] +pub use enabled::*; + +// Exported runtime flag used by callers (for example lind-boot CLI handling) +// to decide whether a requested perf mode can actually run. +#[cfg(not(feature = "enabled"))] +pub static ENABLED: bool = false; +#[cfg(feature = "enabled")] +pub static ENABLED: bool = true; + +#[macro_export] +macro_rules! get_timer { + // Always available macro. In disabled builds, `get_timer()` returns a + // no-op scope object from `disabled::Counter`. + ($counter:path) => {{ $counter.get_timer() }}; +} diff --git a/src/lind-perf/src/timers.rs b/src/lind-perf/src/timers.rs new file mode 100644 index 000000000..022e9a308 --- /dev/null +++ b/src/lind-perf/src/timers.rs @@ -0,0 +1,95 @@ +/// TimerKind defines the timer-backend to be used for benchmarks. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum TimerKind { + Rdtsc = 0, + Clock = 1, +} + +/// Get the default timer. +pub const fn default_timer_kind() -> TimerKind { + TimerKind::Clock +} + +/// Public functions to record start and end times depending on the TimerKind being used. +#[inline(always)] +pub fn read_start(_kind: TimerKind) -> u64 { + #[cfg(feature = "enabled")] + match _kind { + TimerKind::Rdtsc => rdtsc_start(), + TimerKind::Clock => clock_now(), + } + + #[cfg(not(feature = "enabled"))] + 0 +} + +#[inline(always)] +pub fn read_end(_kind: TimerKind) -> u64 { + #[cfg(feature = "enabled")] + match _kind { + TimerKind::Rdtsc => rdtsc_end(), + TimerKind::Clock => clock_now(), + } + + #[cfg(not(feature = "enabled"))] + 0 +} + +// RDTSC timers for measuring CPU Cycles. +// Only available on x86 based processors. 
+#[inline(always)] +fn rdtsc_start() -> u64 { + #[cfg(target_arch = "x86_64")] + unsafe { + // Serialize execution before reading the TSC so that + // no prior loads or instructions are speculatively + // reordered past the timestamp read. + // + // See: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence&ig_expand=3977 + core::arch::x86_64::_mm_lfence(); + + // Get the RDTSC counter. + return core::arch::x86_64::_rdtsc(); + } + + #[cfg(not(target_arch = "x86_64"))] + clock_now() +} + +// Separate start/end functions are required for RDTSC because +// fencing semantics differ before and after the measurement. +#[inline(always)] +fn rdtsc_end() -> u64 { + #[cfg(target_arch = "x86_64")] + unsafe { + let mut aux = 0u32; + // RDTSCP is partially serializing: it waits for prior + // instructions to complete before reading the TSC. + let tsc = core::arch::x86_64::__rdtscp(&mut aux); + + // Fence after the read to prevent subsequent instructions + // from being speculatively executed before the timestamp. + core::arch::x86_64::_mm_lfence(); + return tsc; + } + + #[cfg(not(target_arch = "x86_64"))] + clock_now() +} + +// CLOCK_MONOTONIC_RAW based timer used for nanoseconds measurements. Same function can be used for +// start and end. +#[inline(always)] +fn clock_now() -> u64 { + let mut ts = libc::timespec { + tv_sec: 0, + tv_nsec: 0, + }; + let rc = unsafe { libc::clock_gettime(libc::CLOCK_MONOTONIC_RAW, &mut ts) }; + if rc != 0 { + panic!("Unable to get a CLOCK_MONOTONIC_RAW time. 
Aborting benchmarks."); + } + (ts.tv_sec as u64) + .saturating_mul(1_000_000_000) + .saturating_add(ts.tv_nsec as u64) +} diff --git a/src/rawposix/Cargo.toml b/src/rawposix/Cargo.toml index 27cc71a42..aa88c9a88 100644 --- a/src/rawposix/Cargo.toml +++ b/src/rawposix/Cargo.toml @@ -16,10 +16,12 @@ sysdefs = { path = "../sysdefs" } typemap = { path = "../typemap" } cage = { path = "../cage" } threei = { path = "../threei" } +lind-perf = { path = "../lind-perf" } [features] default = ["fast"] fast = [] secure = [] +lind_perf = ["lind-perf/enabled"] [dev-dependencies] diff --git a/src/threei/Cargo.toml b/src/threei/Cargo.toml index e3f838b13..f16f42f4f 100644 --- a/src/threei/Cargo.toml +++ b/src/threei/Cargo.toml @@ -13,11 +13,13 @@ once_cell = "1.18" lazy_static = "1.4" parking_lot = "0.12" nodit = "0.9.2" # Used for VMMAP +lind-perf = { path = "../lind-perf" } [features] -default = ["hashmap"] +default = ["hashmap"] hashmap = [] -dashmap = [] +dashmap = [] +lind_perf = ["lind-perf/enabled"] [dev-dependencies] serial_test = "3" diff --git a/src/wasmtime/crates/lind-common/Cargo.toml b/src/wasmtime/crates/lind-common/Cargo.toml index dfc146711..17e9a9f8e 100644 --- a/src/wasmtime/crates/lind-common/Cargo.toml +++ b/src/wasmtime/crates/lind-common/Cargo.toml @@ -23,6 +23,8 @@ sysdefs = { path = "../sysdefs" } wasmtime-lind-3i = { path = "../lind-3i" } cage = { path = "../cage" } typemap = { path = "../typemap" } +lind-perf = { path = "../lind-perf" } [features] lind_debug = [] +lind_perf = ["lind-perf/enabled", "threei/lind_perf", "rawposix/lind_perf"] diff --git a/src/wasmtime/crates/lind-perf b/src/wasmtime/crates/lind-perf new file mode 120000 index 000000000..fd055b135 --- /dev/null +++ b/src/wasmtime/crates/lind-perf @@ -0,0 +1 @@ +../../lind-perf \ No newline at end of file