diff --git a/Cargo.lock b/Cargo.lock index cffde82832..98f7faecb6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4688,6 +4688,7 @@ name = "nvme_driver" version = "0.0.0" dependencies = [ "anyhow", + "async-trait", "chipset_device", "disklayer_ram", "event-listener", @@ -4699,6 +4700,7 @@ dependencies = [ "mesh", "nvme", "nvme_spec", + "nvme_test", "pal_async", "parking_lot", "pci_core", @@ -4736,6 +4738,40 @@ dependencies = [ "zerocopy 0.8.24", ] +[[package]] +name = "nvme_test" +version = "0.0.0" +dependencies = [ + "async-trait", + "chipset_device", + "device_emulators", + "disk_backend", + "event-listener", + "futures", + "futures-concurrency", + "guestmem", + "guid", + "inspect", + "mesh", + "nvme_common", + "nvme_resources", + "nvme_spec", + "pal_async", + "parking_lot", + "pci_core", + "pci_resources", + "scsi_buffers", + "task_control", + "thiserror 2.0.12", + "tracelimit", + "tracing", + "unicycle", + "user_driver", + "vm_resource", + "vmcore", + "zerocopy 0.8.24", +] + [[package]] name = "object" version = "0.36.7" diff --git a/Cargo.toml b/Cargo.toml index 48382cd0be..6a4cef7a64 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,7 +45,7 @@ members = [ "petri/petri-tool", "vm/loader/igvmfilegen", "vm/vmgs/vmgs_lib", - "vm/vmgs/vmgstool", + "vm/vmgs/vmgstool", ] exclude = [ "xsync", @@ -248,6 +248,7 @@ nvme_common = { path = "vm/devices/storage/nvme_common" } nvme_driver = { path = "vm/devices/storage/disk_nvme/nvme_driver" } nvme_resources = { path = "vm/devices/storage/nvme_resources" } nvme_spec = { path = "vm/devices/storage/nvme_spec" } +nvme_test = { path = "vm/devices/storage/nvme_test" } storage_string = { path = "vm/devices/storage/storage_string" } vmswitch = { path = "vm/devices/net/vmswitch" } pci_bus = { path = "vm/devices/pci/pci_bus" } diff --git a/vm/devices/storage/disk_nvme/nvme_driver/Cargo.toml b/vm/devices/storage/disk_nvme/nvme_driver/Cargo.toml index 1af330a415..c63694be73 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/Cargo.toml +++ b/vm/devices/storage/disk_nvme/nvme_driver/Cargo.toml @@ -36,6 +36,8 @@ pci_core.workspace = true scsi_buffers.workspace = true test_with_tracing.workspace = true user_driver_emulated_mock.workspace = true +nvme_test.workspace = true +async-trait.workspace = true guid.workspace = true diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs index e87f8246fd..a001f16eec 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs @@ -10,13 +10,19 @@ use inspect::Inspect; use inspect::InspectMut; use nvme::NvmeControllerCaps; use nvme_spec::Cap; +use nvme_spec::Command; +use nvme_spec::Completion; use nvme_spec::nvm::DsmRange; +use nvme_test::FaultConfiguration; +use nvme_test::QueueFaultBehavior; use pal_async::DefaultDriver; use pal_async::async_test; +use pal_async::timer::PolledTimer; use parking_lot::Mutex; use pci_core::msi::MsiInterruptSet; use scsi_buffers::OwnedRequestBuffers; use std::sync::Arc; +use std::time::Duration; use test_with_tracing::test; use user_driver::DeviceBacking; use user_driver::DeviceRegisterIo; @@ -26,9 +32,57 @@ use user_driver_emulated_mock::DeviceTestMemory; use user_driver_emulated_mock::EmulatedDevice; use user_driver_emulated_mock::Mapping; use vmcore::vm_task::SingleDriverBackend; +use vmcore::vm_task::VmTaskDriver; use vmcore::vm_task::VmTaskDriverSource; use zerocopy::IntoBytes; +struct AdminQueueFault { + pub driver: VmTaskDriver, +} + 
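+// (Editor's note) `QueueFault` is the nvme_test hook for perturbing queue
+// traffic: the fault controller passes every admin submission/completion
+// entry through the callbacks below, which may rewrite the entry, drop it,
+// or simply delay before letting it through (see `QueueFaultBehavior`).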
+#[async_trait::async_trait]
+impl nvme_test::QueueFault for AdminQueueFault {
+    async fn fault_submission_queue(&self, mut command: Command) -> QueueFaultBehavior<Command> {
+        tracing::info!("Faulting submission queue using cid sequence number mismatch");
+        let opcode = nvme_spec::AdminOpcode(command.cdw0.opcode());
+        match opcode {
+            nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE => {
+                // Overwrite the previous cid to cause a panic.
+                command.cdw0.set_cid(0);
+                QueueFaultBehavior::Update(command)
+            }
+            _ => QueueFaultBehavior::Default,
+        }
+    }
+
+    async fn fault_completion_queue(
+        &self,
+        _completion: Completion,
+    ) -> QueueFaultBehavior<Completion> {
+        tracing::info!("Faulting completion queue using delay");
+        PolledTimer::new(&self.driver)
+            .sleep(Duration::from_millis(100))
+            .await;
+        QueueFaultBehavior::Default
+    }
+}
+
+#[async_test]
+#[should_panic(expected = "assertion `left == right` failed: cid sequence number mismatch:")]
+async fn test_nvme_command_fault(driver: DefaultDriver) {
+    let task_driver = VmTaskDriverSource::new(SingleDriverBackend::new(driver.clone())).simple();
+
+    test_nvme_fault_injection(
+        driver,
+        FaultConfiguration {
+            admin_fault: Some(Box::new(AdminQueueFault {
+                driver: task_driver,
+            })),
+        },
+    )
+    .await;
+}
+
 #[async_test]
 async fn test_nvme_driver_direct_dma(driver: DefaultDriver) {
     test_nvme_driver(driver, true).await;
@@ -309,6 +363,62 @@ async fn test_nvme_save_restore_inner(driver: DefaultDriver) {
     //     .unwrap();
 }
 
+async fn test_nvme_fault_injection(driver: DefaultDriver, fault_configuration: FaultConfiguration) {
+    const MSIX_COUNT: u16 = 2;
+    const IO_QUEUE_COUNT: u16 = 64;
+    const CPU_COUNT: u32 = 64;
+
+    // Arrange: Create 8MB of space. First 4MB for the device and second 4MB for the payload.
+    let pages = 1024; // 4MB
+    let device_test_memory = DeviceTestMemory::new(pages * 2, false, "test_nvme_driver");
+    let guest_mem = device_test_memory.guest_memory(); // Access to 0-8MB
+    let dma_client = device_test_memory.dma_client(); // Access 0-4MB
+    let payload_mem = device_test_memory.payload_mem(); // allow_dma is false, so this will follow the 'normal' test path (i.e. with bounce buffering behind the scenes)
+
+    // Arrange: Create the NVMe controller and driver.
+    let driver_source = VmTaskDriverSource::new(SingleDriverBackend::new(driver));
+    let mut msi_set = MsiInterruptSet::new();
+    let nvme = nvme_test::NvmeFaultController::new(
+        &driver_source,
+        guest_mem.clone(),
+        &mut msi_set,
+        &mut ExternallyManagedMmioIntercepts,
+        nvme_test::NvmeFaultControllerCaps {
+            msix_count: MSIX_COUNT,
+            max_io_queues: IO_QUEUE_COUNT,
+            subsystem_id: Guid::new_random(),
+        },
+        fault_configuration,
+    );
+
+    nvme.client() // 2MB namespace
+        .add_namespace(1, disklayer_ram::ram_disk(2 << 20, false).unwrap())
+        .await
+        .unwrap();
+    let device = NvmeTestEmulatedDevice::new(nvme, msi_set, dma_client.clone());
+    let driver = NvmeDriver::new(&driver_source, CPU_COUNT, device, false)
+        .await
+        .unwrap();
+    let namespace = driver.namespace(1).await.unwrap();
+
+    // Act: Write 1024 bytes of data to disk starting at LBA 1.
+ let buf_range = OwnedRequestBuffers::linear(0, 16384, true); // 32 blocks + payload_mem.write_at(0, &[0xcc; 4096]).unwrap(); + namespace + .write( + 0, + 1, + 2, + false, + &payload_mem, + buf_range.buffer(&payload_mem).range(), + ) + .await + .unwrap(); + + driver.shutdown().await; +} + #[derive(Inspect)] pub struct NvmeTestEmulatedDevice { device: EmulatedDevice, diff --git a/vm/devices/storage/nvme/Cargo.toml b/vm/devices/storage/nvme/Cargo.toml index 0bbe454681..4bf530c2db 100644 --- a/vm/devices/storage/nvme/Cargo.toml +++ b/vm/devices/storage/nvme/Cargo.toml @@ -26,12 +26,12 @@ guid.workspace = true inspect.workspace = true mesh.workspace = true pal_async.workspace = true -task_control.workspace = true async-trait.workspace = true event-listener.workspace = true futures.workspace = true futures-concurrency.workspace = true parking_lot.workspace = true +task_control.workspace = true thiserror.workspace = true tracelimit.workspace = true tracing.workspace = true diff --git a/vm/devices/storage/nvme_spec/src/lib.rs b/vm/devices/storage/nvme_spec/src/lib.rs index c66d11d5a4..f1843c695e 100644 --- a/vm/devices/storage/nvme_spec/src/lib.rs +++ b/vm/devices/storage/nvme_spec/src/lib.rs @@ -209,7 +209,7 @@ open_enum! { } #[repr(C)] -#[derive(Debug, IntoBytes, Immutable, KnownLayout, FromBytes)] +#[derive(Debug, Clone, IntoBytes, Immutable, KnownLayout, FromBytes)] pub struct Completion { pub dw0: u32, pub dw1: u32, diff --git a/vm/devices/storage/nvme_test/Cargo.toml b/vm/devices/storage/nvme_test/Cargo.toml new file mode 100644 index 0000000000..1598dcad6d --- /dev/null +++ b/vm/devices/storage/nvme_test/Cargo.toml @@ -0,0 +1,45 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +[package] +name = "nvme_test" +edition.workspace = true +rust-version.workspace = true + +[dependencies] +disk_backend.workspace = true +nvme_common.workspace = true +nvme_resources.workspace = true +nvme_spec.workspace = true +scsi_buffers.workspace = true + +device_emulators.workspace = true +pci_core.workspace = true +pci_resources.workspace = true + +chipset_device.workspace = true +guestmem.workspace = true +vmcore.workspace = true +vm_resource.workspace = true + +guid.workspace = true +inspect.workspace = true +mesh.workspace = true +pal_async.workspace = true +async-trait.workspace = true +event-listener.workspace = true +futures.workspace = true +futures-concurrency.workspace = true +parking_lot.workspace = true +task_control.workspace = true +thiserror.workspace = true +tracelimit.workspace = true +tracing.workspace = true +unicycle.workspace = true +zerocopy = { workspace = true, features = ["alloc"] } + +[dev-dependencies] +user_driver.workspace = true + +[lints] +workspace = true diff --git a/vm/devices/storage/nvme_test/src/error.rs b/vm/devices/storage/nvme_test/src/error.rs new file mode 100644 index 0000000000..69a589d52c --- /dev/null +++ b/vm/devices/storage/nvme_test/src/error.rs @@ -0,0 +1,78 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Error and result related types. + +use crate::spec; +use std::error::Error; + +/// An NVMe error, consisting of a status code and optional error source. 
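+///
+/// The status code is what gets reported back to the guest; the optional
+/// source error is kept only for diagnostics (see the `Error::source` impl
+/// below).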
+#[derive(Debug)]
+pub struct NvmeError {
+    status: spec::Status,
+    source: Option<Box<dyn Error + Send + Sync>>,
+}
+
+impl NvmeError {
+    pub fn new(status: spec::Status, source: impl Into<Box<dyn Error + Send + Sync>>) -> Self {
+        Self {
+            status,
+            source: Some(source.into()),
+        }
+    }
+}
+
+impl Error for NvmeError {
+    fn source(&self) -> Option<&(dyn Error + 'static)> {
+        self.source.as_ref().map(|x| x.as_ref() as _)
+    }
+}
+
+impl std::fmt::Display for NvmeError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self.status.status_code_type() {
+            spec::StatusCodeType::GENERIC => {
+                write!(f, "general error {:#x?}", self.status)
+            }
+            spec::StatusCodeType::COMMAND_SPECIFIC => {
+                write!(f, "command-specific error {:#x?}", self.status)
+            }
+            spec::StatusCodeType::MEDIA_ERROR => {
+                write!(f, "media error {:#x?}", self.status)
+            }
+            _ => write!(f, "{:#x?}", self.status),
+        }
+    }
+}
+
+impl From<spec::Status> for NvmeError {
+    fn from(status: spec::Status) -> Self {
+        NvmeError {
+            status,
+            source: None,
+        }
+    }
+}
+
+/// The result of an NVMe command.
+#[derive(Default)]
+pub struct CommandResult {
+    pub status: spec::Status,
+    pub dw: [u32; 2],
+}
+
+impl<T: Into<NvmeError>> From<T> for CommandResult {
+    fn from(status: T) -> Self {
+        CommandResult::new(status, [0; 2])
+    }
+}
+
+impl CommandResult {
+    pub fn new(status: impl Into<NvmeError>, dw: [u32; 2]) -> Self {
+        let status = status.into();
+        Self {
+            status: status.status,
+            dw,
+        }
+    }
+}
diff --git a/vm/devices/storage/nvme_test/src/lib.rs b/vm/devices/storage/nvme_test/src/lib.rs
new file mode 100644
index 0000000000..d92d6f6104
--- /dev/null
+++ b/vm/devices/storage/nvme_test/src/lib.rs
@@ -0,0 +1,74 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! An implementation of an NVMe controller emulator with fault injection
+//! support, for testing NVMe drivers.
+
+#![forbid(unsafe_code)]
+
+mod error;
+mod namespace;
+mod pci;
+mod prp;
+mod queue;
+pub mod resolver;
+mod workers;
+
+#[cfg(test)]
+mod tests;
+
+pub use pci::NvmeFaultController;
+pub use pci::NvmeFaultControllerCaps;
+pub use workers::NvmeFaultControllerClient;
+
+use guestmem::ranges::PagedRange;
+use nvme_spec as spec;
+use workers::NsidConflict;
+
+// Device configuration shared by PCI and NVMe.
+const DOORBELL_STRIDE_BITS: u8 = 2;
+const VENDOR_ID: u16 = 0x1414;
+const NVME_VERSION: u32 = 0x00020000;
+const MAX_QES: u16 = 256;
+const BAR0_LEN: u64 = 0x10000;
+const IOSQES: u8 = 6;
+const IOCQES: u8 = 4;
+
+// NVMe page sizes. This must match the `PagedRange` page size.
+const PAGE_SIZE: usize = 4096;
+const PAGE_SIZE64: u64 = 4096;
+const PAGE_MASK: u64 = !(PAGE_SIZE64 - 1);
+const PAGE_SHIFT: u32 = PAGE_SIZE.trailing_zeros();
+const _: () = assert!(PAGE_SIZE == PagedRange::PAGE_SIZE);
+
+/// Supported fault behavior for NVMe queues.
+#[derive(Debug, Clone, Copy)]
+pub enum QueueFaultBehavior<T> {
+    /// Update the queue entry with the returned data
+    Update(T),
+    /// Drop the queue entry
+    Drop,
+    /// No fault, proceed as normal
+    Default,
+}
+
+/// Provides fault logic for a pair of submission and completion queues.
+#[async_trait::async_trait]
+pub trait QueueFault {
+    /// Provided a command in the submission queue, return the appropriate fault behavior.
+    async fn fault_submission_queue(
+        &self,
+        command: spec::Command,
+    ) -> QueueFaultBehavior<spec::Command>;
+
+    /// Provided a completion entry in the completion queue, return the appropriate fault behavior.
+    async fn fault_completion_queue(
+        &self,
+        completion: spec::Completion,
+    ) -> QueueFaultBehavior<spec::Completion>;
+}
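+
+// Editor's illustrative sketch (not part of the original change): the simplest
+// `QueueFault` is a pass-through that leaves every entry alone. The name
+// `PassthroughFault` is hypothetical and exists only to show the shape of an
+// implementation:
+//
+//     struct PassthroughFault;
+//
+//     #[async_trait::async_trait]
+//     impl QueueFault for PassthroughFault {
+//         async fn fault_submission_queue(
+//             &self,
+//             _command: spec::Command,
+//         ) -> QueueFaultBehavior<spec::Command> {
+//             QueueFaultBehavior::Default
+//         }
+//
+//         async fn fault_completion_queue(
+//             &self,
+//             _completion: spec::Completion,
+//         ) -> QueueFaultBehavior<spec::Completion> {
+//             QueueFaultBehavior::Default
+//         }
+//     }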
+
+/// Configuration for NVMe controller faults.
+pub struct FaultConfiguration {
+    /// Fault to apply to the admin queues
+    pub admin_fault: Option<Box<dyn QueueFault + Send + Sync>>,
+}
diff --git a/vm/devices/storage/nvme_test/src/namespace.rs b/vm/devices/storage/nvme_test/src/namespace.rs
new file mode 100644
index 0000000000..ffd1662b66
--- /dev/null
+++ b/vm/devices/storage/nvme_test/src/namespace.rs
@@ -0,0 +1,259 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! NVMe NVM namespace implementation.
+
+mod reservations;
+
+use crate::error::CommandResult;
+use crate::error::NvmeError;
+use crate::prp::PrpRange;
+use crate::spec;
+use crate::spec::nvm;
+use disk_backend::Disk;
+use guestmem::GuestMemory;
+use inspect::Inspect;
+use scsi_buffers::RequestBuffers;
+use zerocopy::FromBytes;
+use zerocopy::FromZeros;
+use zerocopy::IntoBytes;
+
+/// An NVMe namespace built on top of a [`Disk`].
+#[derive(Inspect)]
+pub struct Namespace {
+    disk: Disk,
+    nsid: u32,
+    mem: GuestMemory,
+    block_shift: u32,
+    pr: bool,
+}
+
+impl Namespace {
+    pub fn new(mem: GuestMemory, nsid: u32, disk: Disk) -> Self {
+        Self {
+            block_shift: disk.sector_size().trailing_zeros(),
+            pr: disk.pr().is_some(),
+            mem,
+            disk,
+            nsid,
+        }
+    }
+
+    pub fn identify(&self, buf: &mut [u8]) {
+        let id = nvm::IdentifyNamespace::mut_from_prefix(buf).unwrap().0; // TODO: zerocopy: from-prefix (mut_from_prefix): use-rest-of-range (https://github.com/microsoft/openvmm/issues/759)
+        let size = self.disk.sector_count();
+
+        let rescap = if let Some(pr) = self.disk.pr() {
+            let caps = pr.capabilities();
+            nvm::ReservationCapabilities::new()
+                .with_write_exclusive(caps.write_exclusive)
+                .with_exclusive_access(caps.exclusive_access)
+                .with_write_exclusive_registrants_only(caps.write_exclusive_registrants_only)
+                .with_exclusive_access_registrants_only(caps.exclusive_access_registrants_only)
+                .with_write_exclusive_all_registrants(caps.write_exclusive_all_registrants)
+                .with_exclusive_access_all_registrants(caps.exclusive_access_all_registrants)
+        } else {
+            nvm::ReservationCapabilities::new()
+        };
+
+        *id = nvm::IdentifyNamespace {
+            nsze: size,
+            ncap: size,
+            nuse: size,
+            nlbaf: 0,
+            flbas: nvm::Flbas::new().with_low_index(0),
+            rescap,
+            ..FromZeros::new_zeroed()
+        };
+        id.lbaf[0] = nvm::Lbaf::new().with_lbads(self.block_shift as u8);
+    }
+
+    pub fn namespace_id_descriptor(&self, buf: &mut [u8]) {
+        let id = nvm::NamespaceIdentificationDescriptor::mut_from_prefix(buf)
+            .unwrap()
+            .0; // TODO: zerocopy: from-prefix (mut_from_prefix): use-rest-of-range (https://github.com/microsoft/openvmm/issues/759)
+        let mut nid = [0u8; 0x10];
+        if let Some(guid) = self.disk.disk_id() {
+            nid = guid;
+        }
+        *id = nvm::NamespaceIdentificationDescriptor {
+            nidt: nvm::NamespaceIdentifierType::NSGUID.0,
+            nidl: size_of_val(&nid) as u8,
+            rsvd: [0, 0],
+            nid,
+        };
+    }
+
+    pub async fn get_feature(&self, command: &spec::Command) -> Result<CommandResult, NvmeError> {
+        let cdw10: spec::Cdw10GetFeatures = command.cdw10.into();
+        let mut dw = [0; 2];
+
+        // Note that we don't support non-zero cdw10.sel, since ONCS.save == 0.
+        match spec::Feature(cdw10.fid()) {
+            spec::Feature::NVM_RESERVATION_PERSISTENCE if self.pr => {
+                dw[0] = self
+                    .get_reservation_persistence(self.disk.pr().unwrap())
+                    .await?
+                    .into();
+            }
+            feature => {
+                tracelimit::warn_ratelimited!(nsid = self.nsid, ?feature, "unsupported feature");
+                return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into());
+            }
+        }
+        Ok(CommandResult::new(spec::Status::SUCCESS, dw))
+    }
+
+    /// Waits for the namespace identify result to change.
+    ///
+    /// Returns an opaque token to use for the next wait.
+    pub async fn wait_change(&self, token: Option<u64>) -> u64 {
+        // Use the sector count as the token, since that's the only thing that
+        // can currently change.
+        let sector_count = token.unwrap_or_else(|| self.disk.sector_count());
+        self.disk.wait_resize(sector_count).await
+    }
+
+    pub async fn nvm_command(
+        &self,
+        max_data_transfer_size: usize,
+        command: &spec::Command,
+    ) -> Result<CommandResult, NvmeError> {
+        let opcode = nvm::NvmOpcode(command.cdw0.opcode());
+        tracing::trace!(nsid = self.nsid, ?opcode, ?command, "nvm command");
+
+        match opcode {
+            nvm::NvmOpcode::READ => {
+                let cdw10 = nvm::Cdw10ReadWrite::from(command.cdw10);
+                let cdw11 = nvm::Cdw11ReadWrite::from(command.cdw11);
+                let cdw12 = nvm::Cdw12ReadWrite::from(command.cdw12);
+                let lba = cdw10.sbla_low() as u64 | ((cdw11.sbla_high() as u64) << 32);
+                let count = cdw12.nlb_z() as usize + 1;
+                let byte_count = count << self.block_shift;
+                if byte_count > max_data_transfer_size {
+                    return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into());
+                }
+                let range = PrpRange::parse(&self.mem, byte_count, command.dptr)?;
+
+                let disk_sector_count = self.disk.sector_count();
+                if disk_sector_count < lba || disk_sector_count - lba < count as u64 {
+                    return Err(spec::Status::LBA_OUT_OF_RANGE.into());
+                }
+
+                tracing::trace!(nsid = self.nsid, lba, count, byte_count, "read");
+
+                let buffers = RequestBuffers::new(&self.mem, range.range(), true);
+                self.disk
+                    .read_vectored(&buffers, lba)
+                    .await
+                    .map_err(map_disk_error)?;
+            }
+            nvm::NvmOpcode::WRITE => {
+                let cdw10 = nvm::Cdw10ReadWrite::from(command.cdw10);
+                let cdw11 = nvm::Cdw11ReadWrite::from(command.cdw11);
+                let cdw12 = nvm::Cdw12ReadWrite::from(command.cdw12);
+                let lba = cdw10.sbla_low() as u64 | ((cdw11.sbla_high() as u64) << 32);
+                let count = cdw12.nlb_z() as usize + 1;
+                let byte_count = count << self.block_shift;
+                if byte_count > max_data_transfer_size {
+                    return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into());
+                }
+                let range = PrpRange::parse(&self.mem, byte_count, command.dptr)?;
+
+                let disk_sector_count = self.disk.sector_count();
+                if disk_sector_count < lba || disk_sector_count - lba < count as u64 {
+                    return Err(spec::Status::LBA_OUT_OF_RANGE.into());
+                }
+
+                tracing::trace!(nsid = self.nsid, lba, count, byte_count, "write");
+
+                let buffers = RequestBuffers::new(&self.mem, range.range(), false);
+                self.disk
+                    .write_vectored(&buffers, lba, cdw12.fua())
+                    .await
+                    .map_err(map_disk_error)?;
+            }
+            nvm::NvmOpcode::FLUSH => {
+                tracing::debug!(nsid = self.nsid, "flush");
+                if !self.disk.is_read_only() {
+                    self.disk.sync_cache().await.map_err(map_disk_error)?;
+                }
+            }
+            nvm::NvmOpcode::DSM => {
+                let cdw10 = nvm::Cdw10Dsm::from(command.cdw10);
+                let cdw11 = nvm::Cdw11Dsm::from(command.cdw11);
+                // TODO: zerocopy: manual: review carefully! (https://github.com/microsoft/openvmm/issues/759)
+                let mut dsm_ranges =
+                    <[nvm::DsmRange]>::new_box_zeroed_with_elems(cdw10.nr_z() as usize + 1)
+                        .unwrap();
+                let prp =
+                    PrpRange::parse(&self.mem, size_of_val(dsm_ranges.as_ref()), command.dptr)?;
+                prp.read(&self.mem, dsm_ranges.as_mut_bytes())?;
+                tracing::debug!(nsid = self.nsid, ?cdw11, ?dsm_ranges, "dsm");
+                if cdw11.ad() {
+                    for range in dsm_ranges.as_ref() {
+                        self.disk
+                            .unmap(range.starting_lba, range.lba_count.into(), false)
+                            .await
+                            .map_err(map_disk_error)?;
+                    }
+                }
+            }
+            nvm::NvmOpcode::RESERVATION_REGISTER if self.pr => {
+                self.reservation_register(self.disk.pr().unwrap(), command)
+                    .await?
+ } + nvm::NvmOpcode::RESERVATION_REPORT if self.pr => { + self.reservation_report(self.disk.pr().unwrap(), command) + .await? + } + nvm::NvmOpcode::RESERVATION_ACQUIRE if self.pr => { + self.reservation_acquire(self.disk.pr().unwrap(), command) + .await? + } + nvm::NvmOpcode::RESERVATION_RELEASE if self.pr => { + self.reservation_release(self.disk.pr().unwrap(), command) + .await? + } + opcode => { + tracelimit::warn_ratelimited!(nsid = self.nsid, ?opcode, "unsupported nvm opcode"); + return Err(spec::Status::INVALID_COMMAND_OPCODE.into()); + } + } + Ok(Default::default()) + } +} + +fn map_disk_error(err: disk_backend::DiskError) -> NvmeError { + match err { + disk_backend::DiskError::ReservationConflict => spec::Status::RESERVATION_CONFLICT.into(), + disk_backend::DiskError::MemoryAccess(err) => { + NvmeError::new(spec::Status::DATA_TRANSFER_ERROR, err) + } + disk_backend::DiskError::AbortDueToPreemptAndAbort => { + NvmeError::new(spec::Status::COMMAND_ABORTED_DUE_TO_PREEMPT_AND_ABORT, err) + } + disk_backend::DiskError::IllegalBlock => spec::Status::LBA_OUT_OF_RANGE.into(), + disk_backend::DiskError::InvalidInput => spec::Status::INVALID_FIELD_IN_COMMAND.into(), + disk_backend::DiskError::Io(err) => NvmeError::new(spec::Status::DATA_TRANSFER_ERROR, err), + disk_backend::DiskError::MediumError(_, details) => match details { + disk_backend::MediumErrorDetails::ApplicationTagCheckFailed => { + spec::Status::MEDIA_END_TO_END_APPLICATION_TAG_CHECK_ERROR.into() + } + disk_backend::MediumErrorDetails::GuardCheckFailed => { + spec::Status::MEDIA_END_TO_END_GUARD_CHECK_ERROR.into() + } + disk_backend::MediumErrorDetails::ReferenceTagCheckFailed => { + spec::Status::MEDIA_END_TO_END_REFERENCE_TAG_CHECK_ERROR.into() + } + disk_backend::MediumErrorDetails::UnrecoveredReadError => { + spec::Status::MEDIA_UNRECOVERED_READ_ERROR.into() + } + disk_backend::MediumErrorDetails::WriteFault => spec::Status::MEDIA_WRITE_FAULT.into(), + }, + disk_backend::DiskError::ReadOnly => { + spec::Status::ATTEMPTED_WRITE_TO_READ_ONLY_RANGE.into() + } + disk_backend::DiskError::UnsupportedEject => spec::Status::INVALID_COMMAND_OPCODE.into(), + } +} diff --git a/vm/devices/storage/nvme_test/src/namespace/reservations.rs b/vm/devices/storage/nvme_test/src/namespace/reservations.rs new file mode 100644 index 0000000000..cbe497e951 --- /dev/null +++ b/vm/devices/storage/nvme_test/src/namespace/reservations.rs @@ -0,0 +1,219 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! 
Persistent reservation support + +use super::Namespace; +use super::map_disk_error; +use crate::error::NvmeError; +use crate::prp::PrpRange; +use crate::spec; +use crate::spec::nvm; +use disk_backend::pr::PersistentReservation; +use nvme_common::to_nvme_reservation_type; +use zerocopy::FromZeros; +use zerocopy::IntoBytes; + +impl Namespace { + pub(super) async fn reservation_register( + &self, + pr: &dyn PersistentReservation, + command: &spec::Command, + ) -> Result<(), NvmeError> { + let cdw10 = nvm::Cdw10ReservationRegister::from(command.cdw10); + let mut data = nvm::ReservationRegister::new_zeroed(); + let range = PrpRange::parse(&self.mem, size_of_val(&data), command.dptr)?; + range.read(&self.mem, data.as_mut_bytes())?; + + let current_key = (!cdw10.iekey()).then_some(data.crkey); + let ptpl = if pr.capabilities().persist_through_power_loss { + match nvm::ChangePersistThroughPowerLoss(cdw10.cptpl()) { + nvm::ChangePersistThroughPowerLoss::NO_CHANGE => None, + nvm::ChangePersistThroughPowerLoss::CLEAR => Some(false), + nvm::ChangePersistThroughPowerLoss::SET => Some(true), + _ => return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into()), + } + } else { + None + }; + + match nvm::ReservationRegisterAction(cdw10.rrega()) { + nvm::ReservationRegisterAction::REGISTER => { + pr.register(None, data.nrkey, ptpl) + .await + .map_err(map_disk_error)?; + } + nvm::ReservationRegisterAction::UNREGISTER => { + pr.register(current_key, 0, ptpl) + .await + .map_err(map_disk_error)?; + } + nvm::ReservationRegisterAction::REPLACE => { + pr.register(current_key, data.nrkey, ptpl) + .await + .map_err(map_disk_error)?; + } + action => { + tracelimit::warn_ratelimited!(?action, "unsupported reservation register action"); + return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into()); + } + } + Ok(()) + } + + pub(super) async fn reservation_report( + &self, + pr: &dyn PersistentReservation, + command: &spec::Command, + ) -> Result<(), NvmeError> { + let cdw10 = nvm::Cdw10ReservationReport::from(command.cdw10); + let cdw11 = nvm::Cdw11ReservationReport::from(command.cdw11); + let numd = cdw10.numd_z().saturating_add(1) as usize; + let len = numd * 4; + let range = PrpRange::parse(&self.mem, len, command.dptr)?; + + let report = pr.report().await.map_err(map_disk_error)?; + + let report_header = nvm::ReservationReportExtended { + report: nvm::ReservationReport { + generation: report.generation, + rtype: report + .reservation_type + .map_or(nvm::ReservationType(0), to_nvme_reservation_type), + regctl: (report.controllers.len() as u16).into(), + ptpls: 0, + ..FromZeros::new_zeroed() + }, + ..FromZeros::new_zeroed() + }; + + let controllers = report.controllers.iter().map(|controller| { + let mut hostid = [0; 16]; + let hostid_len = controller.host_id.len().min(hostid.len()); + hostid[..hostid_len].copy_from_slice(&controller.host_id[..hostid_len]); + nvm::RegisteredControllerExtended { + cntlid: controller.controller_id, + rcsts: nvm::ReservationStatus::new() + .with_holds_reservation(controller.holds_reservation), + hostid, + rkey: controller.key, + ..FromZeros::new_zeroed() + } + }); + + let mut data; + if cdw11.eds() { + data = report_header.as_bytes().to_vec(); + for controller in controllers { + data.extend(controller.as_bytes()); + } + } else { + data = report_header.report.as_bytes().to_vec(); + for controller in controllers { + data.extend( + nvm::RegisteredController { + cntlid: controller.cntlid, + rcsts: controller.rcsts, + hostid: controller.hostid[..8].try_into().unwrap(), + rkey: controller.rkey, + 
..FromZeros::new_zeroed()
+                    }
+                    .as_bytes(),
+                );
+            }
+        };
+
+        range.write(&self.mem, &data)?;
+        Ok(())
+    }
+
+    pub(super) async fn reservation_acquire(
+        &self,
+        pr: &dyn PersistentReservation,
+        command: &spec::Command,
+    ) -> Result<(), NvmeError> {
+        let cdw10 = nvm::Cdw10ReservationAcquire::from(command.cdw10);
+        let mut data = nvm::ReservationAcquire::new_zeroed();
+        let range = PrpRange::parse(&self.mem, size_of_val(&data), command.dptr)?;
+        range.read(&self.mem, data.as_mut_bytes())?;
+
+        // According to the spec, this is never to be set.
+        if cdw10.iekey() {
+            return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into());
+        }
+
+        let reservation_type = from_nvme_reservation_type(cdw10.rtype())?;
+        match nvm::ReservationAcquireAction(cdw10.racqa()) {
+            nvm::ReservationAcquireAction::ACQUIRE => {
+                pr.reserve(data.crkey, reservation_type)
+                    .await
+                    .map_err(map_disk_error)?;
+            }
+            nvm::ReservationAcquireAction::PREEMPT => {
+                pr.preempt(data.crkey, data.prkey, reservation_type, false)
+                    .await
+                    .map_err(map_disk_error)?;
+            }
+            nvm::ReservationAcquireAction::PREEMPT_AND_ABORT => {
+                pr.preempt(data.crkey, data.prkey, reservation_type, true)
+                    .await
+                    .map_err(map_disk_error)?;
+            }
+            action => {
+                tracelimit::warn_ratelimited!(?action, "unsupported reservation acquire action");
+                return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into());
+            }
+        }
+
+        Ok(())
+    }
+
+    pub(super) async fn reservation_release(
+        &self,
+        pr: &dyn PersistentReservation,
+        command: &spec::Command,
+    ) -> Result<(), NvmeError> {
+        let cdw10 = nvm::Cdw10ReservationRelease::from(command.cdw10);
+        let mut data = nvm::ReservationRelease::new_zeroed();
+        let range = PrpRange::parse(&self.mem, size_of_val(&data), command.dptr)?;
+        range.read(&self.mem, data.as_mut_bytes())?;
+
+        // According to the spec, this is never to be set.
+        if cdw10.iekey() {
+            return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into());
+        }
+
+        match nvm::ReservationReleaseAction(cdw10.rrela()) {
+            nvm::ReservationReleaseAction::RELEASE => pr
+                .release(data.crkey, from_nvme_reservation_type(cdw10.rtype())?)
+                .await
+                .map_err(map_disk_error)?,
+            nvm::ReservationReleaseAction::CLEAR => {
+                pr.clear(data.crkey).await.map_err(map_disk_error)?
+            }
+            action => {
+                tracelimit::warn_ratelimited!(?action, "unsupported reservation release action");
+                return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into());
+            }
+        }
+
+        Ok(())
+    }
+
+    pub(super) async fn get_reservation_persistence(
+        &self,
+        pr: &dyn PersistentReservation,
+    ) -> Result<nvme_spec::Cdw11FeatureReservationPersistence, NvmeError> {
+        let report = pr.report().await.map_err(map_disk_error)?;
+        Ok(nvme_spec::Cdw11FeatureReservationPersistence::new()
+            .with_ptpl(report.persist_through_power_loss))
+    }
+}
+
+fn from_nvme_reservation_type(
+    nvme_type: u8,
+) -> Result<disk_backend::pr::ReservationType, NvmeError> {
+    let reservation_type = nvm::ReservationType(nvme_type);
+    nvme_common::from_nvme_reservation_type(reservation_type)
+        .map_err(|err| NvmeError::new(spec::Status::INVALID_FIELD_IN_COMMAND, err))
+}
diff --git a/vm/devices/storage/nvme_test/src/pci.rs b/vm/devices/storage/nvme_test/src/pci.rs
new file mode 100644
index 0000000000..53ba0083c4
--- /dev/null
+++ b/vm/devices/storage/nvme_test/src/pci.rs
@@ -0,0 +1,503 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! The NVMe (Fault Injection) PCI device implementation.
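+//!
+//! BAR0 layout, as implemented below: the standard NVMe register file occupies
+//! offsets below 0x1000, and queue doorbells start at offset 0x1000 with a
+//! stride of 4 bytes (`DOORBELL_STRIDE_BITS` is 2, so CAP.DSTRD is reported as
+//! 0); the whole BAR is `BAR0_LEN` (0x10000) bytes.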
+
+use crate::BAR0_LEN;
+use crate::DOORBELL_STRIDE_BITS;
+use crate::FaultConfiguration;
+use crate::IOCQES;
+use crate::IOSQES;
+use crate::MAX_QES;
+use crate::NVME_VERSION;
+use crate::NvmeFaultControllerClient;
+use crate::PAGE_MASK;
+use crate::VENDOR_ID;
+use crate::spec;
+use crate::workers::IoQueueEntrySizes;
+use crate::workers::NvmeWorkers;
+use chipset_device::ChipsetDevice;
+use chipset_device::io::IoError;
+use chipset_device::io::IoError::InvalidRegister;
+use chipset_device::io::IoResult;
+use chipset_device::mmio::MmioIntercept;
+use chipset_device::mmio::RegisterMmioIntercept;
+use chipset_device::pci::PciConfigSpace;
+use device_emulators::ReadWriteRequestType;
+use device_emulators::read_as_u32_chunks;
+use device_emulators::write_as_u32_chunks;
+use guestmem::GuestMemory;
+use guid::Guid;
+use inspect::Inspect;
+use inspect::InspectMut;
+use parking_lot::Mutex;
+use pci_core::capabilities::msix::MsixEmulator;
+use pci_core::cfg_space_emu::BarMemoryKind;
+use pci_core::cfg_space_emu::ConfigSpaceType0Emulator;
+use pci_core::cfg_space_emu::DeviceBars;
+use pci_core::msi::RegisterMsi;
+use pci_core::spec::hwid::ClassCode;
+use pci_core::spec::hwid::HardwareIds;
+use pci_core::spec::hwid::ProgrammingInterface;
+use pci_core::spec::hwid::Subclass;
+use std::sync::Arc;
+use vmcore::device_state::ChangeDeviceState;
+use vmcore::save_restore::SaveError;
+use vmcore::save_restore::SaveRestore;
+use vmcore::save_restore::SavedStateNotSupported;
+use vmcore::vm_task::VmTaskDriverSource;
+
+/// An NVMe controller with configurable fault injection.
+#[derive(InspectMut)]
+pub struct NvmeFaultController {
+    cfg_space: ConfigSpaceType0Emulator,
+    #[inspect(skip)]
+    msix: MsixEmulator,
+    registers: RegState,
+    #[inspect(skip)]
+    qe_sizes: Arc<Mutex<IoQueueEntrySizes>>,
+    #[inspect(flatten, mut)]
+    workers: NvmeWorkers,
+}
+
+#[derive(Inspect)]
+struct RegState {
+    #[inspect(hex)]
+    interrupt_mask: u32,
+    cc: spec::Cc,
+    csts: spec::Csts,
+    aqa: spec::Aqa,
+    #[inspect(hex)]
+    asq: u64,
+    #[inspect(hex)]
+    acq: u64,
+}
+
+impl RegState {
+    fn new() -> Self {
+        Self {
+            interrupt_mask: 0,
+            cc: spec::Cc::new(),
+            csts: spec::Csts::new(),
+            aqa: spec::Aqa::new(),
+            asq: 0,
+            acq: 0,
+        }
+    }
+}
+
+const CAP: spec::Cap = spec::Cap::new()
+    .with_dstrd(DOORBELL_STRIDE_BITS - 2)
+    .with_mqes_z(MAX_QES - 1)
+    .with_cqr(true)
+    .with_css_nvm(true)
+    .with_to(!0);
+
+/// The NVMe controller's capabilities.
+#[derive(Debug, Copy, Clone)]
+pub struct NvmeFaultControllerCaps {
+    /// The number of entries in the MSI-X table.
+    pub msix_count: u16,
+    /// The maximum number of IO submission and completion queues.
+    pub max_io_queues: u16,
+    /// The subsystem ID, used as part of the subnqn field of the identify
+    /// controller response.
+    pub subsystem_id: Guid,
+}
+
+impl NvmeFaultController {
+    /// Creates a new NVMe controller.
+    pub fn new(
+        driver_source: &VmTaskDriverSource,
+        guest_memory: GuestMemory,
+        register_msi: &mut dyn RegisterMsi,
+        register_mmio: &mut dyn RegisterMmioIntercept,
+        caps: NvmeFaultControllerCaps,
+        fault_configuration: FaultConfiguration,
+    ) -> Self {
+        let (msix, msix_cap) = MsixEmulator::new(4, caps.msix_count, register_msi);
+        let bars = DeviceBars::new()
+            .bar0(
+                BAR0_LEN,
+                BarMemoryKind::Intercept(register_mmio.new_io_region("bar0", BAR0_LEN)),
+            )
+            .bar4(
+                msix.bar_len(),
+                BarMemoryKind::Intercept(register_mmio.new_io_region("msix", msix.bar_len())),
+            );
+
+        let cfg_space = ConfigSpaceType0Emulator::new(
+            HardwareIds {
+                vendor_id: VENDOR_ID,
+                device_id: 0x00a9,
+                revision_id: 0,
+                prog_if: ProgrammingInterface::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY_NVME,
+                sub_class: Subclass::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY,
+                base_class: ClassCode::MASS_STORAGE_CONTROLLER,
+                type0_sub_vendor_id: 0,
+                type0_sub_system_id: 0,
+            },
+            vec![Box::new(msix_cap)],
+            bars,
+        );
+
+        let interrupts = (0..caps.msix_count)
+            .map(|i| msix.interrupt(i).unwrap())
+            .collect();
+
+        let qe_sizes = Arc::new(Default::default());
+        let admin = NvmeWorkers::new(
+            driver_source,
+            guest_memory,
+            interrupts,
+            caps.max_io_queues,
+            caps.max_io_queues,
+            Arc::clone(&qe_sizes),
+            caps.subsystem_id,
+            fault_configuration,
+        );
+
+        Self {
+            cfg_space,
+            msix,
+            registers: RegState::new(),
+            workers: admin,
+            qe_sizes,
+        }
+    }
+
+    /// Returns a client for manipulating the NVMe controller at runtime.
+    pub fn client(&self) -> NvmeFaultControllerClient {
+        self.workers.client()
+    }
+
+    /// Reads from the virtual BAR 0.
+    pub fn read_bar0(&mut self, addr: u16, data: &mut [u8]) -> IoResult {
+        if data.len() < 4 {
+            return IoResult::Err(IoError::InvalidAccessSize);
+        }
+        if addr & (data.len() - 1) as u16 != 0 {
+            return IoResult::Err(IoError::UnalignedAccess);
+        }
+
+        // Check for 64-bit registers.
+        let d: Option<u64> = match spec::Register(addr & !7) {
+            spec::Register::CAP => Some(CAP.into()),
+            spec::Register::ASQ => Some(self.registers.asq),
+            spec::Register::ACQ => Some(self.registers.acq),
+            spec::Register::BPMBL => Some(0),
+            _ => None,
+        };
+        if let Some(d) = d {
+            if data.len() == 8 {
+                data.copy_from_slice(&d.to_ne_bytes());
+            } else if addr & 7 == 0 {
+                data.copy_from_slice(&(d as u32).to_ne_bytes());
+            } else {
+                data.copy_from_slice(&((d >> 32) as u32).to_ne_bytes());
+            }
+            return IoResult::Ok;
+        }
+
+        if data.len() != 4 {
+            return IoResult::Err(IoError::InvalidAccessSize);
+        }
+
+        // Handle 32-bit registers.
+        let d: u32 = match spec::Register(addr) {
+            spec::Register::VS => NVME_VERSION,
+            spec::Register::INTMS => self.registers.interrupt_mask,
+            spec::Register::INTMC => self.registers.interrupt_mask,
+            spec::Register::CC => self.registers.cc.into(),
+            spec::Register::RESERVED => 0,
+            spec::Register::CSTS => self.get_csts(),
+            spec::Register::NSSR => 0,
+            spec::Register::AQA => self.registers.aqa.into(),
+            spec::Register::CMBLOC => 0,
+            spec::Register::CMBSZ => 0,
+            spec::Register::BPINFO => 0,
+            spec::Register::BPRSEL => 0,
+            _ => return IoResult::Err(InvalidRegister),
+        };
+        data.copy_from_slice(&d.to_ne_bytes());
+        IoResult::Ok
+    }
+
+    /// Writes to the virtual BAR 0.
+    pub fn write_bar0(&mut self, addr: u16, data: &[u8]) -> IoResult {
+        if addr >= 0x1000 {
+            // Doorbell write.
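+            // For example, with the 4-byte stride here, queue N's SQ tail
+            // doorbell is at 0x1000 + 8*N and its CQ head doorbell at
+            // 0x1000 + 8*N + 4, so `index` below is 2*N for submission queues
+            // and 2*N + 1 for completion queues.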
+ let base = addr - 0x1000; + let index = base >> DOORBELL_STRIDE_BITS; + if (index << DOORBELL_STRIDE_BITS) != base { + return IoResult::Err(InvalidRegister); + } + let Ok(data) = data.try_into() else { + return IoResult::Err(IoError::InvalidAccessSize); + }; + let data = u32::from_ne_bytes(data); + self.workers.doorbell(index, data); + return IoResult::Ok; + } + + if data.len() < 4 { + return IoResult::Err(IoError::InvalidAccessSize); + } + if addr & (data.len() - 1) as u16 != 0 { + return IoResult::Err(IoError::UnalignedAccess); + } + + let update_reg = |x: u64| { + if data.len() == 8 { + u64::from_ne_bytes(data.try_into().unwrap()) + } else { + let data = u32::from_ne_bytes(data.try_into().unwrap()) as u64; + if addr & 7 == 0 { + (x & !(u32::MAX as u64)) | data + } else { + (x & u32::MAX as u64) | (data << 32) + } + } + }; + + // Check for 64-bit registers. + let handled = match spec::Register(addr & !7) { + spec::Register::ASQ => { + if !self.registers.cc.en() { + self.registers.asq = update_reg(self.registers.asq) & PAGE_MASK; + } else { + tracelimit::warn_ratelimited!("attempt to set asq while enabled"); + } + true + } + spec::Register::ACQ => { + if !self.registers.cc.en() { + self.registers.acq = update_reg(self.registers.acq) & PAGE_MASK; + } else { + tracelimit::warn_ratelimited!("attempt to set acq while enabled"); + } + true + } + _ => false, + }; + if handled { + return IoResult::Ok; + } + + let Ok(data) = data.try_into() else { + return IoResult::Err(IoError::InvalidAccessSize); + }; + let data = u32::from_ne_bytes(data); + + // Handle 32-bit registers. + match spec::Register(addr) { + spec::Register::INTMS => self.registers.interrupt_mask |= data, + spec::Register::INTMC => self.registers.interrupt_mask &= !data, + spec::Register::CC => self.set_cc(data.into()), + spec::Register::AQA => self.registers.aqa = data.into(), + _ => return IoResult::Err(InvalidRegister), + } + IoResult::Ok + } + + fn set_cc(&mut self, cc: spec::Cc) { + tracing::debug!(?cc, "set cc"); + + if cc.mps() != 0 { + tracelimit::warn_ratelimited!( + "This implementation only supports memory page sizes of 4K." + ); + self.fatal_error(); + return; + } + + if cc.css() != 0 { + tracelimit::warn_ratelimited!("This implementation only supports the NVM command set."); + self.fatal_error(); + return; + } + + if let 2..=6 = cc.ams() { + tracelimit::warn_ratelimited!("Undefined arbitration mechanism."); + self.fatal_error(); + } + + let mask: u32 = u32::from( + spec::Cc::new() + .with_en(true) + .with_shn(0b11) + .with_iosqes(0b1111) + .with_iocqes(0b1111), + ); + let mut cc: spec::Cc = (u32::from(cc) & mask).into(); + + if cc.shn() != 0 { + // It is unclear in the spec (to me) what guarantees a + // controller is supposed to make after shutdown. For now, just + // complete shutdown immediately. + self.registers.csts.set_shst(0b10); + } + + if cc.en() != self.registers.cc.en() { + if cc.en() { + // Some drivers will write zeros to IOSQES and IOCQES, assuming that the defaults will work. + if cc.iocqes() == 0 { + cc.set_iocqes(IOCQES); + } else if cc.iocqes() != IOCQES { + tracelimit::warn_ratelimited!( + "This implementation only supports CQEs of the default size." + ); + self.fatal_error(); + return; + } + + if cc.iosqes() == 0 { + cc.set_iosqes(IOSQES); + } else if cc.iosqes() != IOSQES { + tracelimit::warn_ratelimited!( + "This implementation only supports SQEs of the default size." 
+                    );
+                    self.fatal_error();
+                    return;
+                }
+
+                if self.registers.csts.rdy() {
+                    tracelimit::warn_ratelimited!("enabling during reset");
+                    return;
+                }
+                if cc.shn() == 0 {
+                    self.registers.csts.set_shst(0);
+                }
+
+                self.workers.enable(
+                    self.registers.asq,
+                    self.registers.aqa.asqs_z().max(1) + 1,
+                    self.registers.acq,
+                    self.registers.aqa.acqs_z().max(1) + 1,
+                );
+            } else if self.registers.csts.rdy() {
+                self.workers.controller_reset();
+            } else {
+                tracelimit::warn_ratelimited!("disabling while not ready");
+                return;
+            }
+        }
+
+        self.registers.cc = cc;
+        *self.qe_sizes.lock() = IoQueueEntrySizes {
+            sqe_bits: cc.iosqes(),
+            cqe_bits: cc.iocqes(),
+        };
+    }
+
+    fn get_csts(&mut self) -> u32 {
+        if !self.registers.cc.en() && self.registers.csts.rdy() {
+            // Keep trying to disable.
+            if self.workers.poll_controller_reset() {
+                // AQA, ASQ, and ACQ are not reset by controller reset.
+                self.registers.csts = 0.into();
+                self.registers.cc = 0.into();
+                self.registers.interrupt_mask = 0;
+            }
+        } else if self.registers.cc.en() && !self.registers.csts.rdy() {
+            if self.workers.poll_enabled() {
+                self.registers.csts.set_rdy(true);
+            }
+        }
+
+        let csts = self.registers.csts;
+        tracing::debug!(?csts, "get csts");
+        csts.into()
+    }
+
+    /// Sets the CFS bit in the controller status register (CSTS), indicating
+    /// that the controller has experienced "undefined" behavior.
+    pub fn fatal_error(&mut self) {
+        self.registers.csts.set_cfs(true);
+    }
+}
+
+impl ChangeDeviceState for NvmeFaultController {
+    fn start(&mut self) {}
+
+    async fn stop(&mut self) {}
+
+    async fn reset(&mut self) {
+        let Self {
+            cfg_space,
+            msix: _,
+            registers,
+            qe_sizes,
+            workers,
+        } = self;
+        workers.reset().await;
+        cfg_space.reset();
+        *registers = RegState::new();
+        *qe_sizes.lock() = Default::default();
+    }
+}
+
+impl ChipsetDevice for NvmeFaultController {
+    fn supports_mmio(&mut self) -> Option<&mut dyn MmioIntercept> {
+        Some(self)
+    }
+
+    fn supports_pci(&mut self) -> Option<&mut dyn PciConfigSpace> {
+        Some(self)
+    }
+}
+
+impl MmioIntercept for NvmeFaultController {
+    fn mmio_read(&mut self, addr: u64, data: &mut [u8]) -> IoResult {
+        match self.cfg_space.find_bar(addr) {
+            Some((0, offset)) => self.read_bar0(offset, data),
+            Some((4, offset)) => {
+                read_as_u32_chunks(offset, data, |offset| self.msix.read_u32(offset));
+                IoResult::Ok
+            }
+            _ => IoResult::Err(InvalidRegister),
+        }
+    }
+
+    fn mmio_write(&mut self, addr: u64, data: &[u8]) -> IoResult {
+        match self.cfg_space.find_bar(addr) {
+            Some((0, offset)) => self.write_bar0(offset, data),
+            Some((4, offset)) => {
+                write_as_u32_chunks(offset, data, |offset, ty| match ty {
+                    ReadWriteRequestType::Read => Some(self.msix.read_u32(offset)),
+                    ReadWriteRequestType::Write(val) => {
+                        self.msix.write_u32(offset, val);
+                        None
+                    }
+                });
+                IoResult::Ok
+            }
+            _ => IoResult::Err(InvalidRegister),
+        }
+    }
+}
+
+impl PciConfigSpace for NvmeFaultController {
+    fn pci_cfg_read(&mut self, offset: u16, value: &mut u32) -> IoResult {
+        self.cfg_space.read_u32(offset, value)
+    }
+
+    fn pci_cfg_write(&mut self, offset: u16, value: u32) -> IoResult {
+        self.cfg_space.write_u32(offset, value)
+    }
+}
+
+impl SaveRestore for NvmeFaultController {
+    type SavedState = SavedStateNotSupported;
+
+    fn save(&mut self) -> Result<Self::SavedState, SaveError> {
+        Err(SaveError::NotSupported)
+    }
+
+    fn restore(
+        &mut self,
+        state: Self::SavedState,
+    ) -> Result<(), vmcore::save_restore::RestoreError> {
+        match state {}
+    }
+}
diff --git a/vm/devices/storage/nvme_test/src/prp.rs b/vm/devices/storage/nvme_test/src/prp.rs
new file mode 100644
index 0000000000..242852fa31
--- /dev/null
+++ b/vm/devices/storage/nvme_test/src/prp.rs
@@ -0,0 +1,124 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! Types for parsing NVMe PRP (Physical Region Page) entries and lists.
+
+use crate::PAGE_MASK;
+use crate::PAGE_SHIFT;
+use crate::PAGE_SIZE;
+#[cfg(test)]
+use crate::PAGE_SIZE64;
+use crate::error::NvmeError;
+use crate::spec;
+use guestmem::GuestMemory;
+use guestmem::ranges::PagedRange;
+use zerocopy::IntoBytes;
+
+const PRP_PER_PAGE: usize = PAGE_SIZE / 8;
+
+enum PrpPfns {
+    Short([u64; 2]),
+    Long(Vec<u64>),
+}
+
+pub struct PrpRange {
+    offset: usize,
+    len: usize,
+    pfns: PrpPfns,
+}
+
+impl PrpRange {
+    /// Parses a PRP range for memory of `len` bytes, from the two PRP values
+    /// in `prp`.
+    pub fn parse(mem: &GuestMemory, len: usize, prp: [u64; 2]) -> Result<Self, NvmeError> {
+        let offset = prp[0] as usize & (PAGE_SIZE - 1);
+        let pfns = if len + offset <= PAGE_SIZE * 2 {
+            PrpPfns::Short(prp.map(|x| x >> PAGE_SHIFT))
+        } else {
+            let count = (offset + len).div_ceil(PAGE_SIZE);
+            let mut v = vec![0; count];
+            v[0] = prp[0];
+            let mut pfns = &mut v[1..];
+            let mut next_prp_list = prp[1];
+            loop {
+                let n = pfns.len().min(PRP_PER_PAGE);
+                mem.read_at(next_prp_list, pfns[..n].as_mut_bytes())
+                    .map_err(|err| NvmeError::new(spec::Status::DATA_TRANSFER_ERROR, err))?;
+                if n == pfns.len() {
+                    break;
+                }
+                next_prp_list = pfns[n - 1] & PAGE_MASK;
+                pfns = &mut pfns[n - 1..];
+            }
+            for gpa in &mut v {
+                *gpa >>= PAGE_SHIFT;
+            }
+            PrpPfns::Long(v)
+        };
+        Ok(Self { offset, len, pfns })
+    }
+
+    #[cfg(test)]
+    pub fn new(mut gpas: Vec<u64>, offset: usize, len: u64) -> Result<Self, &'static str> {
+        for gpa in &mut gpas {
+            if *gpa % PAGE_SIZE64 != 0 {
+                return Err("page address is not 4KB aligned");
+            }
+            *gpa /= PAGE_SIZE64;
+        }
+        if len == 0 {
+            return Err("empty region");
+        }
+        if offset >= PAGE_SIZE {
+            return Err("start offset too large");
+        }
+
+        let pfns = {
+            match gpas.len() {
+                2 => PrpPfns::Short([gpas[0], gpas[1]]),
+                1 => PrpPfns::Short([gpas[0], 0]),
+                _ => PrpPfns::Long(gpas),
+            }
+        };
+
+        Ok(Self {
+            len: len as usize,
+            offset,
+            pfns,
+        })
+    }
+
+    /// Returns the range as a [`PagedRange`].
+    pub fn range(&self) -> PagedRange<'_> {
+        PagedRange::new(
+            self.offset,
+            self.len,
+            match &self.pfns {
+                PrpPfns::Short(pfns) => pfns,
+                PrpPfns::Long(pfns) => pfns,
+            },
+        )
+        .unwrap()
+    }
+
+    /// Reads from the range.
+    pub fn read(&self, mem: &GuestMemory, buf: &mut [u8]) -> Result<(), NvmeError> {
+        mem.read_range(&self.range().subrange(0, buf.len()), buf)
+            .map_err(|err| NvmeError::new(spec::Status::DATA_TRANSFER_ERROR, err))?;
+        Ok(())
+    }
+
+    /// Writes to the range.
+    pub fn write(&self, mem: &GuestMemory, buf: &[u8]) -> Result<(), NvmeError> {
+        mem.write_range(&self.range().subrange(0, buf.len()), buf)
+            .map_err(|err| NvmeError::new(spec::Status::DATA_TRANSFER_ERROR, err))?;
+        Ok(())
+    }
+
+    /// Writes zeroes to the range.
+    pub fn zero(&self, mem: &GuestMemory, len: usize) -> Result<(), NvmeError> {
+        mem.zero_range(&self.range().subrange(0, len))
+            .map_err(|err| NvmeError::new(spec::Status::DATA_TRANSFER_ERROR, err))?;
+        Ok(())
+    }
+}
diff --git a/vm/devices/storage/nvme_test/src/queue.rs b/vm/devices/storage/nvme_test/src/queue.rs
new file mode 100644
index 0000000000..0b603c1c94
--- /dev/null
+++ b/vm/devices/storage/nvme_test/src/queue.rs
@@ -0,0 +1,326 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! NVMe submission and completion queue types.
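+//!
+//! Completions carry a phase bit that the controller flips on every wrap of
+//! the queue tail; the guest uses the phase change to detect new entries (see
+//! `CompletionQueue::write` below). Doorbell values may also be mirrored in
+//! guest memory ("shadow doorbells", see `ShadowDoorbell`).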
+
+use crate::spec;
+use guestmem::GuestMemory;
+use guestmem::GuestMemoryError;
+use inspect::Inspect;
+use std::sync::Arc;
+use std::sync::atomic::AtomicU32;
+use std::sync::atomic::Ordering;
+use thiserror::Error;
+use vmcore::interrupt::Interrupt;
+
+pub const ILLEGAL_DOORBELL_VALUE: u32 = 0xffffffff;
+
+#[derive(Default, Inspect)]
+#[inspect(transparent)]
+pub struct DoorbellRegister {
+    #[inspect(hex)]
+    value: AtomicU32,
+    #[inspect(skip)]
+    event: event_listener::Event,
+}
+
+impl DoorbellRegister {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    pub fn write(&self, value: u32) {
+        self.value.store(value, Ordering::SeqCst);
+        self.event.notify(usize::MAX);
+    }
+
+    pub fn read(&self) -> u32 {
+        self.value.load(Ordering::SeqCst)
+    }
+
+    pub async fn wait_read(&self, value: u32) -> u32 {
+        let v = self.read();
+        if value != v {
+            return v;
+        }
+        loop {
+            let listener = self.event.listen();
+            let v = self.read();
+            if value != v {
+                break v;
+            }
+            listener.await;
+        }
+    }
+}
+
+#[derive(Copy, Clone, Default, Inspect, Debug)]
+pub struct ShadowDoorbell {
+    #[inspect(hex)]
+    pub shadow_db_gpa: u64,
+    #[inspect(hex)]
+    pub event_idx_gpa: u64,
+}
+
+impl ShadowDoorbell {
+    // See NVMe Spec version 2.0a, Section 5.8 -- Doorbell Buffer Config Command for
+    // an explanation of this math.
+    pub fn new(
+        shadow_db_evt_idx_base: ShadowDoorbell,
+        qid: u16,
+        is_sq: bool,
+        doorbell_stride_bits: usize,
+    ) -> ShadowDoorbell {
+        let offset = match is_sq {
+            true => 0u64,
+            false => 1u64,
+        };
+        let shadow_db_gpa = shadow_db_evt_idx_base.shadow_db_gpa
+            + (qid as u64 * 2 + offset) * (4 << (doorbell_stride_bits - 2));
+        let event_idx_gpa = shadow_db_evt_idx_base.event_idx_gpa
+            + (qid as u64 * 2 + offset) * (4 << (doorbell_stride_bits - 2));
+        ShadowDoorbell {
+            shadow_db_gpa,
+            event_idx_gpa,
+        }
+    }
+}
+
+#[derive(Inspect)]
+pub struct SubmissionQueue {
+    #[inspect(hex)]
+    cached_tail: u32,
+    tail: Arc<DoorbellRegister>,
+    #[inspect(hex)]
+    head: u32,
+    #[inspect(hex)]
+    gpa: u64,
+    #[inspect(hex)]
+    len: u32,
+    #[inspect(with = "Option::is_some")]
+    shadow_db_evt_idx: Option<ShadowDoorbell>,
+    #[inspect(hex)]
+    evt_idx: u32,
+}
+
+#[derive(Debug, Error)]
+pub enum QueueError {
+    #[error("invalid doorbell tail {0:#x}")]
+    InvalidTail(u32),
+    #[error("invalid doorbell head {0:#x}")]
+    InvalidHead(u32),
+    #[error("queue access error")]
+    Memory(#[source] GuestMemoryError),
+}
+
+impl SubmissionQueue {
+    pub fn new(
+        tail: Arc<DoorbellRegister>,
+        gpa: u64,
+        len: u16,
+        shadow_db_evt_idx: Option<ShadowDoorbell>,
+    ) -> Self {
+        tail.write(0);
+        Self {
+            cached_tail: 0,
+            tail,
+            head: 0,
+            gpa,
+            len: len.into(),
+            shadow_db_evt_idx,
+            evt_idx: 0,
+        }
+    }
+
+    /// This function returns a future for the next entry in the submission queue. It also
+    /// has a side effect of updating the tail.
+    ///
+    /// Note that this function returns a future that must be cancellable, which means that the
+    /// parts after an await may never run. The tail update side effect is benign, so
+    /// that can happen before the await.
+    pub async fn next(&mut self, mem: &GuestMemory) -> Result<spec::Command, QueueError> {
+        // If shadow doorbells are in use, use that instead of what was written to the doorbell
+        // register, as it may be more current.
+        if let Some(shadow_db_evt_idx) = self.shadow_db_evt_idx {
+            let shadow_tail = mem
+                .read_plain(shadow_db_evt_idx.shadow_db_gpa)
+                .map_err(QueueError::Memory)?;
+
+            // ILLEGAL_DOORBELL_VALUE is the initial state. The guest will overwrite
+            // it when it first uses the shadow.
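+            // (Illustrative) e.g. if the guest advanced the shadow tail from 5
+            // to 7 without an MMIO doorbell write, the cached tail picks up 7
+            // here and both new entries get consumed.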
+            if shadow_tail != ILLEGAL_DOORBELL_VALUE {
+                self.cached_tail = shadow_tail;
+                self.tail.write(self.cached_tail);
+            }
+        }
+        while self.cached_tail == self.head {
+            self.cached_tail = self.tail.wait_read(self.cached_tail).await;
+        }
+        if self.cached_tail >= self.len {
+            return Err(QueueError::InvalidTail(self.cached_tail));
+        }
+        let command: spec::Command = mem
+            .read_plain(self.gpa.wrapping_add(self.head as u64 * 64))
+            .map_err(QueueError::Memory)?;
+
+        self.head = advance(self.head, self.len);
+        Ok(command)
+    }
+
+    pub fn sqhd(&self) -> u16 {
+        self.head as u16
+    }
+
+    /// This function lets the driver know what doorbell value we consumed, allowing
+    /// it to elide the next ring, maybe.
+    pub fn advance_evt_idx(&mut self, mem: &GuestMemory) -> Result<(), QueueError> {
+        self.evt_idx = advance(self.evt_idx, self.len);
+        if let Some(shadow_db_evt_idx) = self.shadow_db_evt_idx {
+            mem.write_plain(shadow_db_evt_idx.event_idx_gpa, &self.evt_idx)
+                .map_err(QueueError::Memory)?;
+        }
+        Ok(())
+    }
+
+    /// This function updates the shadow doorbell values of a queue that is
+    /// potentially already in use.
+    pub fn update_shadow_db(&mut self, mem: &GuestMemory, sdb: ShadowDoorbell) {
+        self.shadow_db_evt_idx = Some(sdb);
+        self.evt_idx = self.cached_tail;
+        // Write the illegal value out to the buffer, so that we can tell
+        // if Linux has ever written a valid value.
+        let _ = mem.write_plain(sdb.shadow_db_gpa, &ILLEGAL_DOORBELL_VALUE);
+    }
+}
+
+#[derive(Inspect)]
+pub struct CompletionQueue {
+    #[inspect(hex)]
+    tail: u32,
+    #[inspect(hex)]
+    cached_head: u32,
+    head: Arc<DoorbellRegister>,
+    phase: bool,
+    #[inspect(hex)]
+    gpa: u64,
+    #[inspect(hex)]
+    len: u32,
+    #[inspect(with = "Option::is_some")]
+    interrupt: Option<Interrupt>,
+    shadow_db_evt_idx: Option<ShadowDoorbell>,
+}
+
+impl CompletionQueue {
+    pub fn new(
+        head: Arc<DoorbellRegister>,
+        interrupt: Option<Interrupt>,
+        gpa: u64,
+        len: u16,
+        shadow_db_evt_idx: Option<ShadowDoorbell>,
+    ) -> Self {
+        head.write(0);
+        Self {
+            tail: 0,
+            cached_head: 0,
+            head,
+            phase: true,
+            gpa,
+            len: len.into(),
+            interrupt,
+            shadow_db_evt_idx,
+        }
+    }
+
+    /// Wait for free completions.
+    pub async fn wait_ready(&mut self, mem: &GuestMemory) -> Result<(), QueueError> {
+        let next_tail = advance(self.tail, self.len);
+        // If shadow doorbells are in use, use that instead of what was written to the doorbell
+        // register, as it may be more current.
+        if let Some(shadow_db_evt_idx) = self.shadow_db_evt_idx {
+            let shadow_head = mem
+                .read_plain(shadow_db_evt_idx.shadow_db_gpa)
+                .map_err(QueueError::Memory)?;
+
+            // ILLEGAL_DOORBELL_VALUE is the initial state. The guest will overwrite
+            // it when it first uses the shadow.
+            if shadow_head != ILLEGAL_DOORBELL_VALUE {
+                self.cached_head = shadow_head;
+                self.head.write(self.cached_head);
+            }
+        }
+        while self.cached_head == next_tail {
+            self.cached_head = self.head.wait_read(self.cached_head).await;
+        }
+        if self.cached_head >= self.len {
+            return Err(QueueError::InvalidHead(self.cached_head));
+        }
+        Ok(())
+    }
+
+    pub fn write(
+        &mut self,
+        mem: &GuestMemory,
+        mut data: spec::Completion,
+    ) -> Result<bool, QueueError> {
+        if self.cached_head == advance(self.tail, self.len) {
+            return Ok(false);
+        }
+        data.status.set_phase(self.phase);
+
+        // Atomically write the low part of the completion entry first, then the
+        // high part, using release fences to ensure ordering.
+        //
+        // This is necessary to ensure the guest can observe the full completion
+        // once it observes the phase bit change (which is in the high part).
+ let [low, high]: [u64; 2] = zerocopy::transmute!(data); + let gpa = self.gpa.wrapping_add(self.tail as u64 * 16); + mem.write_plain(gpa, &low).map_err(QueueError::Memory)?; + std::sync::atomic::fence(Ordering::Release); + mem.write_plain(gpa + 8, &high) + .map_err(QueueError::Memory)?; + std::sync::atomic::fence(Ordering::Release); + + if let Some(interrupt) = &self.interrupt { + interrupt.deliver(); + } + self.tail = advance(self.tail, self.len); + if self.tail == 0 { + self.phase = !self.phase; + } + Ok(true) + } + + /// This method updates the EVT_IDX field to match the shadow doorbell + /// value, thus signalling to the guest driver that the next completion + /// removed should involve a doorbell ring. In this emulator, such + /// a thing (the ring) is only necessary when the number of un-spoken-for + /// completion queue entries is getting small. (Completion queue entries + /// are spoken for when a command is removed from the SQ). + pub fn catch_up_evt_idx( + &mut self, + force: bool, + io_outstanding: u32, + mem: &GuestMemory, + ) -> Result<(), QueueError> { + if let Some(shadow_db_evt_idx) = self.shadow_db_evt_idx { + if force | (io_outstanding >= self.len - 3) { + mem.write_plain(shadow_db_evt_idx.event_idx_gpa, &self.cached_head) + .map_err(QueueError::Memory)?; + } + } + Ok(()) + } + + /// This function updates the shadow doorbell values of a queue that is + /// potentially already in use. + pub fn update_shadow_db(&mut self, mem: &GuestMemory, sdb: ShadowDoorbell) { + self.shadow_db_evt_idx = Some(sdb); + // Write the illegal value out to the buffer, so that we can tell + // if Linux has ever written a valid value. + let _ = mem.write_plain(sdb.shadow_db_gpa, &ILLEGAL_DOORBELL_VALUE); + } +} + +fn advance(n: u32, l: u32) -> u32 { + if n + 1 < l { n + 1 } else { 0 } +} diff --git a/vm/devices/storage/nvme_test/src/resolver.rs b/vm/devices/storage/nvme_test/src/resolver.rs new file mode 100644 index 0000000000..92e4f10c9c --- /dev/null +++ b/vm/devices/storage/nvme_test/src/resolver.rs @@ -0,0 +1,92 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Resource resolver for the nvme controller. + +use crate::FaultConfiguration; +use crate::NsidConflict; +use crate::NvmeFaultController; +use crate::NvmeFaultControllerCaps; +use async_trait::async_trait; +use disk_backend::resolve::ResolveDiskParameters; +use nvme_resources::NamespaceDefinition; +use nvme_resources::NvmeControllerHandle; +use pci_resources::ResolvePciDeviceHandleParams; +use pci_resources::ResolvedPciDevice; +use thiserror::Error; +use vm_resource::AsyncResolveResource; +use vm_resource::ResolveError; +use vm_resource::ResourceResolver; +use vm_resource::declare_static_async_resolver; +use vm_resource::kind::PciDeviceHandleKind; + +/// Resource resolver for [`NvmeControllerHandle`]. +pub struct NvmeControllerResolver; + +declare_static_async_resolver! { + NvmeControllerResolver, + (PciDeviceHandleKind, NvmeControllerHandle), +} + +/// Error returned by [`NvmeControllerResolver`]. 
+#[derive(Debug, Error)]
+#[expect(missing_docs)]
+pub enum Error {
+    #[error("failed to resolve namespace {nsid}")]
+    NamespaceResolve {
+        nsid: u32,
+        #[source]
+        source: ResolveError,
+    },
+    #[error(transparent)]
+    NsidConflict(NsidConflict),
+}
+
+#[async_trait]
+impl AsyncResolveResource<PciDeviceHandleKind, NvmeControllerHandle> for NvmeControllerResolver {
+    type Output = ResolvedPciDevice;
+    type Error = Error;
+
+    async fn resolve(
+        &self,
+        resolver: &ResourceResolver,
+        resource: NvmeControllerHandle,
+        input: ResolvePciDeviceHandleParams<'_>,
+    ) -> Result<Self::Output, Self::Error> {
+        let controller = NvmeFaultController::new(
+            input.driver_source,
+            input.guest_memory.clone(),
+            input.register_msi,
+            input.register_mmio,
+            NvmeFaultControllerCaps {
+                msix_count: resource.msix_count,
+                max_io_queues: resource.max_io_queues,
+                subsystem_id: resource.subsystem_id,
+            },
+            FaultConfiguration { admin_fault: None },
+        );
+        for NamespaceDefinition {
+            nsid,
+            read_only,
+            disk,
+        } in resource.namespaces
+        {
+            let disk = resolver
+                .resolve(
+                    disk,
+                    ResolveDiskParameters {
+                        read_only,
+                        driver_source: input.driver_source,
+                    },
+                )
+                .await
+                .map_err(|source| Error::NamespaceResolve { nsid, source })?;
+            controller
+                .client()
+                .add_namespace(nsid, disk.0)
+                .await
+                .map_err(Error::NsidConflict)?;
+        }
+        Ok(controller.into())
+    }
+}
diff --git a/vm/devices/storage/nvme_test/src/tests.rs b/vm/devices/storage/nvme_test/src/tests.rs
new file mode 100644
index 0000000000..f0a40aba7e
--- /dev/null
+++ b/vm/devices/storage/nvme_test/src/tests.rs
@@ -0,0 +1,6 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+mod controller_tests;
+mod shadow_doorbell_tests;
+mod test_helpers;
diff --git a/vm/devices/storage/nvme_test/src/tests/controller_tests.rs b/vm/devices/storage/nvme_test/src/tests/controller_tests.rs
new file mode 100644
index 0000000000..8a7008184e
--- /dev/null
+++ b/vm/devices/storage/nvme_test/src/tests/controller_tests.rs
@@ -0,0 +1,419 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
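+
+// These tests drive `NvmeFaultController` directly through PCI config space
+// and BAR0 MMIO accesses (no guest OS involved), building admin queues in
+// test guest memory via the `test_helpers` module.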
+ +use super::test_helpers::TestNvmeMmioRegistration; +use crate::BAR0_LEN; +use crate::FaultConfiguration; +use crate::NvmeFaultController; +use crate::NvmeFaultControllerCaps; +use crate::PAGE_SIZE64; +use crate::QueueFault; +use crate::QueueFaultBehavior; +use crate::prp::PrpRange; +use crate::spec; +use crate::tests::test_helpers::read_completion_from_queue; +use crate::tests::test_helpers::test_memory; +use crate::tests::test_helpers::write_command_to_queue; +use chipset_device::mmio::MmioIntercept; +use chipset_device::pci::PciConfigSpace; +use guestmem::GuestMemory; +use guid::Guid; +use nvme_spec::Command; +use nvme_spec::Completion; +use pal_async::DefaultDriver; +use pal_async::async_test; +use pci_core::msi::MsiInterruptSet; +use pci_core::test_helpers::TestPciInterruptController; +use user_driver::backoff::Backoff; +use vmcore::vm_task::SingleDriverBackend; +use vmcore::vm_task::VmTaskDriverSource; +use zerocopy::FromZeros; +use zerocopy::IntoBytes; + +fn instantiate_controller( + driver: DefaultDriver, + gm: &GuestMemory, + int_controller: Option<&TestPciInterruptController>, + fault_configuration: FaultConfiguration, +) -> NvmeFaultController { + let mut mmio_reg = TestNvmeMmioRegistration {}; + let vm_task_driver = &VmTaskDriverSource::new(SingleDriverBackend::new(driver)); + let mut msi_interrupt_set = MsiInterruptSet::new(); + let controller = NvmeFaultController::new( + vm_task_driver, + gm.clone(), + &mut msi_interrupt_set, + &mut mmio_reg, + NvmeFaultControllerCaps { + msix_count: 64, + max_io_queues: 64, + subsystem_id: Guid::new_random(), + }, + fault_configuration, + ); + + if let Some(intc) = int_controller { + msi_interrupt_set.connect(intc); + } + controller +} + +fn write_msix_table_entry( + controller: &mut NvmeFaultController, + table_index: u16, + address: u64, + data: u32, + masked: bool, +) { + // This code works by writing to MMIO space, as if all the BARs are squished together. + // The first BAR is of length DOORBELLS.end. The MSI-X table comes after that. + let mmio_address = BAR0_LEN + (table_index as u64 * 16); + let mut data_control = data as u64; + if masked { + data_control |= 1u64 << 32; + } + controller + .mmio_write(mmio_address, address.as_bytes()) + .unwrap(); + controller + .mmio_write(mmio_address + 8, data_control.as_bytes()) + .unwrap(); +} + +pub async fn wait_for_msi( + driver: DefaultDriver, + intc: &TestPciInterruptController, + timeout_in_milliseconds: u32, + expected_address: u64, + expected_data: u32, +) { + let wait_periods = timeout_in_milliseconds / 10; + let mut backoff = Backoff::new(&driver); + + for _i in 0..wait_periods { + let int = intc.get_next_interrupt(); + if let Some(int_inner) = int { + assert_eq!(int_inner.0, expected_address); + assert_eq!(int_inner.1, expected_data); + return; + } + + backoff.back_off().await; + } + + // Should never drop off the end, here. + panic!(); +} + +pub async fn instantiate_and_build_admin_queue( + acq_buffer: &PrpRange, + acq_entries: u32, + asq_buffer: &PrpRange, + asq_entries: u32, + trigger_interrupt: bool, + int_controller: Option<&TestPciInterruptController>, + driver: DefaultDriver, + gm: &GuestMemory, + fault_configuration: FaultConfiguration, +) -> NvmeFaultController { + let mut nvmec = instantiate_controller(driver.clone(), gm, int_controller, fault_configuration); + // Set the BARs. + nvmec.pci_cfg_write(0x10, 0).unwrap(); + nvmec.pci_cfg_write(0x20, BAR0_LEN as u32).unwrap(); + + // Find the MSI-X cap struct. 
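+    // (Capability list walk: config offset 0x34 holds the pointer to the
+    // first capability; each capability begins with a one-byte ID -- 0x11 is
+    // MSI-X -- followed by a one-byte pointer to the next capability, with 0
+    // terminating the list.)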
+ let mut cfg_dword = 0; + nvmec.pci_cfg_read(0x34, &mut cfg_dword).unwrap(); + cfg_dword &= 0xff; + loop { + // Read a cap struct header and pull out the fields. + let mut cap_header = 0; + nvmec + .pci_cfg_read(cfg_dword as u16, &mut cap_header) + .unwrap(); + if cap_header & 0xff == 0x11 { + // Read the table BIR and offset. + let mut table_loc = 0; + nvmec + .pci_cfg_read(cfg_dword as u16 + 4, &mut table_loc) + .unwrap(); + // Code in other places assumes that the MSI-X table is at the beginning + // of BAR 4. If this becomes a fluid concept, capture the values + // here and use them, rather than just asserting on them. + assert_eq!(table_loc & 0x7, 4); + assert_eq!(table_loc >> 3, 0); + + // Found MSI-X, enable it. + nvmec.pci_cfg_write(cfg_dword as u16, 0x80000000).unwrap(); + break; + } + // Isolate the ptr to the next cap struct. + cfg_dword = (cap_header >> 8) & 0xff; + if cfg_dword == 0 { + // Hit the end. + panic!(); + } + } + + // Turn on MMIO access by writing to the Command register in config space. Enable + // MMIO and DMA. + nvmec.pci_cfg_write(4, 6).unwrap(); + + // Set the ACQ base. + let base = acq_buffer.range().gpns()[0] * PAGE_SIZE64; + nvmec.write_bar0(0x30, base.as_bytes()).unwrap(); + + // Set ASQ base. + let base = asq_buffer.range().gpns()[0] * PAGE_SIZE64; + nvmec.write_bar0(0x28, base.as_bytes()).unwrap(); + + // Set AQA. + let aqa: u32 = (asq_entries - 1) | ((acq_entries - 1) << 16); + nvmec.write_bar0(0x24, aqa.as_bytes()).unwrap(); + + // Set MSI-X table entry for the admin queue. + write_msix_table_entry(&mut nvmec, 0, 0xfeed0000, 0x1111, !trigger_interrupt); + + let mut backoff = Backoff::new(&driver); + + // Enable the controller. + let mut dword = 0u32; + nvmec.read_bar0(0x14, dword.as_mut_bytes()).unwrap(); + dword |= 1; + nvmec.write_bar0(0x14, dword.as_bytes()).unwrap(); + backoff.back_off().await; + nvmec.read_bar0(0x14, dword.as_mut_bytes()).unwrap(); + assert!(dword & 1 != 0); + + // Read CSTS + let mut ready = false; + for _i in 0..5 { + nvmec.read_bar0(0x1c, dword.as_mut_bytes()).unwrap(); + let csts = spec::Csts::from(dword); + assert_eq!(csts.cfs(), false); + if csts.rdy() { + ready = true; + break; + } + backoff.back_off().await; + } + assert!(ready); + nvmec +} + +#[async_test] +async fn test_basic_registers(driver: DefaultDriver) { + let gm = test_memory(); + let fault_configuration = FaultConfiguration { admin_fault: None }; + let mut nvmec = instantiate_controller(driver, &gm, None, fault_configuration); + let mut dword = 0u32; + + // Read controller caps, version. + nvmec.read_bar0(0, dword.as_mut_bytes()).unwrap(); + assert_eq!(dword, 0xFF0100FF); + let mut qword = 0u64; + nvmec.read_bar0(0, qword.as_mut_bytes()).unwrap(); + assert_eq!(qword, 0x20FF0100FF); + nvmec.read_bar0(8, dword.as_mut_bytes()).unwrap(); + assert_eq!(dword, 0x20000); + + // Read ACQ and write it back, see that it sticks. 
+ nvmec.read_bar0(0x30, qword.as_mut_bytes()).unwrap(); + assert_eq!(qword, 0); + qword = 0x1000; + nvmec.write_bar0(0x30, qword.as_bytes()).unwrap(); + nvmec.read_bar0(0x30, qword.as_mut_bytes()).unwrap(); + assert_eq!(qword, 0x1000); +} + +#[async_test] +async fn test_invalid_configuration(driver: DefaultDriver) { + let gm = test_memory(); + let fault_configuration = FaultConfiguration { admin_fault: None }; + let mut nvmec = instantiate_controller(driver, &gm, None, fault_configuration); + let mut dword = 0u32; + nvmec.read_bar0(0x14, dword.as_mut_bytes()).unwrap(); + // Set MPS to some disallowed value + dword |= 0x380; + nvmec.write_bar0(0x14, dword.as_bytes()).unwrap(); + // Read CSTS, expect fatal error + nvmec.read_bar0(0x1c, dword.as_mut_bytes()).unwrap(); + assert!(dword & 2 != 0); +} + +#[async_test] +async fn test_enable_controller(driver: DefaultDriver) { + let gm = test_memory(); + let fault_configuration = FaultConfiguration { admin_fault: None }; + let mut nvmec = instantiate_controller(driver, &gm, None, fault_configuration); + + // Set the ACQ base to 0x1000 and the ASQ base to 0x2000. + let mut qword = 0x1000; + nvmec.write_bar0(0x30, qword.as_bytes()).unwrap(); + qword = 0x2000; + nvmec.write_bar0(0x28, qword.as_bytes()).unwrap(); + + // Set the queues so that they have four entries apiece. + let mut dword = 0x30003; + nvmec.write_bar0(0x24, dword.as_bytes()).unwrap(); + + // Enable the controller. + nvmec.read_bar0(0x14, dword.as_mut_bytes()).unwrap(); + dword |= 1; + nvmec.write_bar0(0x14, dword.as_bytes()).unwrap(); + nvmec.read_bar0(0x14, dword.as_mut_bytes()).unwrap(); + assert!(dword & 1 != 0); + + // Read CSTS + nvmec.read_bar0(0x1c, dword.as_mut_bytes()).unwrap(); + assert!(dword & 2 == 0); +} + +#[async_test] +async fn test_multi_page_admin_queues(driver: DefaultDriver) { + let gm = test_memory(); + let fault_configuration = FaultConfiguration { admin_fault: None }; + let mut nvmec = instantiate_controller(driver, &gm, None, fault_configuration); + + // Set the ACQ base to 0x1000 and the ASQ base to 0x3000. + let mut qword = 0x1000; + nvmec.write_bar0(0x30, qword.as_bytes()).unwrap(); + qword = 0x3000; + nvmec.write_bar0(0x28, qword.as_bytes()).unwrap(); + + // Set the queues so that they have 512 entries apiece. + let mut dword = 0x1ff01ff; + nvmec.write_bar0(0x24, dword.as_bytes()).unwrap(); + + // Enable the controller. + nvmec.read_bar0(0x14, dword.as_mut_bytes()).unwrap(); + dword |= 1; + nvmec.write_bar0(0x14, dword.as_bytes()).unwrap(); + nvmec.read_bar0(0x14, dword.as_mut_bytes()).unwrap(); + assert!(dword & 1 != 0); + + // Read CSTS + nvmec.read_bar0(0x1c, dword.as_mut_bytes()).unwrap(); + assert!(dword & 2 == 0); +} + +async fn send_identify( + driver: DefaultDriver, + fault_configuration: FaultConfiguration, +) -> Completion { + let dm1 = PrpRange::new(vec![0], 0, PAGE_SIZE64).unwrap(); + let dm2 = PrpRange::new(vec![0x1000], 0, PAGE_SIZE64).unwrap(); + let gm = test_memory(); + let int_controller = TestPciInterruptController::new(); + + // Build a controller with 64 entries in the admin queue (just so that the ASQ fits in one page). + let mut nvmec = instantiate_and_build_admin_queue( + &dm1, + 64, + &dm2, + 64, + true, + Some(&int_controller), + driver.clone(), + &gm, + fault_configuration, + ) + .await; + + // There should be no MSI-X triggered at this point. 
+    let next_int = int_controller.get_next_interrupt();
+    assert!(next_int.is_none());
+
+    // Construct an admin queue command into the first entry in the ASQ, which is at 0x1000 in the "test memory".
+    let mut entry = Command::new_zeroed();
+    entry.cdw0.set_opcode(spec::AdminOpcode::IDENTIFY.0);
+    let cdw10 = spec::Cdw10Identify::new().with_cns(spec::Cns::CONTROLLER.0);
+    entry.cdw10 = u32::from(cdw10);
+    entry.dptr[0] = 1;
+
+    write_command_to_queue(&gm, &dm2, 0, &entry);
+
+    // Ring the admin queue doorbell.
+    nvmec.write_bar0(0x1000, 1u32.as_bytes()).unwrap();
+
+    wait_for_msi(driver.clone(), &int_controller, 1000, 0xfeed0000, 0x1111).await;
+
+    read_completion_from_queue(&gm, &dm1, 0)
+}
+
+#[async_test]
+async fn test_send_identify_no_fault(driver: DefaultDriver) {
+    let fault_configuration = FaultConfiguration { admin_fault: None };
+    let cqe = send_identify(driver, fault_configuration).await;
+
+    assert_eq!(cqe.status.status(), spec::Status::SUCCESS.0);
+    assert_eq!(cqe.cid, 0);
+}
+
+struct TestAdminSQFault;
+
+#[async_trait::async_trait]
+impl QueueFault for TestAdminSQFault {
+    async fn fault_submission_queue(&self, mut command: Command) -> QueueFaultBehavior<Command> {
+        let opcode = nvme_spec::AdminOpcode(command.cdw0.opcode());
+        match opcode {
+            nvme_spec::AdminOpcode::IDENTIFY => {
+                // Overwrite the previous cid to cause a panic.
+                command.cdw0.set_cid(10);
+                QueueFaultBehavior::Update(command)
+            }
+            _ => QueueFaultBehavior::Default,
+        }
+    }
+
+    async fn fault_completion_queue(
+        &self,
+        _completion: Completion,
+    ) -> QueueFaultBehavior<Completion> {
+        QueueFaultBehavior::Default
+    }
+}
+
+#[async_test]
+async fn test_send_identify_with_sq_fault(driver: DefaultDriver) {
+    let fault_configuration = FaultConfiguration {
+        admin_fault: Some(Box::new(TestAdminSQFault)),
+    };
+    let cqe = send_identify(driver, fault_configuration).await;
+
+    assert_eq!(cqe.status.status(), spec::Status::SUCCESS.0);
+    assert_eq!(cqe.cid, 10); // The CID should have been overwritten by the fault.
+}
+
+struct TestAdminCQFault;
+
+#[async_trait::async_trait]
+impl QueueFault for TestAdminCQFault {
+    async fn fault_submission_queue(&self, _command: Command) -> QueueFaultBehavior<Command> {
+        QueueFaultBehavior::Default
+    }
+
+    async fn fault_completion_queue(
+        &self,
+        mut completion: Completion,
+    ) -> QueueFaultBehavior<Completion> {
+        let status = spec::Status(completion.status.status());
+        match status {
+            spec::Status::SUCCESS => {
+                // Overwrite the status to make the completion report a failure.
+                completion.status.set_status(spec::Status::INVALID_FORMAT.0);
+                QueueFaultBehavior::Update(completion)
+            }
+            _ => QueueFaultBehavior::Default,
+        }
+    }
+}
+
+#[async_test]
+async fn test_cq_fault(driver: DefaultDriver) {
+    let fault_configuration = FaultConfiguration {
+        admin_fault: Some(Box::new(TestAdminCQFault)),
+    };
+    let cqe = send_identify(driver, fault_configuration).await;
+    assert_eq!(cqe.status.status(), spec::Status::INVALID_FORMAT.0); // Status should be overwritten by the fault.
+}
diff --git a/vm/devices/storage/nvme_test/src/tests/shadow_doorbell_tests.rs b/vm/devices/storage/nvme_test/src/tests/shadow_doorbell_tests.rs
new file mode 100644
index 0000000000..4dfb828f67
--- /dev/null
+++ b/vm/devices/storage/nvme_test/src/tests/shadow_doorbell_tests.rs
@@ -0,0 +1,262 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
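+
+//! Tests for shadow doorbell (Doorbell Buffer Config) support.
+//!
+//! A quick worked example of the update rule exercised below (see the spec
+//! excerpt quoted in `test_setup_sq_ring_with_shadow`): with queue depth `d`,
+//! previous shadow doorbell value `old`, new value `new`, and EventIdx value
+//! `evt`, the guest must ring the real doorbell iff
+//! `(new - evt) % d <= (new - old) % d`. For instance, with `d = 64`,
+//! `evt = 1`, `old = 1`, and `new = 2`: X = 1 and Y = 1, so X <= Y and the
+//! doorbell must be rung.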
+
+use crate::DOORBELL_STRIDE_BITS;
+use crate::FaultConfiguration;
+use crate::PAGE_SIZE64;
+use crate::prp::PrpRange;
+use crate::queue::ShadowDoorbell;
+use crate::spec;
+use crate::tests::controller_tests::instantiate_and_build_admin_queue;
+use crate::tests::controller_tests::wait_for_msi;
+use crate::tests::test_helpers::read_completion_from_queue;
+use crate::tests::test_helpers::test_memory;
+use crate::tests::test_helpers::write_command_to_queue;
+use guestmem::GuestMemory;
+use pal_async::DefaultDriver;
+use pal_async::async_test;
+use pci_core::test_helpers::TestPciInterruptController;
+use user_driver::backoff::Backoff;
+use zerocopy::FromZeros;
+use zerocopy::IntoBytes;
+
+const CQ_BASE: u64 = 0x0;
+const SQ_BASE: u64 = 0x1000;
+const DOORBELL_BUFFER_BASE: u64 = 0x2000;
+const EVT_IDX_BUFFER_BASE: u64 = 0x3000;
+const DCQ_BASE: u64 = 0x4000;
+const DSQ_BASE: u64 = 0x5000;
+
+/// Sets up an admin queue with a single command to configure the shadow doorbells. Leaves the
+/// admin queue tail at 1 if `dq_bases` is `None`, or at 3 otherwise.
+async fn setup_shadow_doorbells(
+    driver: DefaultDriver,
+    cq_buf: &PrpRange,
+    sq_buf: &PrpRange,
+    gm: &GuestMemory,
+    int_controller: &TestPciInterruptController,
+    dq_bases: Option<(u64, u64)>,
+) -> crate::NvmeFaultController {
+    let fault_configuration = FaultConfiguration { admin_fault: None };
+    // Build a controller with 64 entries in the admin queue (just so that the ASQ fits in one page).
+    let mut nvmec = instantiate_and_build_admin_queue(
+        cq_buf,
+        64,
+        sq_buf,
+        64,
+        true,
+        Some(int_controller),
+        driver.clone(),
+        gm,
+        fault_configuration,
+    )
+    .await;
+
+    let mut slot = 0;
+    let mut backoff = Backoff::new(&driver);
+
+    if let Some((cq_base, sq_base)) = dq_bases {
+        let mut command = spec::Command::new_zeroed();
+        command
+            .cdw0
+            .set_opcode(spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0);
+        command.cdw10 = spec::Cdw10CreateIoQueue::new()
+            .with_qid(1)
+            .with_qsize_z(16)
+            .into();
+        command.cdw11 = spec::Cdw11CreateIoCompletionQueue::new()
+            .with_pc(true)
+            .with_ien(false)
+            .with_iv(0)
+            .into();
+        command.dptr[0] = cq_base;
+        command.cdw0.set_cid(10);
+
+        write_command_to_queue(gm, sq_buf, slot, &command);
+        // Ring the admin queue doorbell. Doorbell base is 0x1000.
+        nvmec
+            .write_bar0(0x1000, (slot as u32 + 1).as_bytes())
+            .unwrap();
+        backoff.back_off().await;
+        wait_for_msi(driver.clone(), int_controller, 1000, 0xfeed0000, 0x1111).await;
+        backoff.back_off().await;
+        let cqe = read_completion_from_queue(gm, cq_buf, slot);
+        assert_eq!(cqe.cid, 10);
+        assert_eq!(
+            cqe.status.status(),
+            spec::Status::SUCCESS.status_code() as u16
+        );
+
+        slot += 1;
+        let mut command = spec::Command::new_zeroed();
+        command
+            .cdw0
+            .set_opcode(spec::AdminOpcode::CREATE_IO_SUBMISSION_QUEUE.0);
+        command.cdw10 = spec::Cdw10CreateIoQueue::new()
+            .with_qid(1)
+            .with_qsize_z(16)
+            .into();
+        command.cdw11 = spec::Cdw11CreateIoSubmissionQueue::new()
+            .with_pc(true)
+            .with_qprio(0)
+            .with_cqid(1)
+            .into();
+        command.dptr[0] = sq_base;
+        command.cdw0.set_cid(11);
+
+        write_command_to_queue(gm, sq_buf, slot, &command);
+        nvmec
+            .write_bar0(0x1000, (slot as u32 + 1).as_bytes())
+            .unwrap();
+        backoff.back_off().await;
+        wait_for_msi(driver.clone(), int_controller, 1000, 0xfeed0000, 0x1111).await;
+        backoff.back_off().await;
+        let cqe = read_completion_from_queue(gm, cq_buf, slot);
+        assert_eq!(cqe.cid, 11);
+        assert_eq!(
+            cqe.status.status(),
+            spec::Status::SUCCESS.status_code() as u16
+        );
+
+        slot += 1;
+    }
+
+    let mut command = spec::Command::new_zeroed();
+    command
+        .cdw0
+        .set_opcode(spec::AdminOpcode::DOORBELL_BUFFER_CONFIG.0);
+    command.dptr[0] = DOORBELL_BUFFER_BASE;
+    command.dptr[1] = EVT_IDX_BUFFER_BASE;
+
+    write_command_to_queue(gm, sq_buf, slot, &command);
+
+    // Update the shadow doorbell, so that uninitialized (or zeroed) memory
+    // doesn't get immediately misinterpreted as a doorbell value.
+    let new_sq_db = slot as u32 + 1;
+    gm.write_plain::<u32>(DOORBELL_BUFFER_BASE, &new_sq_db)
+        .unwrap();
+    nvmec.write_bar0(0x1000, new_sq_db.as_bytes()).unwrap();
+
+    backoff.back_off().await;
+    wait_for_msi(driver.clone(), int_controller, 1000, 0xfeed0000, 0x1111).await;
+    backoff.back_off().await;
+
+    let cqe = read_completion_from_queue(gm, cq_buf, slot);
+    assert_eq!(
+        cqe.status.status(),
+        spec::Status::SUCCESS.status_code() as u16
+    );
+
+    nvmec
+}
+
+#[async_test]
+async fn test_setup_shadow_doorbells(driver: DefaultDriver) {
+    let cq_buf = PrpRange::new(vec![CQ_BASE], 0, PAGE_SIZE64).unwrap();
+    let sq_buf = PrpRange::new(vec![SQ_BASE], 0, PAGE_SIZE64).unwrap();
+    let _sdb_buf = PrpRange::new(vec![DOORBELL_BUFFER_BASE], 0, PAGE_SIZE64).unwrap();
+    let _evt_idx_buf = PrpRange::new(vec![EVT_IDX_BUFFER_BASE], 0, PAGE_SIZE64).unwrap();
+    let gm = test_memory();
+    let int_controller = TestPciInterruptController::new();
+
+    setup_shadow_doorbells(driver.clone(), &cq_buf, &sq_buf, &gm, &int_controller, None).await;
+}
+
+#[async_test]
+async fn test_setup_sq_ring_with_shadow(driver: DefaultDriver) {
+    let cq_buf = PrpRange::new(vec![CQ_BASE], 0, PAGE_SIZE64).unwrap();
+    let sq_buf = PrpRange::new(vec![SQ_BASE], 0, PAGE_SIZE64).unwrap();
+    let gm = test_memory();
+    let int_controller = TestPciInterruptController::new();
+    let sdb_base = ShadowDoorbell {
+        shadow_db_gpa: DOORBELL_BUFFER_BASE,
+        event_idx_gpa: EVT_IDX_BUFFER_BASE,
+    };
+    let sq_sdb = ShadowDoorbell::new(sdb_base, 0, true, DOORBELL_STRIDE_BITS.into());
+    let mut backoff = Backoff::new(&driver);
+
+    // Check that the old value was 0, just to be sure.
+    let sdb = gm.read_plain::<u32>(sq_sdb.shadow_db_gpa).unwrap();
+    assert_eq!(sdb, 0);
+
+    let mut nvmec =
+        setup_shadow_doorbells(driver.clone(), &cq_buf, &sq_buf, &gm, &int_controller, None).await;
+
+    let sdb = gm.read_plain::<u32>(sq_sdb.shadow_db_gpa).unwrap();
+    assert_eq!(sdb, crate::queue::ILLEGAL_DOORBELL_VALUE);
+
+    /* From the NVMe Spec (ver. 2.0a):
+    B.5 Updating Controller Doorbell Properties using a Shadow Doorbell Buffer
+
+    B.5.1. Shadow Doorbell Buffer Overview
+    Controllers that support the Doorbell Buffer Config command are typically emulated controllers where this
+    feature is used to enhance the performance of host software running in Virtual Machines. If supported by
+    the controller, host software may enable Shadow Doorbell buffers by submitting the Doorbell Buffer Config
+    command (refer to section 5.8).
+
+    After the completion of the Doorbell Buffer Config command, host software shall submit commands by
+    updating the appropriate entry in the Shadow Doorbell buffer instead of updating the controller's
+    corresponding doorbell property. If updating an entry in the Shadow Doorbell buffer changes the value from
+    being less than or equal to the value of the corresponding EventIdx buffer entry to being greater than that
+    value, then the host shall also update the controller's corresponding doorbell property to match the value
+    of that entry in the Shadow Doorbell buffer. Queue wrap conditions shall be taken into account in all
+    comparisons in this paragraph.
+
+    The controller may read from the Shadow Doorbell buffer and update the EventIdx buffer at any time (e.g.,
+    before the host writes to the controller's doorbell property).
+
+    B.5.2. Example Algorithm for Controller Doorbell Property Updates
+    Host software may use modular arithmetic where the modulus is the queue depth to decide if the controller
+    doorbell property should be updated, specifically:
+
+    • Compute X as the new doorbell value minus the corresponding EventIdx value, modulo queue
+    depth; and
+    • Compute Y as the new doorbell value minus the old doorbell value in the shadow doorbell buffer,
+    also modulo queue depth.
+
+    If X is less than or equal to Y, the controller doorbell property should be updated with the new doorbell
+    value.
+    */
+
+    // First, put one command into the SQ and check the EVT_IDX value.
+
+    let mut entry = spec::Command::new_zeroed();
+    entry.cdw0.set_opcode(spec::AdminOpcode::IDENTIFY.0);
+    let cdw10 = spec::Cdw10Identify::new().with_cns(spec::Cns::CONTROLLER.0);
+    entry.cdw10 = u32::from(cdw10);
+    entry.dptr[0] = 0x6000; // unused
+    write_command_to_queue(&gm, &sq_buf, 1, &entry);
+
+    let new_sq_db = 2u32;
+    // Update the shadow.
+    gm.write_plain::<u32>(sq_sdb.shadow_db_gpa, &new_sq_db)
+        .unwrap();
+
+    // Ring the admin queue doorbell.
+    nvmec.write_bar0(0x1000, new_sq_db.as_bytes()).unwrap();
+    backoff.back_off().await;
+
+    let sq_evt_idx = gm.read_plain::<u32>(sq_sdb.event_idx_gpa).unwrap();
+    assert_eq!(sq_evt_idx, 2);
+}
+
+#[async_test]
+async fn test_update_data_queues_with_shadow_doorbells(driver: DefaultDriver) {
+    let cq_buf = PrpRange::new(vec![CQ_BASE], 0, PAGE_SIZE64).unwrap();
+    let sq_buf = PrpRange::new(vec![SQ_BASE], 0, PAGE_SIZE64).unwrap();
+    let gm = test_memory();
+    let int_controller = TestPciInterruptController::new();
+    let mut backoff = Backoff::new(&driver);
+
+    let _nvmec = setup_shadow_doorbells(
+        driver.clone(),
+        &cq_buf,
+        &sq_buf,
+        &gm,
+        &int_controller,
+        Some((DCQ_BASE, DSQ_BASE)),
+    )
+    .await;
+
+    backoff.back_off().await;
+}
diff --git a/vm/devices/storage/nvme_test/src/tests/test_helpers.rs b/vm/devices/storage/nvme_test/src/tests/test_helpers.rs
new file mode 100644
index 0000000000..60a22f7eb6
--- /dev/null
+++ b/vm/devices/storage/nvme_test/src/tests/test_helpers.rs
@@ -0,0 +1,150 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! Mock types for unit-testing various NVMe behaviors.
+
+use crate::PAGE_SIZE;
+use crate::PAGE_SIZE64;
+use crate::prp::PrpRange;
+use crate::spec;
+use chipset_device::mmio::ControlMmioIntercept;
+use chipset_device::mmio::RegisterMmioIntercept;
+use guestmem::GuestMemory;
+use parking_lot::Mutex;
+use pci_core::msi::MsiControl;
+use pci_core::msi::MsiInterruptTarget;
+use std::collections::VecDeque;
+use std::sync::Arc;
+
+/// A test-only interrupt controller that simply stashes incoming interrupt
+/// requests in a FIFO queue. Implements [`MsiInterruptTarget`].
+#[derive(Debug, Clone)]
+pub struct TestPciInterruptController {
+    inner: Arc<TestPciInterruptControllerInner>,
+}
+
+#[derive(Debug)]
+struct TestPciInterruptControllerInner {
+    msi_requests: Mutex<VecDeque<(u64, u32)>>, // (addr, data)
+}
+
+impl TestPciInterruptController {
+    /// Return a new test PCI interrupt controller
+    #[expect(dead_code)]
+    pub fn new() -> Self {
+        Self {
+            inner: Arc::new(TestPciInterruptControllerInner {
+                msi_requests: Mutex::new(VecDeque::new()),
+            }),
+        }
+    }
+
+    /// Fetch the first (addr, data) MSI-X interrupt in the FIFO interrupt queue
+    #[expect(dead_code)]
+    pub fn get_next_interrupt(&self) -> Option<(u64, u32)> {
+        self.inner.msi_requests.lock().pop_front()
+    }
+}
+
+impl MsiInterruptTarget for TestPciInterruptController {
+    fn new_interrupt(&self) -> Box<dyn MsiControl> {
+        let controller = self.inner.clone();
+        Box::new(move |address, data| controller.msi_requests.lock().push_back((address, data)))
+    }
+}
+
+pub fn test_memory() -> GuestMemory {
+    GuestMemory::allocate(PAGE_SIZE * 64)
+}
+
+pub struct TestNvmeMmioRegistration {}
+
+/// A trait to register device-specific IO intercept regions.
+impl RegisterMmioIntercept for TestNvmeMmioRegistration {
+    /// Registers a new IO region of the given length.
+    fn new_io_region(&mut self, _debug_name: &str, _len: u64) -> Box<dyn ControlMmioIntercept> {
+        Box::new(TestNvmeControlMmioIntercept::new())
+    }
+}
+
+pub struct TestNvmeControlMmioIntercept {}
+
+impl TestNvmeControlMmioIntercept {
+    pub fn new() -> TestNvmeControlMmioIntercept {
+        TestNvmeControlMmioIntercept {}
+    }
+}
+
+/// A trait to map/unmap a device-specific IO memory region.
+impl ControlMmioIntercept for TestNvmeControlMmioIntercept {
+    /// Enables the IO region at the given address.
+    ///
+    /// This method will never fail, as devices are not expected to gracefully
+    /// handle the case where an IO region overlaps with an existing region.
+    fn map(&mut self, _addr: u64) {}
+
+    /// Disables the IO region.
+    fn unmap(&mut self) {}
+
+    /// Return the currently mapped address.
+    ///
+    /// Returns `None` if the region is currently unmapped.
+    fn addr(&self) -> Option<u64> {
+        None
+    }
+
+    fn len(&self) -> u64 {
+        8096
+    }
+
+    /// Return the offset of `addr` from the region's base address.
+    ///
+    /// Returns `None` if the provided `addr` is outside of the memory
+    /// region, or the region is currently unmapped.
+    ///
+    /// # Example
+    ///
+    /// ```ignore
+    /// let foo_region = register.new_io_region("foo", 0x10);
+    /// foo_region.map(0x1000);
+    /// assert_eq!(foo_region.offset_of(0x1003), Some(3));
+    /// assert_eq!(foo_region.offset_of(0x900), None);
+    /// assert_eq!(foo_region.offset_of(0x1020), None);
+    /// foo_region.unmap();
+    /// assert_eq!(foo_region.offset_of(0x1003), None);
+    /// ```
+    fn offset_of(&self, _addr: u64) -> Option<u64> {
+        None
+    }
+
+    fn region_name(&self) -> &str {
+        "???"
+    }
+}
+
+pub fn write_command_to_queue(
+    gm: &GuestMemory,
+    dm: &PrpRange,
+    slot: usize,
+    command: &spec::Command,
+) {
+    let offset_in_queue = slot * size_of::<spec::Command>();
+    let page_in_queue = offset_in_queue / PAGE_SIZE64 as usize;
+    let offset_in_page = offset_in_queue % PAGE_SIZE64 as usize;
+    let gpa = (dm.range().gpns()[page_in_queue] * PAGE_SIZE64) + offset_in_page as u64;
+
+    gm.write_plain::<spec::Command>(gpa, command).unwrap();
+}
+
+pub fn read_completion_from_queue(
+    gm: &GuestMemory,
+    dm: &PrpRange,
+    slot: usize,
+) -> spec::Completion {
+    let offset_in_queue = slot * size_of::<spec::Completion>();
+    let page_in_queue = offset_in_queue / PAGE_SIZE64 as usize;
+    let offset_in_page = offset_in_queue % PAGE_SIZE64 as usize;
+    let gpa = (dm.range().gpns()[page_in_queue] * PAGE_SIZE64) + offset_in_page as u64;
+
+    gm.read_plain::<spec::Completion>(gpa).unwrap()
+}
diff --git a/vm/devices/storage/nvme_test/src/workers.rs b/vm/devices/storage/nvme_test/src/workers.rs
new file mode 100644
index 0000000000..01ade6b3bb
--- /dev/null
+++ b/vm/devices/storage/nvme_test/src/workers.rs
@@ -0,0 +1,29 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! NVMe admin and IO queue workers.
+
+mod admin;
+mod coordinator;
+mod io;
+
+pub use admin::NsidConflict;
+pub use coordinator::NvmeFaultControllerClient;
+pub use coordinator::NvmeWorkers;
+
+use crate::PAGE_SIZE;
+use inspect::Inspect;
+
+#[derive(Debug, Copy, Clone, Inspect, Default)]
+pub struct IoQueueEntrySizes {
+    pub sqe_bits: u8,
+    pub cqe_bits: u8,
+}
+
+const MAX_DATA_TRANSFER_SIZE: usize = 256 * 1024;
+
+const _: () = assert!(
+    MAX_DATA_TRANSFER_SIZE.is_power_of_two()
+        && MAX_DATA_TRANSFER_SIZE % PAGE_SIZE == 0
+        && MAX_DATA_TRANSFER_SIZE / PAGE_SIZE > 1
+);
diff --git a/vm/devices/storage/nvme_test/src/workers/admin.rs b/vm/devices/storage/nvme_test/src/workers/admin.rs
new file mode 100644
index 0000000000..0dd33d3524
--- /dev/null
+++ b/vm/devices/storage/nvme_test/src/workers/admin.rs
@@ -0,0 +1,1144 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! Admin queue handler.
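+//!
+//! The admin handler owns the admin submission/completion queue pair and
+//! spins up an IO queue worker per created IO submission queue. When the
+//! `FaultConfiguration` carries an admin fault, every admin submission and
+//! completion is first offered to the fault hook, which can pass it through
+//! unchanged, rewrite it, or drop it entirely (see `QueueFaultBehavior`).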
+
+use super::IoQueueEntrySizes;
+use super::MAX_DATA_TRANSFER_SIZE;
+use super::io::IoHandler;
+use super::io::IoState;
+use crate::DOORBELL_STRIDE_BITS;
+use crate::FaultConfiguration;
+use crate::MAX_QES;
+use crate::NVME_VERSION;
+use crate::PAGE_MASK;
+use crate::PAGE_SIZE;
+use crate::QueueFaultBehavior;
+use crate::VENDOR_ID;
+use crate::error::CommandResult;
+use crate::error::NvmeError;
+use crate::namespace::Namespace;
+use crate::prp::PrpRange;
+use crate::queue::CompletionQueue;
+use crate::queue::DoorbellRegister;
+use crate::queue::QueueError;
+use crate::queue::ShadowDoorbell;
+use crate::queue::SubmissionQueue;
+use crate::spec;
+use disk_backend::Disk;
+use futures::FutureExt;
+use futures::SinkExt;
+use futures::StreamExt;
+use futures_concurrency::future::Race;
+use guestmem::GuestMemory;
+use guid::Guid;
+use inspect::Inspect;
+use pal_async::task::Spawn;
+use pal_async::task::Task;
+use parking_lot::Mutex;
+use std::collections::BTreeMap;
+use std::collections::btree_map;
+use std::future::pending;
+use std::io::Cursor;
+use std::io::Write;
+use std::sync::Arc;
+use task_control::AsyncRun;
+use task_control::Cancelled;
+use task_control::InspectTask;
+use task_control::StopTask;
+use task_control::TaskControl;
+use thiserror::Error;
+use vmcore::interrupt::Interrupt;
+use vmcore::vm_task::VmTaskDriver;
+use vmcore::vm_task::VmTaskDriverSource;
+use zerocopy::FromBytes;
+use zerocopy::FromZeros;
+use zerocopy::IntoBytes;
+
+const IOSQES: u8 = 6;
+const IOCQES: u8 = 4;
+const MAX_ASYNC_EVENT_REQUESTS: u8 = 4; // minimum recommended by spec
+const ERROR_LOG_PAGE_ENTRIES: u8 = 1;
+
+#[derive(Inspect)]
+pub struct AdminConfig {
+    #[inspect(skip)]
+    pub driver_source: VmTaskDriverSource,
+    #[inspect(skip)]
+    pub mem: GuestMemory,
+    #[inspect(skip)]
+    pub interrupts: Vec<Interrupt>,
+    #[inspect(skip)]
+    pub doorbells: Vec<Arc<DoorbellRegister>>,
+    #[inspect(display)]
+    pub subsystem_id: Guid,
+    pub max_sqs: u16,
+    pub max_cqs: u16,
+    pub qe_sizes: Arc<Mutex<IoQueueEntrySizes>>,
+    #[inspect(skip)]
+    pub fault_configuration: FaultConfiguration,
+}
+
+#[derive(Inspect)]
+pub struct AdminHandler {
+    driver: VmTaskDriver,
+    config: AdminConfig,
+    #[inspect(iter_by_key)]
+    namespaces: BTreeMap<u32, Arc<Namespace>>,
+}
+
+#[derive(Inspect)]
+pub struct AdminState {
+    admin_sq: SubmissionQueue,
+    admin_cq: CompletionQueue,
+    #[inspect(with = "|x| inspect::iter_by_index(x).map_key(|x| x + 1)")]
+    io_sqs: Vec<IoSq>,
+    #[inspect(with = "|x| inspect::iter_by_index(x).map_key(|x| x + 1)")]
+    io_cqs: Vec<Option<IoCq>>,
+    #[inspect(skip)]
+    sq_delete_response: mesh::Receiver<u16>,
+    #[inspect(with = "Option::is_some")]
+    shadow_db_evt_gpa_base: Option<ShadowDoorbell>,
+    #[inspect(iter_by_index)]
+    asynchronous_event_requests: Vec<u16>,
+    #[inspect(
+        rename = "namespaces",
+        with = "|x| inspect::iter_by_key(x.iter().map(|v| (v, ChangedNamespace { changed: true })))"
+    )]
+    changed_namespaces: Vec<u32>,
+    notified_changed_namespaces: bool,
+    #[inspect(skip)]
+    recv_changed_namespace: futures::channel::mpsc::Receiver<u32>,
+    #[inspect(skip)]
+    send_changed_namespace: futures::channel::mpsc::Sender<u32>,
+    #[inspect(skip)]
+    poll_namespace_change: BTreeMap<u32, Task<()>>,
+}
+
+#[derive(Inspect)]
+struct ChangedNamespace {
+    changed: bool,
+}
+
+#[derive(Inspect)]
+struct IoSq {
+    #[inspect(flatten)]
+    task: TaskControl<IoHandler, IoState>,
+    driver: VmTaskDriver,
+    pending_delete_cid: Option<u16>,
+    cqid: Option<u16>,
+    shadow_db_evt_idx: Option<ShadowDoorbell>,
+}
+
+#[derive(Inspect)]
+struct IoCq {
+    #[inspect(hex)]
+    gpa: u64,
+    #[inspect(hex)]
+    len: u16,
+    interrupt: Option<u16>,
+    sqid: Option<u16>,
+    shadow_db_evt_idx: Option<ShadowDoorbell>,
+}
+
+impl AdminState {
+    pub fn new(handler: &AdminHandler,
asq: u64, asqs: u16, acq: u64, acqs: u16) -> Self {
+        // Start polling for namespace changes. Use a bounded channel to avoid
+        // unbounded memory allocation when the queue is stuck.
+        #[expect(clippy::disallowed_methods)] // TODO
+        let (send_changed_namespace, recv_changed_namespace) = futures::channel::mpsc::channel(256);
+        let poll_namespace_change = handler
+            .namespaces
+            .iter()
+            .map(|(&nsid, namespace)| {
+                (
+                    nsid,
+                    spawn_namespace_notifier(
+                        &handler.driver,
+                        nsid,
+                        namespace.clone(),
+                        send_changed_namespace.clone(),
+                    ),
+                )
+            })
+            .collect();
+
+        let mut state = Self {
+            admin_sq: SubmissionQueue::new(handler.config.doorbells[0].clone(), asq, asqs, None),
+            admin_cq: CompletionQueue::new(
+                handler.config.doorbells[1].clone(),
+                Some(handler.config.interrupts[0].clone()),
+                acq,
+                acqs,
+                None,
+            ),
+            io_sqs: Vec::new(),
+            io_cqs: Vec::new(),
+            sq_delete_response: Default::default(),
+            shadow_db_evt_gpa_base: None,
+            asynchronous_event_requests: Vec::new(),
+            changed_namespaces: Vec::new(),
+            notified_changed_namespaces: false,
+            recv_changed_namespace,
+            send_changed_namespace,
+            poll_namespace_change,
+        };
+        state.set_max_queues(handler, handler.config.max_sqs, handler.config.max_cqs);
+        state
+    }
+
+    /// Stops all submission queues and drains them of any pending IO.
+    ///
+    /// This future may be dropped and reissued.
+    pub async fn drain(&mut self) {
+        for sq in &mut self.io_sqs {
+            sq.task.stop().await;
+            if let Some(state) = sq.task.state_mut() {
+                state.drain().await;
+                sq.task.remove();
+            }
+        }
+    }
+
+    /// Caller must ensure that no queues are active.
+    fn set_max_queues(&mut self, handler: &AdminHandler, num_sqs: u16, num_cqs: u16) {
+        let num_qids = 2 + num_sqs.max(num_cqs) * 2;
+        assert!(handler.config.doorbells.len() >= num_qids as usize);
+
+        self.io_sqs.truncate(num_sqs.into());
+        self.io_sqs
+            .extend((self.io_sqs.len()..num_sqs.into()).map(|i| {
+                // This driver doesn't explicitly do any IO (that's handled by
+                // the storage backends), so the target VP doesn't matter. But
+                // set it anyway as a hint to the backend that this queue needs
+                // its own thread.
+                let driver = handler
+                    .config
+                    .driver_source
+                    .builder()
+                    .run_on_target(false)
+                    .target_vp(0)
+                    .build("nvme");
+
+                IoSq {
+                    task: TaskControl::new(IoHandler::new(
+                        handler.config.mem.clone(),
+                        i as u16 + 1,
+                        self.sq_delete_response.sender(),
+                    )),
+                    pending_delete_cid: None,
+                    cqid: None,
+                    shadow_db_evt_idx: None,
+                    driver,
+                }
+            }));
+        self.io_cqs.resize_with(num_cqs.into(), || None);
+    }
+
+    fn add_changed_namespace(&mut self, nsid: u32) {
+        if let Err(i) = self.changed_namespaces.binary_search(&nsid) {
+            self.changed_namespaces.insert(i, nsid);
+        }
+    }
+
+    async fn add_namespace(
+        &mut self,
+        driver: &VmTaskDriver,
+        nsid: u32,
+        namespace: &Arc<Namespace>,
+    ) {
+        // Update the IO queues.
+        for sq in &mut self.io_sqs {
+            let io_running = sq.task.stop().await;
+            if let Some(io_state) = sq.task.state_mut() {
+                io_state.add_namespace(nsid, namespace.clone());
+            }
+            if io_running {
+                sq.task.start();
+            }
+        }
+
+        // Start polling.
+        let old = self.poll_namespace_change.insert(
+            nsid,
+            spawn_namespace_notifier(
+                driver,
+                nsid,
+                namespace.clone(),
+                self.send_changed_namespace.clone(),
+            ),
+        );
+        assert!(old.is_none());
+
+        // Notify the guest driver of the change.
+        self.add_changed_namespace(nsid);
+    }
+
+    async fn remove_namespace(&mut self, nsid: u32) {
+        // Update the IO queues.
+        for sq in &mut self.io_sqs {
+            let io_running = sq.task.stop().await;
+            if let Some(io_state) = sq.task.state_mut() {
+                io_state.remove_namespace(nsid);
+            }
+            if io_running {
+                sq.task.start();
+            }
+        }
+
+        // Stop polling.
+        self.poll_namespace_change
+            .remove(&nsid)
+            .unwrap()
+            .cancel()
+            .await;
+
+        // Notify the guest driver of the change.
+        self.add_changed_namespace(nsid);
+    }
+}
+
+fn spawn_namespace_notifier(
+    driver: &VmTaskDriver,
+    nsid: u32,
+    namespace: Arc<Namespace>,
+    mut send_changed_namespace: futures::channel::mpsc::Sender<u32>,
+) -> Task<()> {
+    driver.spawn("wait_resize", async move {
+        let mut counter = None;
+        loop {
+            counter = Some(namespace.wait_change(counter).await);
+            tracing::info!(nsid, "namespace changed");
+            if send_changed_namespace.send(nsid).await.is_err() {
+                break;
+            }
+        }
+    })
+}
+
+#[derive(Debug, Error)]
+#[error("invalid queue identifier {qid}")]
+struct InvalidQueueIdentifier {
+    qid: u16,
+    #[source]
+    reason: InvalidQueueIdentifierReason,
+}
+
+#[derive(Debug, Error)]
+enum InvalidQueueIdentifierReason {
+    #[error("queue id is out of bounds")]
+    Oob,
+    #[error("queue id is in use")]
+    InUse,
+    #[error("queue id is not in use")]
+    NotInUse,
+}
+
+impl From<InvalidQueueIdentifier> for NvmeError {
+    fn from(err: InvalidQueueIdentifier) -> Self {
+        Self::new(spec::Status::INVALID_QUEUE_IDENTIFIER, err)
+    }
+}
+
+enum Event {
+    Command(Result<spec::Command, QueueError>),
+    SqDeleteComplete(u16),
+    NamespaceChange(u32),
+}
+
+/// Error returned when adding a namespace with a conflicting ID.
+#[derive(Debug, Error)]
+#[error("namespace id conflict for {0}")]
+pub struct NsidConflict(u32);
+
+impl AdminHandler {
+    pub fn new(driver: VmTaskDriver, config: AdminConfig) -> Self {
+        Self {
+            driver,
+            config,
+            namespaces: Default::default(),
+        }
+    }
+
+    pub async fn add_namespace(
+        &mut self,
+        state: Option<&mut AdminState>,
+        nsid: u32,
+        disk: Disk,
+    ) -> Result<(), NsidConflict> {
+        let namespace = &*match self.namespaces.entry(nsid) {
+            btree_map::Entry::Vacant(entry) => entry.insert(Arc::new(Namespace::new(
+                self.config.mem.clone(),
+                nsid,
+                disk,
+            ))),
+            btree_map::Entry::Occupied(_) => return Err(NsidConflict(nsid)),
+        };
+
+        if let Some(state) = state {
+            state.add_namespace(&self.driver, nsid, namespace).await;
+        }
+
+        Ok(())
+    }
+
+    pub async fn remove_namespace(&mut self, state: Option<&mut AdminState>, nsid: u32) -> bool {
+        if self.namespaces.remove(&nsid).is_none() {
+            return false;
+        }
+
+        if let Some(state) = state {
+            state.remove_namespace(nsid).await;
+        }
+
+        true
+    }
+
+    async fn next_event(&mut self, state: &mut AdminState) -> Result<Event, QueueError> {
+        let event = loop {
+            // Wait for there to be room for a completion for the next
+            // command or the completed sq deletion.
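+            // (If the guest stops consuming completions, the handler applies
+            // backpressure here rather than dropping them.)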
+            state.admin_cq.wait_ready(&self.config.mem).await?;
+
+            if !state.changed_namespaces.is_empty() && !state.notified_changed_namespaces {
+                if let Some(cid) = state.asynchronous_event_requests.pop() {
+                    state.admin_cq.write(
+                        &self.config.mem,
+                        spec::Completion {
+                            dw0: spec::AsynchronousEventRequestDw0::new()
+                                .with_event_type(spec::AsynchronousEventType::NOTICE.0)
+                                .with_log_page_identifier(spec::LogPageIdentifier::CHANGED_NAMESPACE_LIST.0)
+                                .with_information(spec::AsynchronousEventInformationNotice::NAMESPACE_ATTRIBUTE_CHANGED.0)
+                                .into(),
+                            dw1: 0,
+                            sqhd: state.admin_sq.sqhd(),
+                            sqid: 0,
+                            cid,
+                            status: spec::CompletionStatus::new(),
+                        },
+                    )?;
+
+                    state.notified_changed_namespaces = true;
+                    continue;
+                }
+            }
+
+            let next_command = state.admin_sq.next(&self.config.mem).map(Event::Command);
+            let sq_delete_complete = async {
+                let Some(sqid) = state.sq_delete_response.next().await else {
+                    pending().await
+                };
+                Event::SqDeleteComplete(sqid)
+            };
+            let changed_namespace = async {
+                let Some(nsid) = state.recv_changed_namespace.next().await else {
+                    pending().await
+                };
+                Event::NamespaceChange(nsid)
+            };
+
+            break (next_command, sq_delete_complete, changed_namespace)
+                .race()
+                .await;
+        };
+        Ok(event)
+    }
+
+    async fn process_event(
+        &mut self,
+        state: &mut AdminState,
+        event: Result<Event, QueueError>,
+    ) -> Result<(), QueueError> {
+        // For the admin queue, update EVT_IDX at the beginning of command
+        // processing, just to keep it simple.
+        state.admin_sq.advance_evt_idx(&self.config.mem)?;
+
+        let (cid, result) = match event? {
+            Event::Command(command) => {
+                let mut command = command?;
+                let opcode = spec::AdminOpcode(command.cdw0.opcode());
+
+                if let Some(admin_fault) = &self.config.fault_configuration.admin_fault {
+                    let fault = admin_fault.fault_submission_queue(command).await;
+
+                    match fault {
+                        QueueFaultBehavior::Update(command_updated) => {
+                            tracing::warn!(
+                                "configured fault: admin command updated in sq. 
original: {:?},\n new: {:?}", + &command, + &command_updated + ); + command = command_updated; + } + QueueFaultBehavior::Drop => { + tracing::warn!( + "configured fault: admin command dropped from sq {:?}", + &command + ); + return Ok(()); + } + QueueFaultBehavior::Default => {} + } + } + + let result = match opcode { + spec::AdminOpcode::IDENTIFY => self + .handle_identify(&command) + .map(|()| Some(Default::default())), + spec::AdminOpcode::GET_FEATURES => { + self.handle_get_features(state, &command).await.map(Some) + } + spec::AdminOpcode::SET_FEATURES => { + self.handle_set_features(state, &command).map(Some) + } + spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE => self + .handle_create_io_completion_queue(state, &command) + .map(|()| Some(Default::default())), + spec::AdminOpcode::CREATE_IO_SUBMISSION_QUEUE => self + .handle_create_io_submission_queue(state, &command) + .map(|()| Some(Default::default())), + spec::AdminOpcode::DELETE_IO_COMPLETION_QUEUE => self + .handle_delete_io_completion_queue(state, &command) + .map(|()| Some(Default::default())), + spec::AdminOpcode::DELETE_IO_SUBMISSION_QUEUE => { + self.handle_delete_io_submission_queue(state, &command) + } + spec::AdminOpcode::ASYNCHRONOUS_EVENT_REQUEST => { + self.handle_asynchronous_event_request(state, &command) + } + spec::AdminOpcode::ABORT => self.handle_abort(), + spec::AdminOpcode::GET_LOG_PAGE => self + .handle_get_log_page(state, &command) + .map(|()| Some(Default::default())), + spec::AdminOpcode::DOORBELL_BUFFER_CONFIG => self + .handle_doorbell_buffer_config(state, &command) + .map(|()| Some(Default::default())), + opcode => { + tracelimit::warn_ratelimited!(?opcode, "unsupported opcode"); + Err(spec::Status::INVALID_COMMAND_OPCODE.into()) + } + }; + + let result = match result { + Ok(Some(cr)) => cr, + Ok(None) => return Ok(()), + Err(err) => { + tracelimit::warn_ratelimited!( + error = &err as &dyn std::error::Error, + cid = command.cdw0.cid(), + ?opcode, + "command error" + ); + err.into() + } + }; + + (command.cdw0.cid(), result) + } + Event::SqDeleteComplete(sqid) => { + let sq = &mut state.io_sqs[sqid as usize - 1]; + let cid = sq.pending_delete_cid.take().unwrap(); + let cqid = sq.cqid.take().unwrap(); + sq.task.stop().await; + sq.task.remove(); + assert_eq!( + state.io_cqs[cqid as usize - 1] + .as_mut() + .unwrap() + .sqid + .take(), + Some(sqid) + ); + (cid, Default::default()) + } + Event::NamespaceChange(nsid) => { + state.add_changed_namespace(nsid); + return Ok(()); + } + }; + + let status = spec::CompletionStatus::new().with_status(result.status.0); + + let mut completion = spec::Completion { + dw0: result.dw[0], + dw1: result.dw[1], + sqid: 0, + sqhd: state.admin_sq.sqhd(), + status, + cid, + }; + + if let Some(admin_fault) = &self.config.fault_configuration.admin_fault { + let fault = admin_fault.fault_completion_queue(completion.clone()).await; + + match fault { + QueueFaultBehavior::Update(completion_new) => { + tracelimit::warn_ratelimited!( + "configured fault: admin completion updated in cq. original: {:?},\n new: {:?}", + &completion, + &completion_new + ); + completion = completion_new; + } + QueueFaultBehavior::Drop => { + tracelimit::warn_ratelimited!( + "configured fault: admin completion dropped from cq {:?}", + &completion + ); + return Ok(()); + } + QueueFaultBehavior::Default => {} + } + } + + state.admin_cq.write(&self.config.mem, completion)?; + // Again, for simplicity, update EVT_IDX here. 
+        state.admin_cq.catch_up_evt_idx(true, 0, &self.config.mem)?;
+        Ok(())
+    }
+
+    fn handle_identify(&mut self, command: &spec::Command) -> Result<(), NvmeError> {
+        let cdw10: spec::Cdw10Identify = command.cdw10.into();
+        // All identify results are 4096 bytes.
+        let mut buf = [0u64; 512];
+        let buf = buf.as_mut_bytes();
+        match spec::Cns(cdw10.cns()) {
+            spec::Cns::CONTROLLER => {
+                let id = spec::IdentifyController::mut_from_prefix(buf).unwrap().0; // TODO: zerocopy: from-prefix (mut_from_prefix): use-rest-of-range (https://github.com/microsoft/openvmm/issues/759)
+                *id = self.identify_controller();
+
+                write!(
+                    Cursor::new(&mut id.subnqn[..]),
+                    "nqn.2014-08.org.nvmexpress:uuid:{}",
+                    self.config.subsystem_id
+                )
+                .unwrap();
+            }
+            spec::Cns::ACTIVE_NAMESPACES => {
+                if command.nsid >= 0xfffffffe {
+                    return Err(spec::Status::INVALID_NAMESPACE_OR_FORMAT.into());
+                }
+                let nsids = <[u32]>::mut_from_bytes(buf).unwrap();
+                for (ns, nsid) in self
+                    .namespaces
+                    .keys()
+                    .filter(|&ns| *ns > command.nsid)
+                    .zip(nsids)
+                {
+                    *nsid = *ns;
+                }
+            }
+            spec::Cns::NAMESPACE => {
+                if let Some(ns) = self.namespaces.get(&command.nsid) {
+                    ns.identify(buf);
+                } else {
+                    tracelimit::warn_ratelimited!(nsid = command.nsid, "unknown namespace id");
+                }
+            }
+            spec::Cns::DESCRIPTOR_NAMESPACE => {
+                if let Some(ns) = self.namespaces.get(&command.nsid) {
+                    ns.namespace_id_descriptor(buf);
+                } else {
+                    tracelimit::warn_ratelimited!(nsid = command.nsid, "unknown namespace id");
+                }
+            }
+            cns => {
+                tracelimit::warn_ratelimited!(?cns, "unsupported cns");
+                return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into());
+            }
+        };
+        PrpRange::parse(&self.config.mem, buf.len(), command.dptr)?.write(&self.config.mem, buf)?;
+        Ok(())
+    }
+
+    fn identify_controller(&self) -> spec::IdentifyController {
+        let oacs = spec::OptionalAdminCommandSupport::from(0).with_doorbell_buffer_config(true);
+        spec::IdentifyController {
+            vid: VENDOR_ID,
+            ssvid: VENDOR_ID,
+            mdts: (MAX_DATA_TRANSFER_SIZE / PAGE_SIZE).trailing_zeros() as u8,
+            ver: NVME_VERSION,
+            rtd3r: 400000,
+            rtd3e: 400000,
+            sqes: spec::QueueEntrySize::new()
+                .with_min(IOSQES)
+                .with_max(IOSQES),
+            cqes: spec::QueueEntrySize::new()
+                .with_min(IOCQES)
+                .with_max(IOCQES),
+            frmw: spec::FirmwareUpdates::new().with_ffsro(true).with_nofs(1),
+            nn: self.namespaces.keys().copied().max().unwrap_or(0),
+            ieee: [0x74, 0xe2, 0x8c], // Microsoft
+            fr: (*b"v1.00000").into(),
+            mn: (*b"MSFT NVMe Accelerator v1.0 ").into(),
+            sn: (*b"SN: 000001 ").into(),
+            aerl: MAX_ASYNC_EVENT_REQUESTS - 1,
+            elpe: ERROR_LOG_PAGE_ENTRIES - 1,
+            oaes: spec::Oaes::new().with_namespace_attribute(true),
+            oncs: spec::Oncs::new()
+                .with_dataset_management(true)
+                // Namespaces still have to opt in individually via `rescap`.
+                .with_reservations(true),
+            vwc: spec::VolatileWriteCache::new()
+                .with_present(true)
+                .with_broadcast_flush_behavior(spec::BroadcastFlushBehavior::NOT_SUPPORTED.0),
+            cntrltype: spec::ControllerType::IO_CONTROLLER,
+            oacs,
+            ..FromZeros::new_zeroed()
+        }
+    }
+
+    fn handle_set_features(
+        &mut self,
+        state: &mut AdminState,
+        command: &spec::Command,
+    ) -> Result<CommandResult, NvmeError> {
+        let cdw10: spec::Cdw10SetFeatures = command.cdw10.into();
+        let mut dw = [0; 2];
+        // Note that we don't support non-zero cdw10.save, since ONCS.save == 0.
+        match spec::Feature(cdw10.fid()) {
+            spec::Feature::NUMBER_OF_QUEUES => {
+                if state.io_sqs.iter().any(|sq| sq.task.has_state())
+                    || state.io_cqs.iter().any(|cq| cq.is_some())
+                {
+                    return Err(spec::Status::COMMAND_SEQUENCE_ERROR.into());
+                }
+                let cdw11: spec::Cdw11FeatureNumberOfQueues = command.cdw11.into();
+                if cdw11.ncq_z() == u16::MAX || cdw11.nsq_z() == u16::MAX {
+                    return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into());
+                }
+                let num_sqs = (cdw11.nsq_z() + 1).min(self.config.max_sqs);
+                let num_cqs = (cdw11.ncq_z() + 1).min(self.config.max_cqs);
+                state.set_max_queues(self, num_sqs, num_cqs);
+
+                dw[0] = spec::Cdw11FeatureNumberOfQueues::new()
+                    .with_ncq_z(num_cqs - 1)
+                    .with_nsq_z(num_sqs - 1)
+                    .into();
+            }
+            spec::Feature::VOLATILE_WRITE_CACHE => {
+                let cdw11 = spec::Cdw11FeatureVolatileWriteCache::from(command.cdw11);
+                if !cdw11.wce() {
+                    tracelimit::warn_ratelimited!(
+                        "ignoring unsupported attempt to disable write cache"
+                    );
+                }
+            }
+            feature => {
+                tracelimit::warn_ratelimited!(?feature, "unsupported feature");
+                return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into());
+            }
+        }
+        Ok(CommandResult::new(spec::Status::SUCCESS, dw))
+    }
+
+    async fn handle_get_features(
+        &mut self,
+        state: &mut AdminState,
+        command: &spec::Command,
+    ) -> Result<CommandResult, NvmeError> {
+        let cdw10: spec::Cdw10GetFeatures = command.cdw10.into();
+        let mut dw = [0; 2];
+
+        // Note that we don't support non-zero cdw10.sel, since ONCS.save == 0.
+        match spec::Feature(cdw10.fid()) {
+            spec::Feature::NUMBER_OF_QUEUES => {
+                let num_cqs = state.io_cqs.len();
+                let num_sqs = state.io_sqs.len();
+                dw[0] = spec::Cdw11FeatureNumberOfQueues::new()
+                    .with_ncq_z((num_cqs - 1) as u16)
+                    .with_nsq_z((num_sqs - 1) as u16)
+                    .into();
+            }
+            spec::Feature::VOLATILE_WRITE_CACHE => {
+                // Write cache is always enabled.
+                dw[0] = spec::Cdw11FeatureVolatileWriteCache::new()
+                    .with_wce(true)
+                    .into();
+            }
+            spec::Feature::NVM_RESERVATION_PERSISTENCE => {
+                let namespace = self
+                    .namespaces
+                    .get(&command.nsid)
+                    .ok_or(spec::Status::INVALID_NAMESPACE_OR_FORMAT)?;
+
+                return namespace.get_feature(command).await;
+            }
+            feature => {
+                tracelimit::warn_ratelimited!(?feature, "unsupported feature");
+                return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into());
+            }
+        }
+        Ok(CommandResult::new(spec::Status::SUCCESS, dw))
+    }
+
+    fn handle_create_io_completion_queue(
+        &mut self,
+        state: &mut AdminState,
+        command: &spec::Command,
+    ) -> Result<(), NvmeError> {
+        let cdw10: spec::Cdw10CreateIoQueue = command.cdw10.into();
+        let cdw11: spec::Cdw11CreateIoCompletionQueue = command.cdw11.into();
+        if !cdw11.pc() {
+            return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into());
+        }
+        let cqid = cdw10.qid();
+        let io_queue = state
+            .io_cqs
+            .get_mut((cqid as usize).wrapping_sub(1))
+            .ok_or(InvalidQueueIdentifier {
+                qid: cqid,
+                reason: InvalidQueueIdentifierReason::Oob,
+            })?;
+
+        if io_queue.is_some() {
+            return Err(InvalidQueueIdentifier {
+                qid: cqid,
+                reason: InvalidQueueIdentifierReason::InUse,
+            }
+            .into());
+        }
+
+        let interrupt = if cdw11.ien() {
+            let iv = cdw11.iv();
+            if iv as usize >= self.config.interrupts.len() {
+                return Err(spec::Status::INVALID_INTERRUPT_VECTOR.into());
+            };
+            Some(iv)
+        } else {
+            None
+        };
+        let gpa = command.dptr[0] & PAGE_MASK;
+        let len0 = cdw10.qsize_z();
+        if len0 == 0 || len0 >= MAX_QES || self.config.qe_sizes.lock().cqe_bits != IOCQES {
+            return Err(spec::Status::INVALID_QUEUE_SIZE.into());
+        }
+
+        let mut shadow_db_evt_idx: Option<ShadowDoorbell> = None;
+        if let Some(shadow_db_evt_gpa_base) = state.shadow_db_evt_gpa_base {
+            shadow_db_evt_idx = Some(ShadowDoorbell::new(
+                shadow_db_evt_gpa_base,
+                cqid,
+                false,
+                DOORBELL_STRIDE_BITS.into(),
+            ));
+        }
+
+        *io_queue = Some(IoCq {
+            gpa,
+            len: len0 + 1,
+            interrupt,
+            sqid: None,
+            shadow_db_evt_idx,
+        });
+        Ok(())
+    }
+
+    fn handle_create_io_submission_queue(
+        &mut self,
+        state: &mut AdminState,
+        command: &spec::Command,
+    ) -> Result<(), NvmeError> {
+        let cdw10: spec::Cdw10CreateIoQueue = command.cdw10.into();
+        let cdw11: spec::Cdw11CreateIoSubmissionQueue = command.cdw11.into();
+        if !cdw11.pc() {
+            return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into());
+        }
+        let sqid = cdw10.qid();
+        let sq = state
+            .io_sqs
+            .get_mut((sqid as usize).wrapping_sub(1))
+            .ok_or(InvalidQueueIdentifier {
+                qid: sqid,
+                reason: InvalidQueueIdentifierReason::Oob,
+            })?;
+
+        if sq.task.has_state() {
+            return Err(InvalidQueueIdentifier {
+                qid: sqid,
+                reason: InvalidQueueIdentifierReason::InUse,
+            }
+            .into());
+        }
+
+        let cqid = cdw11.cqid();
+        let cq = state
+            .io_cqs
+            .get_mut((cqid as usize).wrapping_sub(1))
+            .and_then(|x| x.as_mut())
+            .ok_or(spec::Status::COMPLETION_QUEUE_INVALID)?;
+
+        // Don't allow sharing completion queues. This isn't spec compliant
+        // but it simplifies the device significantly and OSes don't seem to
+        // mind. This could be fixed by having a slower path when completion
+        // queues are shared.
+        if cq.sqid.is_some() {
+            return Err(spec::Status::COMPLETION_QUEUE_INVALID.into());
+        }
+
+        let sq_gpa = command.dptr[0] & PAGE_MASK;
+        let len0 = cdw10.qsize_z();
+        if len0 == 0 || len0 >= MAX_QES || self.config.qe_sizes.lock().sqe_bits != IOSQES {
+            return Err(spec::Status::INVALID_QUEUE_SIZE.into());
+        }
+
+        if let Some(shadow_db_evt_gpa_base) = state.shadow_db_evt_gpa_base {
+            sq.shadow_db_evt_idx = Some(ShadowDoorbell::new(
+                shadow_db_evt_gpa_base,
+                sqid,
+                true,
+                DOORBELL_STRIDE_BITS.into(),
+            ));
+        }
+
+        cq.sqid = Some(sqid);
+        sq.cqid = Some(cqid);
+        let sq_tail = self.config.doorbells[sqid as usize * 2].clone();
+        let cq_head = self.config.doorbells[cqid as usize * 2 + 1].clone();
+        let interrupt = cq
+            .interrupt
+            .map(|iv| self.config.interrupts[iv as usize].clone());
+        let namespaces = self.namespaces.clone();
+        let sq_len = len0 + 1;
+        let cq_gpa = cq.gpa;
+        let cq_len = cq.len;
+        let state = IoState::new(
+            sq_gpa,
+            sq_len,
+            sq_tail,
+            sq.shadow_db_evt_idx,
+            cq_gpa,
+            cq_len,
+            cq_head,
+            cq.shadow_db_evt_idx,
+            interrupt,
+            namespaces,
+        );
+        sq.task.insert(&sq.driver, "nvme-io", state);
+        sq.task.start();
+        Ok(())
+    }
+
+    fn handle_delete_io_submission_queue(
+        &self,
+        state: &mut AdminState,
+        command: &spec::Command,
+    ) -> Result<Option<CommandResult>, NvmeError> {
+        let cdw10: spec::Cdw10DeleteIoQueue = command.cdw10.into();
+        let sqid = cdw10.qid();
+        let sq = state
+            .io_sqs
+            .get_mut((sqid as usize).wrapping_sub(1))
+            .ok_or(InvalidQueueIdentifier {
+                qid: sqid,
+                reason: InvalidQueueIdentifierReason::Oob,
+            })?;
+
+        if !sq.task.has_state() || sq.pending_delete_cid.is_some() {
+            return Err(InvalidQueueIdentifier {
+                qid: sqid,
+                reason: InvalidQueueIdentifierReason::NotInUse,
+            }
+            .into());
+        }
+
+        sq.task
+            .update_with(|sq, sq_state| sq.delete(sq_state.unwrap()));
+        sq.pending_delete_cid = Some(command.cdw0.cid());
+        Ok(None)
+    }
+
+    fn handle_delete_io_completion_queue(
+        &self,
+        state: &mut AdminState,
+        command: &spec::Command,
+    ) -> Result<(), NvmeError> {
+        let cdw10: spec::Cdw10DeleteIoQueue = command.cdw10.into();
+        let cqid = cdw10.qid();
+        let cq = state
+            .io_cqs
+            .get_mut((cqid as usize).wrapping_sub(1))
+            .ok_or(InvalidQueueIdentifier {
+                qid: cqid,
+                reason: InvalidQueueIdentifierReason::Oob,
+            })?;
+
+        let active_cq = cq.as_ref().ok_or(InvalidQueueIdentifier {
+            qid: cqid,
+            reason: InvalidQueueIdentifierReason::NotInUse,
+        })?;
+        if active_cq.sqid.is_some() {
+            return Err(spec::Status::INVALID_QUEUE_DELETION.into());
+        }
+
+        *cq = None;
+        Ok(())
+    }
+
+    fn handle_asynchronous_event_request(
+        &self,
+        state: &mut AdminState,
+        command: &spec::Command,
+    ) -> Result<Option<CommandResult>, NvmeError> {
+        if state.asynchronous_event_requests.len() >= MAX_ASYNC_EVENT_REQUESTS as usize {
+            return Err(spec::Status::ASYNCHRONOUS_EVENT_REQUEST_LIMIT_EXCEEDED.into());
+        }
+        state.asynchronous_event_requests.push(command.cdw0.cid());
+        Ok(None)
+    }
+
+    /// Abort is a required command, but a legal implementation is to just
+    /// complete it with a status that means "I'm sorry, that command couldn't
+    /// be aborted."
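+    /// (Completion dword 0 bit 0 set to 1 reports that the command was not
+    /// aborted, which is why `dw` is `[1, 0]` below.)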
+    fn handle_abort(&self) -> Result<Option<CommandResult>, NvmeError> {
+        Ok(Some(CommandResult {
+            status: spec::Status::SUCCESS,
+            dw: [1, 0],
+        }))
+    }
+
+    fn handle_get_log_page(
+        &self,
+        state: &mut AdminState,
+        command: &spec::Command,
+    ) -> Result<(), NvmeError> {
+        let cdw10 = spec::Cdw10GetLogPage::from(command.cdw10);
+        let cdw11 = spec::Cdw11GetLogPage::from(command.cdw11);
+        let numd =
+            ((cdw10.numdl_z() as u32) | ((cdw11.numdu() as u32) << 16)).saturating_add(1) as usize;
+        let len = numd * 4;
+        let prp = PrpRange::parse(&self.config.mem, len, command.dptr)?;
+
+        match spec::LogPageIdentifier(cdw10.lid()) {
+            spec::LogPageIdentifier::ERROR_INFORMATION => {
+                // Write empty log entries.
+                prp.zero(
+                    &self.config.mem,
+                    len.min(ERROR_LOG_PAGE_ENTRIES as usize * 64),
+                )?;
+            }
+            spec::LogPageIdentifier::HEALTH_INFORMATION => {
+                if command.nsid != !0 {
+                    return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into());
+                }
+                // Write an empty page.
+                prp.zero(&self.config.mem, len.min(512))?;
+            }
+            spec::LogPageIdentifier::FIRMWARE_SLOT_INFORMATION => {
+                // Write an empty page.
+                prp.zero(&self.config.mem, len.min(512))?;
+            }
+            spec::LogPageIdentifier::CHANGED_NAMESPACE_LIST => {
+                // Zero the whole list.
+                prp.zero(&self.config.mem, len.min(4096))?;
+                // Now write in the changed namespaces.
+                if state.changed_namespaces.len() > 1024 {
+                    // Too many to fit, write !0 so the driver scans everything.
+                    prp.write(&self.config.mem, (!0u32).as_bytes())?;
+                } else {
+                    let count = state.changed_namespaces.len().min(numd);
+                    prp.write(
+                        &self.config.mem,
+                        state.changed_namespaces[..count].as_bytes(),
+                    )?;
+                }
+                state.changed_namespaces.clear();
+                if !cdw10.rae() {
+                    state.notified_changed_namespaces = false;
+                }
+            }
+            lid => {
+                tracelimit::warn_ratelimited!(?lid, "unsupported log page");
+                return Err(spec::Status::INVALID_LOG_PAGE.into());
+            }
+        }
+
+        Ok(())
+    }
+
+    fn handle_doorbell_buffer_config(
+        &self,
+        state: &mut AdminState,
+        command: &spec::Command,
+    ) -> Result<(), NvmeError> {
+        let shadow_db_gpa = command.dptr[0];
+        let event_idx_gpa = command.dptr[1];
+
+        if (shadow_db_gpa == 0)
+            || (shadow_db_gpa & 0xfff != 0)
+            || (event_idx_gpa == 0)
+            || (event_idx_gpa & 0xfff != 0)
+            || (shadow_db_gpa == event_idx_gpa)
+        {
+            return Err(spec::Status::INVALID_FIELD_IN_COMMAND.into());
+        }
+
+        // Stash the base values for use in data queue creation.
+        let sdb_base = ShadowDoorbell {
+            shadow_db_gpa,
+            event_idx_gpa,
+        };
+        state.shadow_db_evt_gpa_base = Some(sdb_base);
+
+        // Update the admin queue to use shadow doorbells.
+        state.admin_sq.update_shadow_db(
+            &self.config.mem,
+            ShadowDoorbell::new(sdb_base, 0, true, DOORBELL_STRIDE_BITS.into()),
+        );
+        state.admin_cq.update_shadow_db(
+            &self.config.mem,
+            ShadowDoorbell::new(sdb_base, 0, false, DOORBELL_STRIDE_BITS.into()),
+        );
+
+        // Update any data queues with the new shadow doorbell base.
+        for (qid, sq) in state.io_sqs.iter_mut().enumerate() {
+            if !sq.task.has_state() {
+                continue;
+            }
+            let gm = self.config.mem.clone();
+
+            // Data queue pairs are qid + 1, because the admin queue isn't in this vector.
+            let sq_sdb =
+                ShadowDoorbell::new(sdb_base, qid as u16 + 1, true, DOORBELL_STRIDE_BITS.into());
+            let cq_sdb =
+                ShadowDoorbell::new(sdb_base, qid as u16 + 1, false, DOORBELL_STRIDE_BITS.into());
+
+            sq.task.update_with(move |sq, sq_state| {
+                sq.update_shadow_db(&gm, sq_state.unwrap(), sq_sdb, cq_sdb);
+            });
+        }
+        Ok(())
+    }
+}
+
+impl AsyncRun<AdminState> for AdminHandler {
+    async fn run(
+        &mut self,
+        stop: &mut StopTask<'_>,
+        state: &mut AdminState,
+    ) -> Result<(), Cancelled> {
+        loop {
+            let event = stop.until_stopped(self.next_event(state)).await?;
+            if let Err(err) = self.process_event(state, event).await {
+                tracing::error!(
+                    error = &err as &dyn std::error::Error,
+                    "admin queue failure"
+                );
+                break;
+            }
+        }
+        Ok(())
+    }
+}
+
+impl InspectTask<AdminState> for AdminHandler {
+    fn inspect(&self, req: inspect::Request<'_>, state: Option<&AdminState>) {
+        req.respond().merge(self).merge(state);
+    }
+}
diff --git a/vm/devices/storage/nvme_test/src/workers/coordinator.rs b/vm/devices/storage/nvme_test/src/workers/coordinator.rs
new file mode 100644
index 0000000000..db44e5631c
--- /dev/null
+++ b/vm/devices/storage/nvme_test/src/workers/coordinator.rs
@@ -0,0 +1,318 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! Coordinator between queues and hot add/remove of namespaces.
+
+use super::IoQueueEntrySizes;
+use super::admin::AdminConfig;
+use super::admin::AdminHandler;
+use super::admin::AdminState;
+use super::admin::NsidConflict;
+use crate::FaultConfiguration;
+use crate::queue::DoorbellRegister;
+use disk_backend::Disk;
+use futures::FutureExt;
+use futures::StreamExt;
+use futures_concurrency::future::Race;
+use guestmem::GuestMemory;
+use guid::Guid;
+use inspect::Inspect;
+use inspect::InspectMut;
+use mesh::rpc::PendingRpc;
+use mesh::rpc::Rpc;
+use mesh::rpc::RpcSend;
+use pal_async::task::Spawn;
+use pal_async::task::Task;
+use parking_lot::Mutex;
+use std::future::pending;
+use std::sync::Arc;
+use task_control::TaskControl;
+use vmcore::interrupt::Interrupt;
+use vmcore::vm_task::VmTaskDriver;
+use vmcore::vm_task::VmTaskDriverSource;
+
+pub struct NvmeWorkers {
+    _task: Task<()>,
+    send: mesh::Sender<CoordinatorRequest>,
+    doorbells: Vec<Arc<DoorbellRegister>>,
+    state: EnableState,
+}
+
+#[derive(Debug)]
+enum EnableState {
+    Disabled,
+    Enabling(PendingRpc<()>),
+    Enabled,
+    Resetting(PendingRpc<()>),
+}
+
+impl InspectMut for NvmeWorkers {
+    fn inspect_mut(&mut self, req: inspect::Request<'_>) {
+        self.send.send(CoordinatorRequest::Inspect(req.defer()));
+    }
+}
+
+impl NvmeWorkers {
+    pub fn new(
+        driver_source: &VmTaskDriverSource,
+        mem: GuestMemory,
+        interrupts: Vec<Interrupt>,
+        max_sqs: u16,
+        max_cqs: u16,
+        qe_sizes: Arc<Mutex<IoQueueEntrySizes>>,
+        subsystem_id: Guid,
+        fault_configuration: FaultConfiguration,
+    ) -> Self {
+        let num_qids = 2 + max_sqs.max(max_cqs) * 2;
+        let doorbells: Vec<_> = (0..num_qids)
+            .map(|_| Arc::new(DoorbellRegister::new()))
+            .collect();
+
+        let driver = driver_source.simple();
+        let handler: AdminHandler = AdminHandler::new(
+            driver.clone(),
+            AdminConfig {
+                driver_source: driver_source.clone(),
+                mem,
+                interrupts,
+                doorbells: doorbells.clone(),
+                subsystem_id,
+                max_sqs,
+                max_cqs,
+                qe_sizes,
+                fault_configuration,
+            },
+        );
+        let coordinator = Coordinator {
+            driver: driver.clone(),
+            admin: TaskControl::new(handler),
+            reset: None,
+        };
+        let (send, recv) = mesh::mpsc_channel();
+        let task = driver.spawn("nvme-coord", coordinator.run(recv));
+        Self {
+            _task: task,
+            send,
+            doorbells,
+            state: EnableState::Disabled,
+        }
+    }
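+
+    // All runtime mutations below are sent over the mesh channel and handled
+    // one at a time by the coordinator task's event loop.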
+    pub fn client(&self) -> NvmeFaultControllerClient {
+        NvmeFaultControllerClient {
+            send: self.send.clone(),
+        }
+    }
+
+    pub fn doorbell(&self, index: u16, value: u32) {
+        if let Some(doorbell) = self.doorbells.get(index as usize) {
+            doorbell.write(value);
+        } else {
+            tracelimit::warn_ratelimited!(index, value, "unknown doorbell");
+        }
+    }
+
+    pub fn enable(&mut self, asq: u64, asqs: u16, acq: u64, acqs: u16) {
+        if let EnableState::Disabled = self.state {
+            self.state = EnableState::Enabling(self.send.call(
+                CoordinatorRequest::EnableAdmin,
+                EnableAdminParams {
+                    asq,
+                    asqs,
+                    acq,
+                    acqs,
+                },
+            ));
+        } else {
+            panic!("not disabled: {:?}", self.state);
+        }
+    }
+
+    pub fn poll_enabled(&mut self) -> bool {
+        if let EnableState::Enabling(recv) = &mut self.state {
+            if recv.now_or_never().is_some() {
+                self.state = EnableState::Enabled;
+                true
+            } else {
+                false
+            }
+        } else {
+            panic!("not enabling: {:?}", self.state)
+        }
+    }
+
+    pub fn controller_reset(&mut self) {
+        if let EnableState::Enabled = self.state {
+            self.state =
+                EnableState::Resetting(self.send.call(CoordinatorRequest::ControllerReset, ()));
+        } else {
+            panic!("not enabled: {:?}", self.state);
+        }
+    }
+
+    pub fn poll_controller_reset(&mut self) -> bool {
+        if let EnableState::Resetting(recv) = &mut self.state {
+            if recv.now_or_never().is_some() {
+                self.state = EnableState::Disabled;
+                true
+            } else {
+                false
+            }
+        } else {
+            panic!("not resetting: {:?}", self.state)
+        }
+    }
+
+    // Reset the workers from whatever state they are in.
+    pub async fn reset(&mut self) {
+        loop {
+            match &mut self.state {
+                EnableState::Disabled => break,
+                EnableState::Enabling(recv) => {
+                    recv.await.unwrap();
+                    self.state = EnableState::Enabled;
+                }
+                EnableState::Enabled => {
+                    self.controller_reset();
+                }
+                EnableState::Resetting(recv) => {
+                    recv.await.unwrap();
+                    self.state = EnableState::Disabled;
+                }
+            }
+        }
+    }
+}
+
+/// Client for modifying the NVMe controller state at runtime.
+#[derive(Debug)]
+pub struct NvmeFaultControllerClient {
+    send: mesh::Sender<CoordinatorRequest>,
+}
+
+impl NvmeFaultControllerClient {
+    /// Adds a namespace.
+    pub async fn add_namespace(&self, nsid: u32, disk: Disk) -> Result<(), NsidConflict> {
+        self.send
+            .call(CoordinatorRequest::AddNamespace, (nsid, disk))
+            .await
+            .unwrap()
+    }
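+
+    // Illustrative use (hypothetical nsid and disk):
+    //
+    //     client.add_namespace(1, disk).await?;
+    //     assert!(client.remove_namespace(1).await);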
+
+    /// Removes a namespace.
+    pub async fn remove_namespace(&self, nsid: u32) -> bool {
+        self.send
+            .call(CoordinatorRequest::RemoveNamespace, nsid)
+            .await
+            .unwrap()
+    }
+}
+
+#[derive(Inspect)]
+struct Coordinator {
+    driver: VmTaskDriver,
+    #[inspect(flatten)]
+    admin: TaskControl<AdminHandler, AdminState>,
+    #[inspect(with = "Option::is_some")]
+    reset: Option<Rpc<(), ()>>,
+}
+
+enum CoordinatorRequest {
+    EnableAdmin(Rpc<EnableAdminParams, ()>),
+    AddNamespace(Rpc<(u32, Disk), Result<(), NsidConflict>>),
+    RemoveNamespace(Rpc<u32, bool>),
+    Inspect(inspect::Deferred),
+    ControllerReset(Rpc<(), ()>),
+}
+
+struct EnableAdminParams {
+    asq: u64,
+    asqs: u16,
+    acq: u64,
+    acqs: u16,
+}
+
+impl Coordinator {
+    async fn run(mut self, mut recv: mesh::Receiver<CoordinatorRequest>) {
+        loop {
+            enum Event {
+                Request(Option<CoordinatorRequest>),
+                ResetComplete,
+            }
+
+            let controller_reset = async {
+                if self.reset.is_some() {
+                    self.admin.stop().await;
+                    if let Some(state) = self.admin.state_mut() {
+                        state.drain().await;
+                        self.admin.remove();
+                    }
+                } else {
+                    pending().await
+                }
+            };
+
+            let event = (
+                recv.next().map(Event::Request),
+                controller_reset.map(|_| Event::ResetComplete),
+            )
+                .race()
+                .await;
+
+            match event {
+                Event::Request(Some(req)) => match req {
+                    CoordinatorRequest::EnableAdmin(rpc) => rpc.handle_sync(
+                        |EnableAdminParams {
+                             asq,
+                             asqs,
+                             acq,
+                             acqs,
+                         }| {
+                            if !self.admin.has_state() {
+                                let state =
+                                    AdminState::new(self.admin.task(), asq, asqs, acq, acqs);
+                                self.admin.insert(&self.driver, "nvme-admin", state);
+                                self.admin.start();
+                            } else {
+                                tracelimit::warn_ratelimited!("duplicate attempt to enable admin");
+                            }
+                        },
+                    ),
+                    CoordinatorRequest::AddNamespace(rpc) => {
+                        rpc.handle(async |(nsid, disk)| {
+                            let running = self.admin.stop().await;
+                            let (admin, state) = self.admin.get_mut();
+                            let r = admin.add_namespace(state, nsid, disk).await;
+                            if running {
+                                self.admin.start();
+                            }
+                            r
+                        })
+                        .await
+                    }
+                    CoordinatorRequest::RemoveNamespace(rpc) => {
+                        rpc.handle(async |nsid| {
+                            let running = self.admin.stop().await;
+                            let (admin, state) = self.admin.get_mut();
+                            let r = admin.remove_namespace(state, nsid).await;
+                            if running {
+                                self.admin.start();
+                            }
+                            r
+                        })
+                        .await
+                    }
+                    CoordinatorRequest::ControllerReset(rpc) => {
+                        assert!(self.reset.is_none());
+                        self.reset = Some(rpc);
+                    }
+                    CoordinatorRequest::Inspect(req) => req.inspect(&self),
+                },
+                Event::Request(None) => break,
+                Event::ResetComplete => {
+                    self.reset.take().unwrap().complete(());
+                }
+            }
+        }
+    }
+}
diff --git a/vm/devices/storage/nvme_test/src/workers/io.rs b/vm/devices/storage/nvme_test/src/workers/io.rs
new file mode 100644
index 0000000000..3247889fe2
--- /dev/null
+++ b/vm/devices/storage/nvme_test/src/workers/io.rs
@@ -0,0 +1,296 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! I/O queue handler.
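+//!
+//! Each I/O queue pair runs as its own task: the handler races new
+//! submission queue entries against in-flight I/O completions, capping
+//! outstanding commands at MAX_IO_QUEUE_DEPTH per queue.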
+
+use crate::error::CommandResult;
+use crate::error::NvmeError;
+use crate::namespace::Namespace;
+use crate::queue::CompletionQueue;
+use crate::queue::DoorbellRegister;
+use crate::queue::QueueError;
+use crate::queue::ShadowDoorbell;
+use crate::queue::SubmissionQueue;
+use crate::spec;
+use crate::spec::nvm;
+use crate::workers::MAX_DATA_TRANSFER_SIZE;
+use futures_concurrency::future::Race;
+use guestmem::GuestMemory;
+use inspect::Inspect;
+use std::collections::BTreeMap;
+use std::future::Future;
+use std::future::pending;
+use std::pin::Pin;
+use std::sync::Arc;
+use task_control::AsyncRun;
+use task_control::Cancelled;
+use task_control::InspectTask;
+use task_control::StopTask;
+use thiserror::Error;
+use unicycle::FuturesUnordered;
+use vmcore::interrupt::Interrupt;
+
+#[derive(Inspect)]
+pub struct IoHandler {
+    mem: GuestMemory,
+    sqid: u16,
+    #[inspect(skip)]
+    admin_response: mesh::Sender<u16>,
+}
+
+#[derive(Inspect)]
+pub struct IoState {
+    sq: SubmissionQueue,
+    cq: CompletionQueue,
+    #[inspect(skip)]
+    namespaces: BTreeMap<u32, Arc<Namespace>>,
+    #[inspect(skip)]
+    ios: FuturesUnordered<Pin<Box<dyn Future<Output = IoResult> + Send>>>,
+    io_count: usize,
+    queue_state: IoQueueState,
+}
+
+#[derive(Inspect)]
+enum IoQueueState {
+    Active,
+    Deleting,
+    Deleted,
+}
+
+impl IoState {
+    pub fn new(
+        sq_gpa: u64,
+        sq_len: u16,
+        sq_tail: Arc<DoorbellRegister>,
+        sq_sdb_idx_gpas: Option<ShadowDoorbell>,
+        cq_gpa: u64,
+        cq_len: u16,
+        cq_head: Arc<DoorbellRegister>,
+        cq_sdb_idx_gpas: Option<ShadowDoorbell>,
+        interrupt: Option<Interrupt>,
+        namespaces: BTreeMap<u32, Arc<Namespace>>,
+    ) -> Self {
+        Self {
+            sq: SubmissionQueue::new(sq_tail, sq_gpa, sq_len, sq_sdb_idx_gpas),
+            cq: CompletionQueue::new(cq_head, interrupt, cq_gpa, cq_len, cq_sdb_idx_gpas),
+            namespaces,
+            ios: FuturesUnordered::new(),
+            io_count: 0,
+            queue_state: IoQueueState::Active,
+        }
+    }
+
+    pub fn add_namespace(&mut self, nsid: u32, namespace: Arc<Namespace>) {
+        assert!(self.namespaces.insert(nsid, namespace).is_none());
+    }
+
+    pub fn remove_namespace(&mut self, nsid: u32) {
+        let _ = self.namespaces.remove(&nsid).unwrap();
+    }
+
+    /// Drains any pending IOs.
+    ///
+    /// This future may be dropped and reissued.
+    pub async fn drain(&mut self) {
+        while self.ios.next().await.is_some() {
+            self.io_count -= 1;
+        }
+    }
+}
+
+struct IoResult {
+    nsid: u32,
+    cid: u16,
+    opcode: nvm::NvmOpcode,
+    result: Result<CommandResult, NvmeError>,
+    advance_evt_idx: bool,
+}
+
+impl AsyncRun<IoState> for IoHandler {
+    async fn run(&mut self, stop: &mut StopTask<'_>, state: &mut IoState) -> Result<(), Cancelled> {
+        let mem = self.mem.clone();
+        stop.until_stopped(async {
+            if let Err(err) = self.process(state, &mem).await {
+                tracing::error!(error = &err as &dyn std::error::Error, "io handler failed");
+            }
+        })
+        .await
+    }
+}
+
+impl InspectTask<IoState> for IoHandler {
+    fn inspect(&self, req: inspect::Request<'_>, state: Option<&IoState>) {
+        req.respond().merge(self).merge(state);
+    }
+}
+
+const MAX_IO_QUEUE_DEPTH: usize = 8;
+
+#[derive(Debug, Error)]
+enum HandlerError {
+    #[error("nvme queue error")]
+    Queue(#[from] QueueError),
+}
+
+impl IoHandler {
+    pub fn new(mem: GuestMemory, sqid: u16, admin_response: mesh::Sender<u16>) -> Self {
+        Self {
+            mem,
+            sqid,
+            admin_response,
+        }
+    }
+
+    pub fn delete(&mut self, state: &mut IoState) {
+        match state.queue_state {
+            IoQueueState::Active => state.queue_state = IoQueueState::Deleting,
+            IoQueueState::Deleting | IoQueueState::Deleted => {}
+        }
+    }
+
+    async fn process(
+        &mut self,
+        state: &mut IoState,
+        mem: &GuestMemory,
+    ) -> Result<(), HandlerError> {
+        loop {
+            let deleting = match state.queue_state {
+                IoQueueState::Active => {
+                    // Wait for a completion to be ready. This will be necessary either
+                    // to post an immediate result or to post an IO completion. It's not
+                    // strictly necessary to start a new IO, but handling that special
+                    // case is not worth the complexity.
+                    state.cq.wait_ready(mem).await?;
+                    false
+                }
+                IoQueueState::Deleting => {
+                    if state.ios.is_empty() {
+                        self.admin_response.send(self.sqid);
+                        state.queue_state = IoQueueState::Deleted;
+                        break;
+                    }
+                    true
+                }
+                IoQueueState::Deleted => break,
+            };
+
+            enum Event {
+                Sq(Result<spec::Command, QueueError>),
+                Io(IoResult),
+            }
+
+            let next_sqe = async {
+                if state.io_count < MAX_IO_QUEUE_DEPTH && !deleting {
+                    Event::Sq(state.sq.next(&self.mem).await)
+                } else {
+                    pending().await
+                }
+            };
+
+            let next_io_completion = async {
+                if state.ios.is_empty() {
+                    pending().await
+                } else {
+                    Event::Io(state.ios.next().await.unwrap())
+                }
+            };
+
+            let event = (next_sqe, next_io_completion).race().await;
+            let (cid, result) = match event {
+                Event::Io(io_result) => {
+                    if io_result.advance_evt_idx {
+                        let result = state.sq.advance_evt_idx(&self.mem);
+                        if result.is_err() {
+                            tracelimit::warn_ratelimited!("failure to advance evt_idx");
+                        }
+                    }
+                    state.io_count -= 1;
+                    let result = match io_result.result {
+                        Ok(cr) => cr,
+                        Err(err) => {
+                            tracelimit::warn_ratelimited!(
+                                error = &err as &dyn std::error::Error,
+                                cid = io_result.cid,
+                                nsid = io_result.nsid,
+                                opcode = ?io_result.opcode,
+                                "io error"
+                            );
+                            err.into()
+                        }
+                    };
+                    (io_result.cid, result)
+                }
+                Event::Sq(r) => {
+                    let command = r?;
+                    let cid = command.cdw0.cid();
+
+                    if let Some(ns) = state.namespaces.get(&command.nsid) {
+                        let ns = ns.clone();
+                        // If the queue depth is low, immediately update the evt_idx, so that
+                        // the guest driver will ring the doorbell again. If the queue depth is
+                        // high, defer this until I/O completion, on the theory that high queue
+                        // depth workloads won't wait before enqueuing more work.
+                        //
+                        // TODO: Update later after performance testing, perhaps to something
+                        // like 2*(number of VPs)/(number of queue pairs).
+                        let mut advance_evt_idx = true;
+                        if state.io_count <= 1 {
+                            let result = state.sq.advance_evt_idx(&self.mem);
+                            if result.is_err() {
+                                tracelimit::warn_ratelimited!("failure to advance evt_idx");
+                            }
+                            advance_evt_idx = false;
+                        }
+                        let io = Box::pin(async move {
+                            let result = ns.nvm_command(MAX_DATA_TRANSFER_SIZE, &command).await;
+                            IoResult {
+                                nsid: command.nsid,
+                                opcode: nvm::NvmOpcode(command.cdw0.opcode()),
+                                cid,
+                                result,
+                                advance_evt_idx,
+                            }
+                        });
+                        state.ios.push(io);
+                        state.io_count += 1;
+                        continue;
+                    }
+
+                    let result = state.sq.advance_evt_idx(&self.mem);
+                    if result.is_err() {
+                        tracelimit::warn_ratelimited!("failure to advance evt_idx");
+                    }
+                    (cid, spec::Status::INVALID_NAMESPACE_OR_FORMAT.into())
+                }
+            };
+
+            let completion = spec::Completion {
+                dw0: result.dw[0],
+                dw1: result.dw[1],
+                sqhd: state.sq.sqhd(),
+                sqid: self.sqid,
+                cid,
+                status: spec::CompletionStatus::new().with_status(result.status.0),
+            };
+            if !state.cq.write(&self.mem, completion)? {
+                assert!(deleting);
+                tracelimit::warn_ratelimited!("dropped i/o completion during queue deletion");
+            }
+            state
+                .cq
+                .catch_up_evt_idx(false, state.io_count as u32, &self.mem)?;
+        }
+        Ok(())
+    }
+
+    pub fn update_shadow_db(
+        &mut self,
+        mem: &GuestMemory,
+        state: &mut IoState,
+        sq_sdb: ShadowDoorbell,
+        cq_sdb: ShadowDoorbell,
+    ) {
+        state.sq.update_shadow_db(mem, sq_sdb);
+        state.cq.update_shadow_db(mem, cq_sdb);
+    }
+}
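+
+// A minimal sketch of the event-loop shape used by `process` above, kept as
+// documentation only: two async sources are mapped into a common enum and
+// raced, so whichever becomes ready first drives the next loop iteration.
+// (Illustrative; `Event` and the futures below are stand-ins, not this
+// module's types.)
+//
+// ```ignore
+// use futures_concurrency::future::Race;
+//
+// enum Event {
+//     Sq(u32),
+//     Io(u32),
+// }
+//
+// async fn next_event() -> Event {
+//     let next_sqe = async { Event::Sq(0) };
+//     let next_io = async { Event::Io(0) };
+//     // Race: the first future to complete wins; both must yield `Event`.
+//     (next_sqe, next_io).race().await
+// }
+// ```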