From de463f48d5c32a1eaa31050c0515daf2a1504aea Mon Sep 17 00:00:00 2001 From: John Starks Date: Mon, 20 Apr 2026 11:51:32 -0700 Subject: [PATCH 1/9] vfio_assigned_device: dynamic IOMMU mapping for P2P DMA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VFIO device assignment needs peer-to-peer DMA between assigned devices — one device must be able to DMA into another device's BAR. The old VFIO container manager snapshotted guest RAM at creation time via GuestMemory::full_mapping() and programmed a static set of IOMMU identity mappings, so device BARs (which are mapped dynamically as the guest configures them) were never visible to the IOMMU. Introduce a DmaTarget trait in the region manager that receives incremental sub-mapping events (map_dma / unmap_dma) as regions are enabled, disabled, or modified. VFIO containers register themselves as DMA mapper consumers via a new DmaMapperClient, and the region manager replays all existing active sub-mappings on registration, then sends live updates as the memory map evolves. This means device BAR mappings are now programmed into every VFIO container's IOMMU as they appear, enabling P2P DMA between assigned devices. The region manager maintains the VaMapper lifecycle for backends that need host VAs (VFIO type1's pin_user_pages), controlled by a needs_va flag at registration. This also sets up the abstraction for future iommufd support, which maps from fd+offset directly and won't need a VaMapper at all. On the vfio_sys side, Container::map_dma now takes *const u8 instead of a raw u64 and is marked unsafe, properly reflecting the memory safety requirements that were previously hidden in the API. --- Cargo.lock | 2 +- openvmm/membacking/src/lib.rs | 4 + .../src/mapping_manager/va_mapper.rs | 7 + .../src/memory_manager/device_memory.rs | 5 +- openvmm/membacking/src/memory_manager/mod.rs | 5 + openvmm/membacking/src/region_manager.rs | 344 +++++++++++++++++- openvmm/openvmm_core/src/worker/dispatch.rs | 2 +- .../pci/vfio_assigned_device/Cargo.toml | 2 +- .../pci/vfio_assigned_device/src/lib.rs | 1 - .../pci/vfio_assigned_device/src/manager.rs | 140 ++++--- .../pci/vfio_assigned_device/src/resolver.rs | 9 +- vm/devices/user_driver/vfio_sys/src/lib.rs | 29 +- 12 files changed, 461 insertions(+), 89 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 81c5c5a278..74801c9c2f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8701,8 +8701,8 @@ dependencies = [ "anyhow", "async-trait", "chipset_device", - "guestmem", "inspect", + "membacking", "memory_range", "mesh", "pal_async", diff --git a/openvmm/membacking/src/lib.rs b/openvmm/membacking/src/lib.rs index 3f2b50f617..650ab321d7 100644 --- a/openvmm/membacking/src/lib.rs +++ b/openvmm/membacking/src/lib.rs @@ -88,6 +88,7 @@ mod sys { /// On Unix, this is an empty (uninhabitable) enum.
pub type RemoteProcess = sys::RemoteProcess; +pub use mapping_manager::Mappable; pub use memory_manager::DeviceMemoryMapper; pub use memory_manager::GuestMemoryBuilder; pub use memory_manager::GuestMemoryClient; @@ -97,3 +98,6 @@ pub use memory_manager::PartitionAttachError; pub use memory_manager::RamVisibility; pub use memory_manager::RamVisibilityControl; pub use memory_manager::SharedMemoryBacking; +pub use region_manager::DmaMapperClient; +pub use region_manager::DmaMapperHandle; +pub use region_manager::DmaTarget; diff --git a/openvmm/membacking/src/mapping_manager/va_mapper.rs b/openvmm/membacking/src/mapping_manager/va_mapper.rs index be561660cc..34bcbd856b 100644 --- a/openvmm/membacking/src/mapping_manager/va_mapper.rs +++ b/openvmm/membacking/src/mapping_manager/va_mapper.rs @@ -44,6 +44,10 @@ use std::sync::Arc; use std::thread::JoinHandle; use thiserror::Error; +/// A virtual address space mapper for guest memory. +/// +/// Maintains a reserved VA range and maps file-backed or anonymous memory +/// into it as directed by the mapping manager. pub struct VaMapper { inner: Arc, process: Option, @@ -259,14 +263,17 @@ impl VaMapper { self.inner.request_mapping(range, false).await } + /// Returns the base pointer of the VA reservation. pub fn as_ptr(&self) -> *mut u8 { self.inner.mapping.as_ptr().cast() } + /// Returns the length of the VA reservation in bytes. pub fn len(&self) -> usize { self.inner.mapping.len() } + /// Returns the remote process, if this mapper maps into a remote process. pub fn process(&self) -> Option<&RemoteProcess> { self.process.as_ref() } diff --git a/openvmm/membacking/src/memory_manager/device_memory.rs b/openvmm/membacking/src/memory_manager/device_memory.rs index 8b8c1a9959..46dd7fb40b 100644 --- a/openvmm/membacking/src/memory_manager/device_memory.rs +++ b/openvmm/membacking/src/memory_manager/device_memory.rs @@ -158,7 +158,10 @@ impl MappableGuestMemory for DeviceMemoryControl { MemoryRange::try_from(gpa..gpa.wrapping_add(self.0.len as u64)) .map_err(|err| io::Error::new(io::ErrorKind::InvalidInput, err))?, DEVICE_PRIORITY, - false, // device memory cannot currently be a DMA target + false, // not a DMA target: excludes device BARs from + // GuestMemorySharing (vhost-user). IOMMU consumers + // get notified of these mappings via the DmaMapper + // path regardless of this flag. ) .await .map_err(io::Error::other)?; diff --git a/openvmm/membacking/src/memory_manager/mod.rs b/openvmm/membacking/src/memory_manager/mod.rs index a47078f034..2a29eb2d18 100644 --- a/openvmm/membacking/src/memory_manager/mod.rs +++ b/openvmm/membacking/src/memory_manager/mod.rs @@ -455,6 +455,11 @@ impl GuestMemoryManager { DeviceMemoryMapper::new(self.region_manager.client().clone()) } + /// Returns a client for registering DMA mappers (VFIO, iommufd). + pub fn dma_mapper_client(&self) -> crate::region_manager::DmaMapperClient { + crate::region_manager::DmaMapperClient::new(self.region_manager.client()) + } + /// Returns an object for manipulating the visibility state of different RAM /// regions. pub fn ram_visibility_control(&self) -> RamVisibilityControl { diff --git a/openvmm/membacking/src/region_manager.rs b/openvmm/membacking/src/region_manager.rs index 33369b6a71..e6c00cd17c 100644 --- a/openvmm/membacking/src/region_manager.rs +++ b/openvmm/membacking/src/region_manager.rs @@ -4,9 +4,13 @@ //! Implements the region manager, which tracks regions and their mappings, as //! well as partitions to map the regions into. 
+// UNSAFETY: Calling unsafe DmaTarget::map_dma with validated VA pointers. +#![expect(unsafe_code)] + use crate::mapping_manager::Mappable; use crate::mapping_manager::MappingManagerClient; use crate::mapping_manager::MappingParams; +use crate::mapping_manager::VaMapper; use crate::partition_mapper::PartitionMapper; use futures::StreamExt; use inspect::Inspect; @@ -17,9 +21,123 @@ use mesh::rpc::Rpc; use mesh::rpc::RpcSend; use pal_async::task::Spawn; use std::cmp::Ordering; +use std::sync::Arc; use thiserror::Error; use vmcore::local_only::LocalOnly; +/// A consumer of IOMMU-granularity DMA mapping events. +/// +/// Unlike [`PartitionMemoryMap`](virt::PartitionMemoryMap), which maps entire +/// regions by VA pointer for lazy SLAT resolution, this trait receives +/// individual sub-mapping events with the backing fd + offset, suitable for +/// explicit IOMMU programming (VFIO type1, iommufd, etc.). +/// +/// DMA targets receive notifications for **all** active sub-mappings, +/// including device BAR memory (regions with `dma_target: false`). The +/// `dma_target` flag controls only whether a region is exposed via +/// `GuestMemorySharing` (for vhost-user); IOMMU consumers need the full +/// GPA→backing map to program identity mappings for all guest-visible memory. +/// +/// Implementations must be `Send + Sync` because they are stored behind `Arc` +/// in the region manager task. +pub trait DmaTarget: Send + Sync { + /// Program an IOMMU mapping from `iova` to the backing described by + /// `mappable` at `file_offset` for `size` bytes. + /// + /// `host_va` is the host virtual address of the mapping, provided when + /// the target was registered with `needs_va = true`. When `None`, the + /// implementation should use `mappable` and `file_offset` directly + /// (e.g., iommufd). + /// + /// # Safety + /// When `host_va` is `Some`, the pointed-to memory must be backed and + /// must not be unmapped for the duration of the resulting IOMMU mapping. + /// The caller (the crate-internal `DmaMapper`) guarantees this by holding an + /// [`Arc`] and calling `ensure_mapped` before each invocation. + unsafe fn map_dma( + &self, + iova: u64, + host_va: Option<*const u8>, + mappable: &Mappable, + file_offset: u64, + size: u64, + ) -> anyhow::Result<()>; + + /// Remove an IOMMU mapping at `iova` for `size` bytes. + /// + /// The region manager may call this with a range that is larger than any + /// single `map_dma` call (e.g., unmapping an entire region at once even + /// though individual sub-mappings were mapped separately). Implementations + /// must handle this gracefully — for example, by unmapping whatever + /// sub-ranges are currently mapped within the requested range and ignoring + /// any gaps. + fn unmap_dma(&self, iova: u64, size: u64) -> anyhow::Result<()>; +} + +/// Wraps a [`DmaTarget`] for use by the region manager. +/// +/// Holds an optional [`VaMapper`] to ensure pages are faulted in before +/// `map_dma` (needed for VFIO type1's `pin_user_pages`). +struct DmaMapper { + id: DmaMapperId, + target: Arc<dyn DmaTarget>, + va_mapper: Option<Arc<VaMapper>>, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +struct DmaMapperId(u64); + +impl DmaMapper { + /// Map a sub-mapping into the IOMMU. + async fn map_dma( + &self, + iova: u64, + mappable: &Mappable, + file_offset: u64, + size: u64, + ) -> anyhow::Result<()> { + // Ensure the VaMapper has the backing pages faulted in and compute + // the host VA for type1 backends.
+ let host_va = if let Some(va_mapper) = &self.va_mapper { + va_mapper + .ensure_mapped(MemoryRange::new(iova..iova + size)) + .await + .map_err(|_| { + anyhow::anyhow!( + "VA range {:#x}..{:#x} has no backing mapping", + iova, + iova + size + ) + })?; + Some(va_mapper.as_ptr().wrapping_add(iova as usize).cast_const()) + } else { + None + }; + // SAFETY: When host_va is Some, the VaMapper has been ensure_mapped + // for this range. The VaMapper is held alive by this DmaMapper (via + // Arc), so the VA reservation persists. The pages are backed because + // ensure_mapped succeeded. The IOMMU mapping will be torn down + // (via unmap_dma in disable_region or remove_dma_mapper) before the + // VaMapper releases the VA range. + unsafe { + self.target + .map_dma(iova, host_va, mappable, file_offset, size) + } + } + + /// Unmap a range from the IOMMU. + fn unmap_dma(&self, iova: u64, size: u64) { + if let Err(e) = self.target.unmap_dma(iova, size) { + tracing::warn!( + error = &*e as &dyn std::error::Error, + iova, + size, + "DMA unmap failed" + ); + } + } +} + /// The region manager. #[derive(Debug, Inspect)] pub struct RegionManager { @@ -130,6 +248,8 @@ fn inspect_mappings(req: inspect::Request<'_>, region_start: u64, mappings: &[Re struct RegionManagerTaskInner { partitions: Vec, + dma_mappers: Vec, + next_dma_mapper_id: u64, mapping_manager: MappingManagerClient, } @@ -144,6 +264,8 @@ enum RegionRequest { AddPartition( LocalOnly>>, ), + AddDmaMapper(LocalOnly, bool), anyhow::Result>>), + RemoveDmaMapper(LocalOnly), Inspect(inspect::Deferred), } @@ -178,6 +300,8 @@ impl RegionManagerTask { inner: RegionManagerTaskInner { mapping_manager, partitions: Vec::new(), + dma_mappers: Vec::new(), + next_dma_mapper_id: 0, }, } } @@ -197,6 +321,14 @@ impl RegionManagerTask { rpc.handle(async |partition| self.add_partition(partition).await) .await } + RegionRequest::AddDmaMapper(LocalOnly(rpc)) => { + let ((target, needs_va), rpc) = rpc.split(); + let result = self.add_dma_mapper(target, needs_va).await; + rpc.complete(result); + } + RegionRequest::RemoveDmaMapper(LocalOnly(id)) => { + self.remove_dma_mapper(id); + } RegionRequest::AddRegion(rpc) => rpc.handle_sync(|params| self.add_region(params)), RegionRequest::RemoveRegion(rpc) => { rpc.handle(async |id| self.unmap_region(id, true).await) @@ -234,6 +366,73 @@ impl RegionManagerTask { Ok(()) } + async fn add_dma_mapper( + &mut self, + target: Arc, + needs_va: bool, + ) -> anyhow::Result { + // Create a VaMapper if the target needs host VAs for IOMMU programming. + let va_mapper = if needs_va { + Some( + self.inner + .mapping_manager + .new_mapper() + .await + .map_err(|e| anyhow::anyhow!(e))?, + ) + } else { + None + }; + + let id = DmaMapperId(self.inner.next_dma_mapper_id); + self.inner.next_dma_mapper_id += 1; + + let mapper = DmaMapper { + id, + target, + va_mapper, + }; + + // Replay existing active sub-mappings so the new IOMMU consumer + // gets the current state. 
+ for region in &self.regions { + if region.is_active { + for mapping in ®ion.mappings { + let iova = region.params.range.start() + mapping.params.range_in_region.start(); + let size = mapping.params.range_in_region.len(); + mapper + .map_dma( + iova, + &mapping.params.mappable, + mapping.params.file_offset, + size, + ) + .await?; + } + } + } + self.inner.dma_mappers.push(mapper); + Ok(id) + } + + fn remove_dma_mapper(&mut self, id: DmaMapperId) { + if let Some(pos) = self.inner.dma_mappers.iter().position(|m| m.id == id) { + let mapper = &self.inner.dma_mappers[pos]; + // Unmap all active sub-mappings from this mapper before removing it. + for region in &self.regions { + if region.is_active { + for mapping in ®ion.mappings { + let iova = + region.params.range.start() + mapping.params.range_in_region.start(); + let size = mapping.params.range_in_region.len(); + mapper.unmap_dma(iova, size); + } + } + } + self.inner.dma_mappers.swap_remove(pos); + } + } + fn region_index(&self, id: RegionId) -> usize { self.regions.iter().position(|r| r.id == id).unwrap() } @@ -410,6 +609,25 @@ impl RegionManagerTask { for partition in &mut self.inner.partitions { partition.notify_new_mapping(range).await; } + + for dma_mapper in &self.inner.dma_mappers { + if let Err(e) = dma_mapper + .map_dma( + range.start(), + ¶ms.mappable, + params.file_offset, + range.len(), + ) + .await + { + tracing::warn!( + error = &*e as &dyn std::error::Error, + iova = range.start(), + size = range.len(), + "DMA mapper failed to map new sub-mapping" + ); + } + } } region.mappings.push(RegionMapping { params }); @@ -418,6 +636,22 @@ impl RegionManagerTask { async fn remove_mappings(&mut self, id: RegionId, range_in_region: MemoryRange) { let index = self.region_index(id); let region = &mut self.regions[index]; + let active_range = region.active_range(); + + // Collect absolute GPA ranges of mappings being removed (before + // mutating the vec) so we can notify DMA mappers. + let removed_ranges: Vec = if active_range.is_some() { + let region_range = region.params.range; + region + .mappings + .iter() + .filter(|m| range_in_region.contains(&m.params.range_in_region)) + .map(|m| range_within(region_range, m.params.range_in_region)) + .collect() + } else { + Vec::new() + }; + region.mappings.retain_mut(|mapping| { if !range_in_region.contains(&mapping.params.range_in_region) { assert!( @@ -428,12 +662,18 @@ impl RegionManagerTask { } false }); - if let Some(region_range) = region.active_range() { + if let Some(region_range) = active_range { self.inner .mapping_manager .remove_mappings(range_within(region_range, range_in_region)) .await; + // Unmap removed sub-mappings from DMA mappers. + for removed in &removed_ranges { + for dma_mapper in &self.inner.dma_mappers { + dma_mapper.unmap_dma(removed.start(), removed.len()); + } + } // Currently there is no need to tell the partitions about the // removed mappings; they will find out when the underlying VA is // invalidated by the kernel. @@ -466,6 +706,30 @@ impl RegionManagerTaskInner { .await; } + // Map sub-mappings into DMA mappers (per-sub-mapping granularity). 
+ for mapping in ®ion.mappings { + let iova = region.params.range.start() + mapping.params.range_in_region.start(); + let size = mapping.params.range_in_region.len(); + for dma_mapper in &self.dma_mappers { + if let Err(e) = dma_mapper + .map_dma( + iova, + &mapping.params.mappable, + mapping.params.file_offset, + size, + ) + .await + { + tracing::warn!( + error = &*e as &dyn std::error::Error, + iova, + size, + "DMA mapper failed to map sub-mapping during region enable" + ); + } + } + } + // Map the region into the partitions. for partition in &mut self.partitions { partition @@ -486,7 +750,15 @@ impl RegionManagerTaskInner { "disabling region" ); + // Unmap DMA mappers first — IOMMU entries must be removed before + // the VA mappings are torn down (type1's pin_user_pages pins are + // released by unmap_dma, and the underlying pages must still be + // valid at that point). let region_range = region.params.range; + for dma_mapper in &mut self.dma_mappers { + dma_mapper.unmap_dma(region_range.start(), region_range.len()); + } + for partition in &mut self.partitions { partition.unmap_region(region_range); } @@ -562,6 +834,76 @@ impl RegionManagerClient { } } +/// Client for registering DMA mappers with the region manager. +/// +/// This is the public-facing handle for IOMMU consumers (VFIO, iommufd) +/// to register themselves. It exposes only `add_dma_mapper`, hiding the +/// rest of the region manager API. +#[derive(Clone)] +pub struct DmaMapperClient { + req_send: mesh::Sender, +} + +impl DmaMapperClient { + pub(crate) fn new(region_manager: &RegionManagerClient) -> Self { + Self { + req_send: region_manager.req_send.clone(), + } + } + + /// Register a DMA target to receive sub-mapping events. + /// + /// This may only be called in the same process as the region manager. + /// + /// If `needs_va` is `true`, the region manager will maintain a `VaMapper` + /// and pass a host VA to [`DmaTarget::map_dma`] for each sub-mapping. + /// Use this for backends that program the IOMMU via host VAs (VFIO type1). + /// + /// If `needs_va` is `false`, no `VaMapper` is created and `host_va` will + /// be `None`. Use this for backends that map from the fd directly (iommufd). + /// + /// The replay loop maps all existing active sub-mappings into the new + /// consumer. + /// + /// Returns a [`DmaMapperHandle`] that removes the mapper when dropped. + pub async fn add_dma_mapper( + &self, + target: Arc, + needs_va: bool, + ) -> anyhow::Result { + let id = self + .req_send + .call( + |x| RegionRequest::AddDmaMapper(LocalOnly(x)), + (target, needs_va), + ) + .await + .unwrap()?; + Ok(DmaMapperHandle { + id: Some(id), + req_send: self.req_send.clone(), + }) + } +} + +/// Handle to a registered DMA mapper. +/// +/// Removes the mapper from the region manager on drop, unmapping all +/// active IOMMU entries. +pub struct DmaMapperHandle { + id: Option, + req_send: mesh::Sender, +} + +impl Drop for DmaMapperHandle { + fn drop(&mut self) { + if let Some(id) = self.id { + self.req_send + .send(RegionRequest::RemoveDmaMapper(LocalOnly(id))); + } + } +} + /// A handle to a region. /// /// Removes the region on drop. 
diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 0535edb655..615740c8d1 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -1798,7 +1798,7 @@ impl InitializedVm { let vfio_inspect = { let vfio_resolver = vfio_assigned_device::resolver::VfioDeviceResolver::new( driver_source.builder().build("vfio-container-mgr"), - gm.clone(), + memory_manager.dma_mapper_client(), ); let handle = vfio_resolver.inspect_handle(); resolver.add_async_resolver::< diff --git a/vm/devices/pci/vfio_assigned_device/Cargo.toml b/vm/devices/pci/vfio_assigned_device/Cargo.toml index 7e7587eea8..b50dea75e8 100644 --- a/vm/devices/pci/vfio_assigned_device/Cargo.toml +++ b/vm/devices/pci/vfio_assigned_device/Cargo.toml @@ -8,9 +8,9 @@ rust-version.workspace = true [target.'cfg(target_os = "linux")'.dependencies] chipset_device.workspace = true -guestmem.workspace = true inspect.workspace = true memory_range.workspace = true +membacking.workspace = true mesh.workspace = true pal_async.workspace = true pci_core.workspace = true diff --git a/vm/devices/pci/vfio_assigned_device/src/lib.rs b/vm/devices/pci/vfio_assigned_device/src/lib.rs index dc819e72d7..77dc5200f7 100644 --- a/vm/devices/pci/vfio_assigned_device/src/lib.rs +++ b/vm/devices/pci/vfio_assigned_device/src/lib.rs @@ -13,7 +13,6 @@ //! fatal. #![cfg(target_os = "linux")] -#![forbid(unsafe_code)] pub mod manager; pub mod resolver; diff --git a/vm/devices/pci/vfio_assigned_device/src/manager.rs b/vm/devices/pci/vfio_assigned_device/src/manager.rs index 1432bcc460..75b156f43c 100644 --- a/vm/devices/pci/vfio_assigned_device/src/manager.rs +++ b/vm/devices/pci/vfio_assigned_device/src/manager.rs @@ -7,15 +7,53 @@ //! tables) for every assigned device, this module manages a pool of containers //! and reuses them across devices whose IOMMU groups are compatible. +// UNSAFETY: Implementing unsafe DmaTarget::map_dma for VFIO type1 IOMMU. +#![expect(unsafe_code)] + use anyhow::Context as _; -use guestmem::GuestMemory; use inspect::{Inspect, InspectMut}; +use membacking::DmaMapperClient; use mesh::rpc::FailableRpc; use mesh::rpc::RpcSend as _; use std::collections::HashMap; use std::fs::File; use std::sync::Arc; +/// Implements [`membacking::DmaTarget`] for VFIO type1 IOMMU containers. +/// +/// Translates sub-mapping events from the region manager into VFIO +/// `map_dma`/`unmap_dma` ioctls. The host VA needed for `pin_user_pages` +/// is provided by the region manager's `DmaMapper` wrapper. +struct VfioType1DmaTarget { + container: Arc, +} + +impl membacking::DmaTarget for VfioType1DmaTarget { + unsafe fn map_dma( + &self, + iova: u64, + host_va: Option<*const u8>, + _mappable: &membacking::Mappable, + _file_offset: u64, + size: u64, + ) -> anyhow::Result<()> { + let vaddr = host_va.expect("VFIO type1 requires host VA (registered with needs_va=true)"); + // SAFETY: The caller (DmaMapper in membacking) guarantees that the + // host VA is backed and stable via ensure_mapped + VaMapper lifetime. + unsafe { + self.container + .map_dma(iova, vaddr, size) + .context("VFIO DMA map failed") + } + } + + fn unmap_dma(&self, iova: u64, size: u64) -> anyhow::Result<()> { + self.container + .unmap_dma(iova, size) + .context("VFIO DMA unmap failed") + } +} + /// RPC messages for the container manager task. 
enum VfioManagerRpc { /// Prepare a container and group for a device, creating or reusing @@ -69,6 +107,9 @@ impl VfioDeviceBinding { struct ContainerEntry { id: u64, container: Arc, + /// Handle to the DMA mapper registration — removes the mapper from + /// the region manager when dropped, unmapping all IOMMU entries. + _dma_handle: membacking::DmaMapperHandle, } /// Manages VFIO containers and groups, sharing containers across devices. @@ -90,11 +131,9 @@ pub(crate) struct VfioContainerManager { /// Next container ID to assign. #[inspect(skip)] next_container_id: u64, - /// Guest memory handle for DMA mapping. - guest_memory: GuestMemory, - /// Cached guest memory regions + base VA, fetched lazily. + /// Client for registering VFIO containers as DMA mappers. #[inspect(skip)] - dma_info: Option, + dma_mapper_client: DmaMapperClient, #[inspect(skip)] recv: mesh::Receiver, } @@ -140,23 +179,16 @@ struct GroupEntry { container_id: u64, } -struct DmaInfo { - regions: Vec, - base_va: u64, - va_size: u64, -} - impl VfioContainerManager { /// Create a new container manager. - pub fn new(guest_memory: GuestMemory) -> Self { + pub fn new(dma_mapper_client: DmaMapperClient) -> Self { Self { containers: Vec::new(), groups: HashMap::new(), devices: Vec::new(), next_device_id: 0, next_container_id: 0, - guest_memory, - dma_info: None, + dma_mapper_client, recv: mesh::Receiver::new(), } } @@ -365,8 +397,9 @@ impl VfioContainerManager { .map(|c| &c.container) } - /// Create a new container, set IOMMU type, map guest RAM, and attach the - /// group. Returns the container ID. + /// Create a new container, set IOMMU type, register with the region + /// manager for dynamic DMA mapping, and attach the group. Returns the + /// container ID. async fn create_container_for_group( &mut self, group: &vfio_sys::Group, @@ -383,35 +416,21 @@ impl VfioContainerManager { .set_iommu(vfio_sys::IommuType::Type1v2) .context("failed to set VFIO IOMMU type to Type1v2 (IOMMU required)")?; - // Lazily fetch guest memory regions on first use. - if self.dma_info.is_none() { - self.dma_info = Some(self.fetch_dma_info().await?); - } - let dma_info = self.dma_info.as_ref().unwrap(); - - // Identity-map guest RAM (IOVA == GPA) for device DMA access. - for region in &dma_info.regions { - let gpa = region.guest_address; - let size = region.size; - let gpa_end = gpa - .checked_add(size) - .context("guest memory region overflows u64")?; - anyhow::ensure!( - gpa_end <= dma_info.va_size, - "guest memory region {:#x}..{:#x} exceeds mapping size {:#x}", - gpa, - gpa_end, - dma_info.va_size - ); - let vaddr = dma_info.base_va + gpa; - container.map_dma(gpa, vaddr, size).with_context(|| { - format!( - "failed to map DMA for guest memory region {:#x}..{:#x}", - gpa, gpa_end - ) - })?; - tracing::debug!(gpa, size, vaddr, "mapped guest RAM for VFIO DMA"); - } + let container = Arc::new(container); + + let dma_target: Arc = Arc::new(VfioType1DmaTarget { + container: container.clone(), + }); + + // Register as a DMA mapper — the region manager will create a + // VaMapper internally (since needs_va is true) and replay all + // existing active sub-mappings (guest RAM + any active device + // BARs) into this container's IOMMU. 
+ let dma_handle = self + .dma_mapper_client + .add_dma_mapper(dma_target, true) + .await + .context("failed to register VFIO container with region manager")?; tracing::info!( pci_id, @@ -420,37 +439,16 @@ impl VfioContainerManager { "created new VFIO container" ); - let container = Arc::new(container); let id = self.next_container_id; self.next_container_id += 1; - self.containers.push(ContainerEntry { id, container }); + self.containers.push(ContainerEntry { + id, + container, + _dma_handle: dma_handle, + }); Ok(id) } - async fn fetch_dma_info(&self) -> anyhow::Result { - let sharing = self - .guest_memory - .sharing() - .context("VFIO requires shareable guest memory")?; - - let regions = sharing - .get_regions() - .await - .map_err(|e| anyhow::anyhow!(e)) - .context("failed to get shareable guest memory regions")?; - - let (base_va, va_size) = self - .guest_memory - .full_mapping() - .context("VFIO DMA mapping requires linearly mapped guest memory")?; - - Ok(DmaInfo { - regions, - base_va: base_va as u64, - va_size: va_size as u64, - }) - } - pub(crate) fn client(&mut self) -> VfioManagerClient { VfioManagerClient { sender: self.recv.sender(), diff --git a/vm/devices/pci/vfio_assigned_device/src/resolver.rs b/vm/devices/pci/vfio_assigned_device/src/resolver.rs index 49965919ee..6c09dac7b2 100644 --- a/vm/devices/pci/vfio_assigned_device/src/resolver.rs +++ b/vm/devices/pci/vfio_assigned_device/src/resolver.rs @@ -8,6 +8,7 @@ use crate::manager::VfioContainerManager; use crate::manager::VfioManagerClient; use anyhow::Context as _; use async_trait::async_trait; +use membacking::DmaMapperClient; use pci_resources::ResolvePciDeviceHandleParams; use pci_resources::ResolvedPciDevice; use vfio_assigned_device_resources::VfioDeviceHandle; @@ -27,10 +28,10 @@ pub struct VfioDeviceResolver { impl VfioDeviceResolver { /// Create a new resolver, spawning the container manager task. /// - /// The manager lazily initializes DMA mappings on first use, so creating - /// this is cheap for VMs that have no VFIO devices. - pub fn new(spawner: impl pal_async::task::Spawn, guest_memory: guestmem::GuestMemory) -> Self { - let mut manager = VfioContainerManager::new(guest_memory); + /// The manager registers each new VFIO container with the region manager + /// so that DMA mappings are kept in sync with the VM's memory map. + pub fn new(spawner: impl pal_async::task::Spawn, dma_mapper_client: DmaMapperClient) -> Self { + let mut manager = VfioContainerManager::new(dma_mapper_client); let client = manager.client(); let task = spawner.spawn("vfio-container-mgr", manager.run()); Self { diff --git a/vm/devices/user_driver/vfio_sys/src/lib.rs b/vm/devices/user_driver/vfio_sys/src/lib.rs index 1a70ba3038..976f1d9f22 100644 --- a/vm/devices/user_driver/vfio_sys/src/lib.rs +++ b/vm/devices/user_driver/vfio_sys/src/lib.rs @@ -36,8 +36,19 @@ use vfio_bindings::bindings::vfio::vfio_region_sparse_mmap_area; /// Returns the host page size. pub fn host_page_size() -> u64 { - // SAFETY: sysconf(_SC_PAGESIZE) is always safe and always succeeds on Linux. - unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 } + use std::sync::atomic::{AtomicU64, Ordering::Relaxed}; + static PAGE_SIZE: AtomicU64 = const { AtomicU64::new(0) }; + + let page_size = PAGE_SIZE.load(Relaxed); + if page_size == 0 { + // SAFETY: sysconf(_SC_PAGESIZE) is always safe and always succeeds on Linux. 
+ let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 }; + assert!(page_size >= 4096); + PAGE_SIZE.store(page_size, Relaxed); + page_size + } else { + page_size + } } mod ioctl { @@ -147,16 +158,18 @@ impl Container { /// page-aligned. /// /// Only valid when the container uses a Type1v2 IOMMU. - pub fn map_dma(&self, iova: u64, vaddr: u64, size: u64) -> anyhow::Result<()> { + /// + /// # Safety + /// `vaddr` must point to valid, backed memory for `size` bytes. The + /// memory must not be unmapped while the IOMMU mapping is live (until + /// a corresponding `unmap_dma` call). + pub unsafe fn map_dma(&self, iova: u64, vaddr: *const u8, size: u64) -> anyhow::Result<()> { use vfio_bindings::bindings::vfio::VFIO_DMA_MAP_FLAG_READ; use vfio_bindings::bindings::vfio::VFIO_DMA_MAP_FLAG_WRITE; - // SAFETY: sysconf(_SC_PAGESIZE) is always safe and returns the - // host page size. - let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) }; - anyhow::ensure!(page_size > 0, "sysconf(_SC_PAGESIZE) failed"); - let page_size = page_size as u64; + let page_size = host_page_size(); let page_mask = page_size - 1; + let vaddr = vaddr as u64; anyhow::ensure!( iova & page_mask == 0 && vaddr & page_mask == 0 && size & page_mask == 0, "VFIO DMA mapping requires page-aligned iova ({iova:#x}), vaddr ({vaddr:#x}), and size ({size:#x}), page size {page_size:#x}" From 1dbb5da6b2407573bdd90bc960eb92363cba9fff Mon Sep 17 00:00:00 2001 From: John Starks Date: Thu, 23 Apr 2026 17:19:51 -0700 Subject: [PATCH 2/9] fix build --- Cargo.lock | 1 + vm/devices/pci/vfio_assigned_device/Cargo.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 74801c9c2f..f74fe11a24 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8701,6 +8701,7 @@ dependencies = [ "anyhow", "async-trait", "chipset_device", + "guestmem", "inspect", "membacking", "memory_range", diff --git a/vm/devices/pci/vfio_assigned_device/Cargo.toml b/vm/devices/pci/vfio_assigned_device/Cargo.toml index b50dea75e8..e05b52c754 100644 --- a/vm/devices/pci/vfio_assigned_device/Cargo.toml +++ b/vm/devices/pci/vfio_assigned_device/Cargo.toml @@ -8,6 +8,7 @@ rust-version.workspace = true [target.'cfg(target_os = "linux")'.dependencies] chipset_device.workspace = true +guestmem.workspace = true inspect.workspace = true memory_range.workspace = true membacking.workspace = true From 6b0529a85b5db70211e87e7850a0a99112c01f81 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 25 Apr 2026 00:11:16 +0000 Subject: [PATCH 3/9] fixes --- openvmm/membacking/src/region_manager.rs | 67 ++++++++----------- .../pci/vfio_assigned_device/src/manager.rs | 9 ++- vm/devices/user_driver/vfio_sys/src/lib.rs | 7 +- 3 files changed, 36 insertions(+), 47 deletions(-) diff --git a/openvmm/membacking/src/region_manager.rs b/openvmm/membacking/src/region_manager.rs index e6c00cd17c..3ebeb618d9 100644 --- a/openvmm/membacking/src/region_manager.rs +++ b/openvmm/membacking/src/region_manager.rs @@ -41,8 +41,8 @@ use vmcore::local_only::LocalOnly; /// Implementations must be `Send + Sync` because they are stored behind `Arc` /// in the region manager task. pub trait DmaTarget: Send + Sync { - /// Program an IOMMU mapping from `iova` to the backing described by - /// `mappable` at `file_offset` for `size` bytes. + /// Program an IOMMU mapping for `range` to the backing described by + /// `mappable` at `file_offset`. /// /// `host_va` is the host virtual address of the mapping, provided when /// the target was registered with `needs_va = true`. 
When `None`, the @@ -56,14 +56,13 @@ pub trait DmaTarget: Send + Sync { /// [`Arc`] and calling `ensure_mapped` before each invocation. unsafe fn map_dma( &self, - iova: u64, + range: MemoryRange, host_va: Option<*const u8>, mappable: &Mappable, file_offset: u64, - size: u64, ) -> anyhow::Result<()>; - /// Remove an IOMMU mapping at `iova` for `size` bytes. + /// Remove an IOMMU mapping for `range`. /// /// The region manager may call this with a range that is larger than any /// single `map_dma` call (e.g., unmapping an entire region at once even @@ -71,7 +70,7 @@ pub trait DmaTarget: Send + Sync { /// must handle this gracefully — for example, by unmapping whatever /// sub-ranges are currently mapped within the requested range and ignoring /// any gaps. - fn unmap_dma(&self, iova: u64, size: u64) -> anyhow::Result<()>; + fn unmap_dma(&self, range: MemoryRange) -> anyhow::Result<()>; } /// Wraps a [`DmaTarget`] for use by the region manager. @@ -91,25 +90,25 @@ impl DmaMapper { /// Map a sub-mapping into the IOMMU. async fn map_dma( &self, - iova: u64, + range: MemoryRange, mappable: &Mappable, file_offset: u64, - size: u64, ) -> anyhow::Result<()> { // Ensure the VaMapper has the backing pages faulted in and compute // the host VA for type1 backends. let host_va = if let Some(va_mapper) = &self.va_mapper { va_mapper - .ensure_mapped(MemoryRange::new(iova..iova + size)) + .ensure_mapped(range) .await .map_err(|_| { anyhow::anyhow!( - "VA range {:#x}..{:#x} has no backing mapping", - iova, - iova + size + "VA range {range} has no backing mapping", ) })?; - Some(va_mapper.as_ptr().wrapping_add(iova as usize).cast_const()) + // SAFETY: range.start() is within the VA reservation (ensured by + // ensure_mapped succeeding), so this produces a valid pointer + // within the mapping. + Some(unsafe { va_mapper.as_ptr().add(range.start() as usize).cast_const() }) } else { None }; @@ -121,17 +120,16 @@ impl DmaMapper { // VaMapper releases the VA range. unsafe { self.target - .map_dma(iova, host_va, mappable, file_offset, size) + .map_dma(range, host_va, mappable, file_offset) } } /// Unmap a range from the IOMMU. 
- fn unmap_dma(&self, iova: u64, size: u64) { - if let Err(e) = self.target.unmap_dma(iova, size) { + fn unmap_dma(&self, range: MemoryRange) { + if let Err(e) = self.target.unmap_dma(range) { tracing::warn!( error = &*e as &dyn std::error::Error, - iova, - size, + %range, "DMA unmap failed" ); } @@ -398,14 +396,12 @@ impl RegionManagerTask { for region in &self.regions { if region.is_active { for mapping in ®ion.mappings { - let iova = region.params.range.start() + mapping.params.range_in_region.start(); - let size = mapping.params.range_in_region.len(); + let range = range_within(region.params.range, mapping.params.range_in_region); mapper .map_dma( - iova, + range, &mapping.params.mappable, mapping.params.file_offset, - size, ) .await?; } @@ -422,10 +418,8 @@ impl RegionManagerTask { for region in &self.regions { if region.is_active { for mapping in ®ion.mappings { - let iova = - region.params.range.start() + mapping.params.range_in_region.start(); - let size = mapping.params.range_in_region.len(); - mapper.unmap_dma(iova, size); + let range = range_within(region.params.range, mapping.params.range_in_region); + mapper.unmap_dma(range); } } } @@ -613,17 +607,15 @@ impl RegionManagerTask { for dma_mapper in &self.inner.dma_mappers { if let Err(e) = dma_mapper .map_dma( - range.start(), + range, ¶ms.mappable, params.file_offset, - range.len(), ) .await { tracing::warn!( error = &*e as &dyn std::error::Error, - iova = range.start(), - size = range.len(), + %range, "DMA mapper failed to map new sub-mapping" ); } @@ -669,9 +661,9 @@ impl RegionManagerTask { .await; // Unmap removed sub-mappings from DMA mappers. - for removed in &removed_ranges { + for &removed in &removed_ranges { for dma_mapper in &self.inner.dma_mappers { - dma_mapper.unmap_dma(removed.start(), removed.len()); + dma_mapper.unmap_dma(removed); } } // Currently there is no need to tell the partitions about the @@ -708,22 +700,19 @@ impl RegionManagerTaskInner { // Map sub-mappings into DMA mappers (per-sub-mapping granularity). for mapping in ®ion.mappings { - let iova = region.params.range.start() + mapping.params.range_in_region.start(); - let size = mapping.params.range_in_region.len(); + let range = range_within(region.params.range, mapping.params.range_in_region); for dma_mapper in &self.dma_mappers { if let Err(e) = dma_mapper .map_dma( - iova, + range, &mapping.params.mappable, mapping.params.file_offset, - size, ) .await { tracing::warn!( error = &*e as &dyn std::error::Error, - iova, - size, + %range, "DMA mapper failed to map sub-mapping during region enable" ); } @@ -756,7 +745,7 @@ impl RegionManagerTaskInner { // valid at that point). 
let region_range = region.params.range; for dma_mapper in &mut self.dma_mappers { - dma_mapper.unmap_dma(region_range.start(), region_range.len()); + dma_mapper.unmap_dma(region_range); } for partition in &mut self.partitions { diff --git a/vm/devices/pci/vfio_assigned_device/src/manager.rs b/vm/devices/pci/vfio_assigned_device/src/manager.rs index 75b156f43c..e80d48f8cb 100644 --- a/vm/devices/pci/vfio_assigned_device/src/manager.rs +++ b/vm/devices/pci/vfio_assigned_device/src/manager.rs @@ -31,25 +31,24 @@ struct VfioType1DmaTarget { impl membacking::DmaTarget for VfioType1DmaTarget { unsafe fn map_dma( &self, - iova: u64, + range: memory_range::MemoryRange, host_va: Option<*const u8>, _mappable: &membacking::Mappable, _file_offset: u64, - size: u64, ) -> anyhow::Result<()> { let vaddr = host_va.expect("VFIO type1 requires host VA (registered with needs_va=true)"); // SAFETY: The caller (DmaMapper in membacking) guarantees that the // host VA is backed and stable via ensure_mapped + VaMapper lifetime. unsafe { self.container - .map_dma(iova, vaddr, size) + .map_dma(range.start(), vaddr, range.len()) .context("VFIO DMA map failed") } } - fn unmap_dma(&self, iova: u64, size: u64) -> anyhow::Result<()> { + fn unmap_dma(&self, range: memory_range::MemoryRange) -> anyhow::Result<()> { self.container - .unmap_dma(iova, size) + .unmap_dma(range.start(), range.len()) .context("VFIO DMA unmap failed") } } diff --git a/vm/devices/user_driver/vfio_sys/src/lib.rs b/vm/devices/user_driver/vfio_sys/src/lib.rs index 976f1d9f22..19defc9030 100644 --- a/vm/devices/user_driver/vfio_sys/src/lib.rs +++ b/vm/devices/user_driver/vfio_sys/src/lib.rs @@ -41,9 +41,10 @@ pub fn host_page_size() -> u64 { let page_size = PAGE_SIZE.load(Relaxed); if page_size == 0 { - // SAFETY: sysconf(_SC_PAGESIZE) is always safe and always succeeds on Linux. - let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 }; - assert!(page_size >= 4096); + // SAFETY: sysconf(_SC_PAGESIZE) is always safe to call on Linux. + let raw = unsafe { libc::sysconf(libc::_SC_PAGESIZE) }; + assert!(raw > 0, "sysconf(_SC_PAGESIZE) failed: {raw}"); + let page_size = raw as u64; PAGE_SIZE.store(page_size, Relaxed); page_size } else { From 81733dafd5edea07b0f86f4214ad8281ae47a3b8 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 25 Apr 2026 00:19:05 +0000 Subject: [PATCH 4/9] feedback --- openvmm/membacking/src/region_manager.rs | 246 ++++++++++++++++++++- vm/devices/user_driver/vfio_sys/src/lib.rs | 6 +- 2 files changed, 244 insertions(+), 8 deletions(-) diff --git a/openvmm/membacking/src/region_manager.rs b/openvmm/membacking/src/region_manager.rs index 3ebeb618d9..b95277fd4e 100644 --- a/openvmm/membacking/src/region_manager.rs +++ b/openvmm/membacking/src/region_manager.rs @@ -62,14 +62,14 @@ pub trait DmaTarget: Send + Sync { file_offset: u64, ) -> anyhow::Result<()>; - /// Remove an IOMMU mapping for `range`. + /// Remove IOMMU mappings within `range`. /// - /// The region manager may call this with a range that is larger than any - /// single `map_dma` call (e.g., unmapping an entire region at once even - /// though individual sub-mappings were mapped separately). Implementations - /// must handle this gracefully — for example, by unmapping whatever - /// sub-ranges are currently mapped within the requested range and ignoring - /// any gaps. 
+ /// The region manager may call this with a range that covers multiple + /// prior `map_dma` calls (e.g., unmapping an entire region at once even + /// though individual sub-mappings were mapped separately). The range + /// will always be aligned to mapping boundaries — it will not bisect + /// any prior mapping. Gaps within the range (unmapped sub-ranges) are + /// expected and must not cause errors. fn unmap_dma(&self, range: MemoryRange) -> anyhow::Result<()>; } @@ -979,14 +979,63 @@ impl Drop for RegionHandle { mod tests { use super::MapParams; use super::RegionManagerTask; + use crate::mapping_manager::Mappable; use crate::mapping_manager::MappingManager; use crate::region_manager::AddRegionError; + use crate::region_manager::DmaTarget; use crate::region_manager::RegionId; + use crate::region_manager::RegionMappingParams; use crate::region_manager::RegionParams; use memory_range::MemoryRange; use pal_async::async_test; use pal_async::task::Spawn; use std::ops::Range; + use std::sync::Arc; + use std::sync::Mutex; + + /// Records map/unmap calls for test assertions. + #[derive(Default)] + struct RecordingDmaTarget { + events: Mutex>, + } + + #[derive(Debug, Clone, PartialEq, Eq)] + enum DmaEvent { + Map(MemoryRange), + Unmap(MemoryRange), + } + + impl DmaTarget for RecordingDmaTarget { + unsafe fn map_dma( + &self, + range: MemoryRange, + _host_va: Option<*const u8>, + _mappable: &Mappable, + _file_offset: u64, + ) -> anyhow::Result<()> { + self.events.lock().unwrap().push(DmaEvent::Map(range)); + Ok(()) + } + + fn unmap_dma(&self, range: MemoryRange) -> anyhow::Result<()> { + self.events.lock().unwrap().push(DmaEvent::Unmap(range)); + Ok(()) + } + } + + impl RecordingDmaTarget { + fn take_events(&self) -> Vec { + std::mem::take(&mut self.events.lock().unwrap()) + } + } + + /// Create a dummy Mappable from /dev/zero for tests. + fn test_mappable() -> Mappable { + use std::fs::File; + Mappable::from(std::os::fd::OwnedFd::from( + File::open("/dev/zero").unwrap(), + )) + } #[async_test] async fn test_region_overlap(spawn: impl Spawn) { @@ -1043,4 +1092,187 @@ mod tests { let _low = task.add(0, 0x1000..0x8000).await.unwrap(); } + + /// Helper that wraps RegionManagerTask for DMA tests. + struct DmaTestTask { + task: RegionManagerTask, + mappable: Mappable, + } + + impl DmaTestTask { + fn new(spawn: impl Spawn) -> Self { + let mm = MappingManager::new(spawn, 0x200000, false); + Self { + task: RegionManagerTask::new(mm.client().clone()), + mappable: test_mappable(), + } + } + + async fn add_region(&mut self, range: Range) -> RegionId { + let id = self + .task + .add_region(RegionParams { + priority: 0, + name: format!("{range:x?}"), + range: MemoryRange::new(range), + dma_target: false, + }) + .unwrap(); + self.task + .map_region( + id, + MapParams { + executable: true, + writable: true, + prefetch: false, + }, + ) + .await; + id + } + + async fn add_mapping(&mut self, id: RegionId, range_in_region: Range) { + self.task + .add_mapping( + id, + RegionMappingParams { + range_in_region: MemoryRange::new(range_in_region), + mappable: self.mappable.clone(), + file_offset: 0, + writable: true, + }, + ) + .await; + } + } + + #[async_test] + async fn test_dma_replay_on_registration(spawn: impl Spawn) { + let mut t = DmaTestTask::new(&spawn); + let r = t.add_region(0x0..0x10000).await; + t.add_mapping(r, 0x0..0x4000).await; + t.add_mapping(r, 0x8000..0xC000).await; + + // Register a DMA mapper — it should replay the two active mappings. 
+ let target = Arc::new(RecordingDmaTarget::default()); + let id = t + .task + .add_dma_mapper(target.clone(), false) + .await + .unwrap(); + + assert_eq!( + target.take_events(), + vec![ + DmaEvent::Map(MemoryRange::new(0x0..0x4000)), + DmaEvent::Map(MemoryRange::new(0x8000..0xC000)), + ] + ); + + // Clean up. + t.task.remove_dma_mapper(id); + } + + #[async_test] + async fn test_dma_live_map_unmap(spawn: impl Spawn) { + let mut t = DmaTestTask::new(&spawn); + let r = t.add_region(0x0..0x10000).await; + + let target = Arc::new(RecordingDmaTarget::default()); + let _id = t + .task + .add_dma_mapper(target.clone(), false) + .await + .unwrap(); + target.take_events(); // discard empty replay + + // Adding a mapping to an active region should notify the DMA mapper. + t.add_mapping(r, 0x0..0x4000).await; + assert_eq!( + target.take_events(), + vec![DmaEvent::Map(MemoryRange::new(0x0..0x4000))] + ); + + // Removing the mapping should unmap it. + t.task + .remove_mappings(r, MemoryRange::new(0x0..0x4000)) + .await; + assert_eq!( + target.take_events(), + vec![DmaEvent::Unmap(MemoryRange::new(0x0..0x4000))] + ); + } + + #[async_test] + async fn test_dma_disable_region_unmaps(spawn: impl Spawn) { + let mut t = DmaTestTask::new(&spawn); + let r = t.add_region(0x0..0x10000).await; + t.add_mapping(r, 0x0..0x4000).await; + t.add_mapping(r, 0x8000..0xC000).await; + + let target = Arc::new(RecordingDmaTarget::default()); + let _id = t + .task + .add_dma_mapper(target.clone(), false) + .await + .unwrap(); + target.take_events(); // discard replay + + // Disabling the region should unmap the entire region range. + t.task.unmap_region(r, false).await; + assert_eq!( + target.take_events(), + vec![DmaEvent::Unmap(MemoryRange::new(0x0..0x10000))] + ); + } + + #[async_test] + async fn test_dma_remove_mapper_unmaps_all(spawn: impl Spawn) { + let mut t = DmaTestTask::new(&spawn); + let r = t.add_region(0x0..0x10000).await; + t.add_mapping(r, 0x0..0x4000).await; + t.add_mapping(r, 0x8000..0xC000).await; + + let target = Arc::new(RecordingDmaTarget::default()); + let id = t + .task + .add_dma_mapper(target.clone(), false) + .await + .unwrap(); + target.take_events(); // discard replay + + // Removing the mapper should unmap each active sub-mapping. + t.task.remove_dma_mapper(id); + assert_eq!( + target.take_events(), + vec![ + DmaEvent::Unmap(MemoryRange::new(0x0..0x4000)), + DmaEvent::Unmap(MemoryRange::new(0x8000..0xC000)), + ] + ); + } + + #[async_test] + async fn test_dma_inactive_region_no_notifications(spawn: impl Spawn) { + let mut t = DmaTestTask::new(&spawn); + let r = t.add_region(0x0..0x10000).await; + t.add_mapping(r, 0x0..0x4000).await; + + // Disable the region before registering the mapper. + t.task.unmap_region(r, false).await; + + let target = Arc::new(RecordingDmaTarget::default()); + let _id = t + .task + .add_dma_mapper(target.clone(), false) + .await + .unwrap(); + + // No replay for inactive regions. + assert_eq!(target.take_events(), vec![]); + + // Adding a mapping while inactive should also not notify. + t.add_mapping(r, 0x8000..0xC000).await; + assert_eq!(target.take_events(), vec![]); + } } diff --git a/vm/devices/user_driver/vfio_sys/src/lib.rs b/vm/devices/user_driver/vfio_sys/src/lib.rs index 19defc9030..fe1579d5db 100644 --- a/vm/devices/user_driver/vfio_sys/src/lib.rs +++ b/vm/devices/user_driver/vfio_sys/src/lib.rs @@ -194,7 +194,11 @@ impl Container { /// Unmap a previously mapped IOVA range from the IOMMU. /// - /// `iova` and `size` must match a previous `map_dma` call. 
+ /// For Type1v2, the unmap range must not bisect any previous mapping: + /// if a mapping exists at `iova`, it must start exactly at `iova`, and + /// if a mapping exists at `iova + size - 1`, it must end there. + /// Multiple mappings may be unmapped in one call as long as these + /// boundary conditions hold. Gaps within the range are fine. pub fn unmap_dma(&self, iova: u64, size: u64) -> anyhow::Result<()> { let mut dma_unmap = vfio_bindings::bindings::vfio::vfio_iommu_type1_dma_unmap { argsz: size_of::() as u32, From b9587af5d9aef0971b9d639af84aae820641b5fd Mon Sep 17 00:00:00 2001 From: John Starks Date: Sat, 25 Apr 2026 00:30:57 +0000 Subject: [PATCH 5/9] feedback --- openvmm/membacking/src/region_manager.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/openvmm/membacking/src/region_manager.rs b/openvmm/membacking/src/region_manager.rs index b95277fd4e..538926a59b 100644 --- a/openvmm/membacking/src/region_manager.rs +++ b/openvmm/membacking/src/region_manager.rs @@ -396,7 +396,8 @@ impl RegionManagerTask { for region in &self.regions { if region.is_active { for mapping in ®ion.mappings { - let range = range_within(region.params.range, mapping.params.range_in_region); + let range = + range_within(region.params.range, mapping.params.range_in_region); mapper .map_dma( range, @@ -407,6 +408,7 @@ impl RegionManagerTask { } } } + self.inner.dma_mappers.push(mapper); Ok(id) } @@ -852,7 +854,9 @@ impl DmaMapperClient { /// be `None`. Use this for backends that map from the fd directly (iommufd). /// /// The replay loop maps all existing active sub-mappings into the new - /// consumer. + /// consumer. On failure, already-mapped entries are **not** rolled back; + /// the caller must clean up by dropping the [`DmaTarget`] (e.g., closing + /// the VFIO container fd). /// /// Returns a [`DmaMapperHandle`] that removes the mapper when dropped. pub async fn add_dma_mapper( From 1e202f0831961d30e34844b924e7d2a17cff29c2 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sun, 26 Apr 2026 21:13:09 +0000 Subject: [PATCH 6/9] fmt --- openvmm/membacking/src/region_manager.rs | 70 +++++------------------- 1 file changed, 15 insertions(+), 55 deletions(-) diff --git a/openvmm/membacking/src/region_manager.rs b/openvmm/membacking/src/region_manager.rs index 538926a59b..344df43c84 100644 --- a/openvmm/membacking/src/region_manager.rs +++ b/openvmm/membacking/src/region_manager.rs @@ -12,6 +12,7 @@ use crate::mapping_manager::MappingManagerClient; use crate::mapping_manager::MappingParams; use crate::mapping_manager::VaMapper; use crate::partition_mapper::PartitionMapper; +use anyhow::Context as _; use futures::StreamExt; use inspect::Inspect; use inspect::InspectMut; @@ -100,11 +101,7 @@ impl DmaMapper { va_mapper .ensure_mapped(range) .await - .map_err(|_| { - anyhow::anyhow!( - "VA range {range} has no backing mapping", - ) - })?; + .context("VA range has no backing mapping")?; // SAFETY: range.start() is within the VA reservation (ensured by // ensure_mapped succeeding), so this produces a valid pointer // within the mapping. @@ -118,10 +115,7 @@ impl DmaMapper { // ensure_mapped succeeded. The IOMMU mapping will be torn down // (via unmap_dma in disable_region or remove_dma_mapper) before the // VaMapper releases the VA range. - unsafe { - self.target - .map_dma(range, host_va, mappable, file_offset) - } + unsafe { self.target.map_dma(range, host_va, mappable, file_offset) } } /// Unmap a range from the IOMMU. 
@@ -396,14 +390,9 @@ impl RegionManagerTask { for region in &self.regions { if region.is_active { for mapping in ®ion.mappings { - let range = - range_within(region.params.range, mapping.params.range_in_region); + let range = range_within(region.params.range, mapping.params.range_in_region); mapper - .map_dma( - range, - &mapping.params.mappable, - mapping.params.file_offset, - ) + .map_dma(range, &mapping.params.mappable, mapping.params.file_offset) .await?; } } @@ -420,7 +409,8 @@ impl RegionManagerTask { for region in &self.regions { if region.is_active { for mapping in ®ion.mappings { - let range = range_within(region.params.range, mapping.params.range_in_region); + let range = + range_within(region.params.range, mapping.params.range_in_region); mapper.unmap_dma(range); } } @@ -608,11 +598,7 @@ impl RegionManagerTask { for dma_mapper in &self.inner.dma_mappers { if let Err(e) = dma_mapper - .map_dma( - range, - ¶ms.mappable, - params.file_offset, - ) + .map_dma(range, ¶ms.mappable, params.file_offset) .await { tracing::warn!( @@ -705,11 +691,7 @@ impl RegionManagerTaskInner { let range = range_within(region.params.range, mapping.params.range_in_region); for dma_mapper in &self.dma_mappers { if let Err(e) = dma_mapper - .map_dma( - range, - &mapping.params.mappable, - mapping.params.file_offset, - ) + .map_dma(range, &mapping.params.mappable, mapping.params.file_offset) .await { tracing::warn!( @@ -1036,9 +1018,7 @@ mod tests { /// Create a dummy Mappable from /dev/zero for tests. fn test_mappable() -> Mappable { use std::fs::File; - Mappable::from(std::os::fd::OwnedFd::from( - File::open("/dev/zero").unwrap(), - )) + Mappable::from(std::os::fd::OwnedFd::from(File::open("/dev/zero").unwrap())) } #[async_test] @@ -1159,11 +1139,7 @@ mod tests { // Register a DMA mapper — it should replay the two active mappings. let target = Arc::new(RecordingDmaTarget::default()); - let id = t - .task - .add_dma_mapper(target.clone(), false) - .await - .unwrap(); + let id = t.task.add_dma_mapper(target.clone(), false).await.unwrap(); assert_eq!( target.take_events(), @@ -1183,11 +1159,7 @@ mod tests { let r = t.add_region(0x0..0x10000).await; let target = Arc::new(RecordingDmaTarget::default()); - let _id = t - .task - .add_dma_mapper(target.clone(), false) - .await - .unwrap(); + let _id = t.task.add_dma_mapper(target.clone(), false).await.unwrap(); target.take_events(); // discard empty replay // Adding a mapping to an active region should notify the DMA mapper. @@ -1215,11 +1187,7 @@ mod tests { t.add_mapping(r, 0x8000..0xC000).await; let target = Arc::new(RecordingDmaTarget::default()); - let _id = t - .task - .add_dma_mapper(target.clone(), false) - .await - .unwrap(); + let _id = t.task.add_dma_mapper(target.clone(), false).await.unwrap(); target.take_events(); // discard replay // Disabling the region should unmap the entire region range. @@ -1238,11 +1206,7 @@ mod tests { t.add_mapping(r, 0x8000..0xC000).await; let target = Arc::new(RecordingDmaTarget::default()); - let id = t - .task - .add_dma_mapper(target.clone(), false) - .await - .unwrap(); + let id = t.task.add_dma_mapper(target.clone(), false).await.unwrap(); target.take_events(); // discard replay // Removing the mapper should unmap each active sub-mapping. 
@@ -1266,11 +1230,7 @@ mod tests { t.task.unmap_region(r, false).await; let target = Arc::new(RecordingDmaTarget::default()); - let _id = t - .task - .add_dma_mapper(target.clone(), false) - .await - .unwrap(); + let _id = t.task.add_dma_mapper(target.clone(), false).await.unwrap(); // No replay for inactive regions. assert_eq!(target.take_events(), vec![]); From 77538701dab9779862592a8d1c215b82015f5586 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sun, 26 Apr 2026 21:22:15 +0000 Subject: [PATCH 7/9] feedback --- openvmm/membacking/src/region_manager.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/openvmm/membacking/src/region_manager.rs b/openvmm/membacking/src/region_manager.rs index 344df43c84..31a0a86479 100644 --- a/openvmm/membacking/src/region_manager.rs +++ b/openvmm/membacking/src/region_manager.rs @@ -1015,10 +1015,11 @@ mod tests { } } - /// Create a dummy Mappable from /dev/zero for tests. + /// Create a dummy Mappable for tests (cross-platform). fn test_mappable() -> Mappable { - use std::fs::File; - Mappable::from(std::os::fd::OwnedFd::from(File::open("/dev/zero").unwrap())) + sparse_mmap::alloc_shared_memory(0x10000, "test-dma") + .unwrap() + .into() } #[async_test] From 213000bc8e76f5eaf4c8c1949a21806eee4d9b2c Mon Sep 17 00:00:00 2001 From: John Starks Date: Sun, 26 Apr 2026 21:41:47 +0000 Subject: [PATCH 8/9] fix --- openvmm/membacking/src/region_manager.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/openvmm/membacking/src/region_manager.rs b/openvmm/membacking/src/region_manager.rs index 31a0a86479..60871e4865 100644 --- a/openvmm/membacking/src/region_manager.rs +++ b/openvmm/membacking/src/region_manager.rs @@ -975,9 +975,9 @@ mod tests { use memory_range::MemoryRange; use pal_async::async_test; use pal_async::task::Spawn; + use parking_lot::Mutex; use std::ops::Range; use std::sync::Arc; - use std::sync::Mutex; /// Records map/unmap calls for test assertions. #[derive(Default)] @@ -999,19 +999,19 @@ mod tests { _mappable: &Mappable, _file_offset: u64, ) -> anyhow::Result<()> { - self.events.lock().unwrap().push(DmaEvent::Map(range)); + self.events.lock().push(DmaEvent::Map(range)); Ok(()) } fn unmap_dma(&self, range: MemoryRange) -> anyhow::Result<()> { - self.events.lock().unwrap().push(DmaEvent::Unmap(range)); + self.events.lock().push(DmaEvent::Unmap(range)); Ok(()) } } impl RecordingDmaTarget { fn take_events(&self) -> Vec { - std::mem::take(&mut self.events.lock().unwrap()) + std::mem::take(&mut self.events.lock()) } } From 2668e16b70785436d52af486410716d798497af1 Mon Sep 17 00:00:00 2001 From: John Starks Date: Sun, 26 Apr 2026 23:20:40 +0000 Subject: [PATCH 9/9] fix --- openvmm/membacking/src/region_manager.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/openvmm/membacking/src/region_manager.rs b/openvmm/membacking/src/region_manager.rs index 60871e4865..58a1c77e1e 100644 --- a/openvmm/membacking/src/region_manager.rs +++ b/openvmm/membacking/src/region_manager.rs @@ -643,17 +643,19 @@ impl RegionManagerTask { false }); if let Some(region_range) = active_range { - self.inner - .mapping_manager - .remove_mappings(range_within(region_range, range_in_region)) - .await; - - // Unmap removed sub-mappings from DMA mappers. + // Unmap DMA mappers first — IOMMU entries must be removed before + // the VA mappings are torn down (same ordering as disable_region). 
for &removed in &removed_ranges { for dma_mapper in &self.inner.dma_mappers { dma_mapper.unmap_dma(removed); } } + + self.inner + .mapping_manager + .remove_mappings(range_within(region_range, range_in_region)) + .await; + // Currently there is no need to tell the partitions about the // removed mappings; they will find out when the underlying VA is // invalidated by the kernel.
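To make the end state of the series concrete, here is a minimal sketch of an out-of-tree DmaTarget consumer written against the final trait shape above (range-based map_dma/unmap_dma). LoggingDmaTarget and register_logger are hypothetical names used only for illustration; DmaTarget, DmaMapperClient, DmaMapperHandle, and Mappable are the types introduced by this series, and the client would come from GuestMemoryManager::dma_mapper_client() as wired up in dispatch.rs. Registering with needs_va = false means no VaMapper is created and host_va is always None.

use membacking::DmaMapperClient;
use membacking::DmaMapperHandle;
use membacking::DmaTarget;
use membacking::Mappable;
use memory_range::MemoryRange;
use std::sync::Arc;

/// Hypothetical consumer, for illustration only: logs the GPA ranges the
/// region manager asks it to map or unmap.
struct LoggingDmaTarget;

impl DmaTarget for LoggingDmaTarget {
    unsafe fn map_dma(
        &self,
        range: MemoryRange,
        _host_va: Option<*const u8>, // always None when registered with needs_va = false
        _mappable: &Mappable,
        _file_offset: u64,
    ) -> anyhow::Result<()> {
        // A real backend would program its IOMMU here, identity-mapping
        // IOVA == GPA from the backing fd + file_offset (or from host_va
        // when registered with needs_va = true, as VFIO type1 does).
        tracing::info!(%range, "map_dma");
        Ok(())
    }

    fn unmap_dma(&self, range: MemoryRange) -> anyhow::Result<()> {
        tracing::info!(%range, "unmap_dma");
        Ok(())
    }
}

/// Registers the target. All currently active sub-mappings are replayed into
/// map_dma before this returns; dropping the returned handle removes the
/// mapper and unmaps its active IOMMU entries.
async fn register_logger(client: &DmaMapperClient) -> anyhow::Result<DmaMapperHandle> {
    client.add_dma_mapper(Arc::new(LoggingDmaTarget), false).await
}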