diff --git a/Cargo.lock b/Cargo.lock index 81c5c5a278..f74fe11a24 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8703,6 +8703,7 @@ dependencies = [ "chipset_device", "guestmem", "inspect", + "membacking", "memory_range", "mesh", "pal_async", diff --git a/openvmm/membacking/src/lib.rs b/openvmm/membacking/src/lib.rs index 3f2b50f617..650ab321d7 100644 --- a/openvmm/membacking/src/lib.rs +++ b/openvmm/membacking/src/lib.rs @@ -88,6 +88,7 @@ mod sys { /// On Unix, this is an empty (uninhabitable) enum. pub type RemoteProcess = sys::RemoteProcess; +pub use mapping_manager::Mappable; pub use memory_manager::DeviceMemoryMapper; pub use memory_manager::GuestMemoryBuilder; pub use memory_manager::GuestMemoryClient; @@ -97,3 +98,6 @@ pub use memory_manager::PartitionAttachError; pub use memory_manager::RamVisibility; pub use memory_manager::RamVisibilityControl; pub use memory_manager::SharedMemoryBacking; +pub use region_manager::DmaMapperClient; +pub use region_manager::DmaMapperHandle; +pub use region_manager::DmaTarget; diff --git a/openvmm/membacking/src/mapping_manager/va_mapper.rs b/openvmm/membacking/src/mapping_manager/va_mapper.rs index be561660cc..34bcbd856b 100644 --- a/openvmm/membacking/src/mapping_manager/va_mapper.rs +++ b/openvmm/membacking/src/mapping_manager/va_mapper.rs @@ -44,6 +44,10 @@ use std::sync::Arc; use std::thread::JoinHandle; use thiserror::Error; +/// A virtual address space mapper for guest memory. +/// +/// Maintains a reserved VA range and maps file-backed or anonymous memory +/// into it as directed by the mapping manager. pub struct VaMapper { inner: Arc, process: Option, @@ -259,14 +263,17 @@ impl VaMapper { self.inner.request_mapping(range, false).await } + /// Returns the base pointer of the VA reservation. pub fn as_ptr(&self) -> *mut u8 { self.inner.mapping.as_ptr().cast() } + /// Returns the length of the VA reservation in bytes. pub fn len(&self) -> usize { self.inner.mapping.len() } + /// Returns the remote process, if this mapper maps into a remote process. pub fn process(&self) -> Option<&RemoteProcess> { self.process.as_ref() } diff --git a/openvmm/membacking/src/memory_manager/device_memory.rs b/openvmm/membacking/src/memory_manager/device_memory.rs index 8b8c1a9959..46dd7fb40b 100644 --- a/openvmm/membacking/src/memory_manager/device_memory.rs +++ b/openvmm/membacking/src/memory_manager/device_memory.rs @@ -158,7 +158,10 @@ impl MappableGuestMemory for DeviceMemoryControl { MemoryRange::try_from(gpa..gpa.wrapping_add(self.0.len as u64)) .map_err(|err| io::Error::new(io::ErrorKind::InvalidInput, err))?, DEVICE_PRIORITY, - false, // device memory cannot currently be a DMA target + false, // not a DMA target: excludes device BARs from + // GuestMemorySharing (vhost-user). IOMMU consumers + // get notified of these mappings via the DmaMapper + // path regardless of this flag. ) .await .map_err(io::Error::other)?; diff --git a/openvmm/membacking/src/memory_manager/mod.rs b/openvmm/membacking/src/memory_manager/mod.rs index a47078f034..2a29eb2d18 100644 --- a/openvmm/membacking/src/memory_manager/mod.rs +++ b/openvmm/membacking/src/memory_manager/mod.rs @@ -455,6 +455,11 @@ impl GuestMemoryManager { DeviceMemoryMapper::new(self.region_manager.client().clone()) } + /// Returns a client for registering DMA mappers (VFIO, iommufd). + pub fn dma_mapper_client(&self) -> crate::region_manager::DmaMapperClient { + crate::region_manager::DmaMapperClient::new(self.region_manager.client()) + } + /// Returns an object for manipulating the visibility state of different RAM /// regions. pub fn ram_visibility_control(&self) -> RamVisibilityControl { diff --git a/openvmm/membacking/src/region_manager.rs b/openvmm/membacking/src/region_manager.rs index 33369b6a71..58a1c77e1e 100644 --- a/openvmm/membacking/src/region_manager.rs +++ b/openvmm/membacking/src/region_manager.rs @@ -4,10 +4,15 @@ //! Implements the region manager, which tracks regions and their mappings, as //! well as partitions to map the regions into. +// UNSAFETY: Calling unsafe DmaTarget::map_dma with validated VA pointers. +#![expect(unsafe_code)] + use crate::mapping_manager::Mappable; use crate::mapping_manager::MappingManagerClient; use crate::mapping_manager::MappingParams; +use crate::mapping_manager::VaMapper; use crate::partition_mapper::PartitionMapper; +use anyhow::Context as _; use futures::StreamExt; use inspect::Inspect; use inspect::InspectMut; @@ -17,9 +22,114 @@ use mesh::rpc::Rpc; use mesh::rpc::RpcSend; use pal_async::task::Spawn; use std::cmp::Ordering; +use std::sync::Arc; use thiserror::Error; use vmcore::local_only::LocalOnly; +/// A consumer of IOMMU-granularity DMA mapping events. +/// +/// Unlike [`PartitionMemoryMap`](virt::PartitionMemoryMap), which maps entire +/// regions by VA pointer for lazy SLAT resolution, this trait receives +/// individual sub-mapping events with the backing fd + offset, suitable for +/// explicit IOMMU programming (VFIO type1, iommufd, etc.). +/// +/// DMA targets receive notifications for **all** active sub-mappings, +/// including device BAR memory (regions with `dma_target: false`). The +/// `dma_target` flag controls only whether a region is exposed via +/// `GuestMemorySharing` (for vhost-user); IOMMU consumers need the full +/// GPA→backing map to program identity mappings for all guest-visible memory. +/// +/// Implementations must be `Send + Sync` because they are stored behind `Arc` +/// in the region manager task. +pub trait DmaTarget: Send + Sync { + /// Program an IOMMU mapping for `range` to the backing described by + /// `mappable` at `file_offset`. + /// + /// `host_va` is the host virtual address of the mapping, provided when + /// the target was registered with `needs_va = true`. When `None`, the + /// implementation should use `mappable` and `file_offset` directly + /// (e.g., iommufd). + /// + /// # Safety + /// When `host_va` is `Some`, the pointed-to memory must be backed and + /// must not be unmapped for the duration of the resulting IOMMU mapping. + /// The caller (the crate-internal `DmaMapper`) guarantees this by holding an + /// [`Arc`] and calling `ensure_mapped` before each invocation. + unsafe fn map_dma( + &self, + range: MemoryRange, + host_va: Option<*const u8>, + mappable: &Mappable, + file_offset: u64, + ) -> anyhow::Result<()>; + + /// Remove IOMMU mappings within `range`. + /// + /// The region manager may call this with a range that covers multiple + /// prior `map_dma` calls (e.g., unmapping an entire region at once even + /// though individual sub-mappings were mapped separately). The range + /// will always be aligned to mapping boundaries — it will not bisect + /// any prior mapping. Gaps within the range (unmapped sub-ranges) are + /// expected and must not cause errors. + fn unmap_dma(&self, range: MemoryRange) -> anyhow::Result<()>; +} + +/// Wraps a [`DmaTarget`] for use by the region manager. +/// +/// Holds an optional [`VaMapper`] to ensure pages are faulted in before +/// `map_dma` (needed for VFIO type1's `pin_user_pages`). +struct DmaMapper { + id: DmaMapperId, + target: Arc, + va_mapper: Option>, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +struct DmaMapperId(u64); + +impl DmaMapper { + /// Map a sub-mapping into the IOMMU. + async fn map_dma( + &self, + range: MemoryRange, + mappable: &Mappable, + file_offset: u64, + ) -> anyhow::Result<()> { + // Ensure the VaMapper has the backing pages faulted in and compute + // the host VA for type1 backends. + let host_va = if let Some(va_mapper) = &self.va_mapper { + va_mapper + .ensure_mapped(range) + .await + .context("VA range has no backing mapping")?; + // SAFETY: range.start() is within the VA reservation (ensured by + // ensure_mapped succeeding), so this produces a valid pointer + // within the mapping. + Some(unsafe { va_mapper.as_ptr().add(range.start() as usize).cast_const() }) + } else { + None + }; + // SAFETY: When host_va is Some, the VaMapper has been ensure_mapped + // for this range. The VaMapper is held alive by this DmaMapper (via + // Arc), so the VA reservation persists. The pages are backed because + // ensure_mapped succeeded. The IOMMU mapping will be torn down + // (via unmap_dma in disable_region or remove_dma_mapper) before the + // VaMapper releases the VA range. + unsafe { self.target.map_dma(range, host_va, mappable, file_offset) } + } + + /// Unmap a range from the IOMMU. + fn unmap_dma(&self, range: MemoryRange) { + if let Err(e) = self.target.unmap_dma(range) { + tracing::warn!( + error = &*e as &dyn std::error::Error, + %range, + "DMA unmap failed" + ); + } + } +} + /// The region manager. #[derive(Debug, Inspect)] pub struct RegionManager { @@ -130,6 +240,8 @@ fn inspect_mappings(req: inspect::Request<'_>, region_start: u64, mappings: &[Re struct RegionManagerTaskInner { partitions: Vec, + dma_mappers: Vec, + next_dma_mapper_id: u64, mapping_manager: MappingManagerClient, } @@ -144,6 +256,8 @@ enum RegionRequest { AddPartition( LocalOnly>>, ), + AddDmaMapper(LocalOnly, bool), anyhow::Result>>), + RemoveDmaMapper(LocalOnly), Inspect(inspect::Deferred), } @@ -178,6 +292,8 @@ impl RegionManagerTask { inner: RegionManagerTaskInner { mapping_manager, partitions: Vec::new(), + dma_mappers: Vec::new(), + next_dma_mapper_id: 0, }, } } @@ -197,6 +313,14 @@ impl RegionManagerTask { rpc.handle(async |partition| self.add_partition(partition).await) .await } + RegionRequest::AddDmaMapper(LocalOnly(rpc)) => { + let ((target, needs_va), rpc) = rpc.split(); + let result = self.add_dma_mapper(target, needs_va).await; + rpc.complete(result); + } + RegionRequest::RemoveDmaMapper(LocalOnly(id)) => { + self.remove_dma_mapper(id); + } RegionRequest::AddRegion(rpc) => rpc.handle_sync(|params| self.add_region(params)), RegionRequest::RemoveRegion(rpc) => { rpc.handle(async |id| self.unmap_region(id, true).await) @@ -234,6 +358,67 @@ impl RegionManagerTask { Ok(()) } + async fn add_dma_mapper( + &mut self, + target: Arc, + needs_va: bool, + ) -> anyhow::Result { + // Create a VaMapper if the target needs host VAs for IOMMU programming. + let va_mapper = if needs_va { + Some( + self.inner + .mapping_manager + .new_mapper() + .await + .map_err(|e| anyhow::anyhow!(e))?, + ) + } else { + None + }; + + let id = DmaMapperId(self.inner.next_dma_mapper_id); + self.inner.next_dma_mapper_id += 1; + + let mapper = DmaMapper { + id, + target, + va_mapper, + }; + + // Replay existing active sub-mappings so the new IOMMU consumer + // gets the current state. + for region in &self.regions { + if region.is_active { + for mapping in ®ion.mappings { + let range = range_within(region.params.range, mapping.params.range_in_region); + mapper + .map_dma(range, &mapping.params.mappable, mapping.params.file_offset) + .await?; + } + } + } + + self.inner.dma_mappers.push(mapper); + Ok(id) + } + + fn remove_dma_mapper(&mut self, id: DmaMapperId) { + if let Some(pos) = self.inner.dma_mappers.iter().position(|m| m.id == id) { + let mapper = &self.inner.dma_mappers[pos]; + // Unmap all active sub-mappings from this mapper before removing it. + for region in &self.regions { + if region.is_active { + for mapping in ®ion.mappings { + let range = + range_within(region.params.range, mapping.params.range_in_region); + mapper.unmap_dma(range); + } + } + } + self.inner.dma_mappers.swap_remove(pos); + } + } + fn region_index(&self, id: RegionId) -> usize { self.regions.iter().position(|r| r.id == id).unwrap() } @@ -410,6 +595,19 @@ impl RegionManagerTask { for partition in &mut self.inner.partitions { partition.notify_new_mapping(range).await; } + + for dma_mapper in &self.inner.dma_mappers { + if let Err(e) = dma_mapper + .map_dma(range, ¶ms.mappable, params.file_offset) + .await + { + tracing::warn!( + error = &*e as &dyn std::error::Error, + %range, + "DMA mapper failed to map new sub-mapping" + ); + } + } } region.mappings.push(RegionMapping { params }); @@ -418,6 +616,22 @@ impl RegionManagerTask { async fn remove_mappings(&mut self, id: RegionId, range_in_region: MemoryRange) { let index = self.region_index(id); let region = &mut self.regions[index]; + let active_range = region.active_range(); + + // Collect absolute GPA ranges of mappings being removed (before + // mutating the vec) so we can notify DMA mappers. + let removed_ranges: Vec = if active_range.is_some() { + let region_range = region.params.range; + region + .mappings + .iter() + .filter(|m| range_in_region.contains(&m.params.range_in_region)) + .map(|m| range_within(region_range, m.params.range_in_region)) + .collect() + } else { + Vec::new() + }; + region.mappings.retain_mut(|mapping| { if !range_in_region.contains(&mapping.params.range_in_region) { assert!( @@ -428,7 +642,15 @@ impl RegionManagerTask { } false }); - if let Some(region_range) = region.active_range() { + if let Some(region_range) = active_range { + // Unmap DMA mappers first — IOMMU entries must be removed before + // the VA mappings are torn down (same ordering as disable_region). + for &removed in &removed_ranges { + for dma_mapper in &self.inner.dma_mappers { + dma_mapper.unmap_dma(removed); + } + } + self.inner .mapping_manager .remove_mappings(range_within(region_range, range_in_region)) @@ -466,6 +688,23 @@ impl RegionManagerTaskInner { .await; } + // Map sub-mappings into DMA mappers (per-sub-mapping granularity). + for mapping in ®ion.mappings { + let range = range_within(region.params.range, mapping.params.range_in_region); + for dma_mapper in &self.dma_mappers { + if let Err(e) = dma_mapper + .map_dma(range, &mapping.params.mappable, mapping.params.file_offset) + .await + { + tracing::warn!( + error = &*e as &dyn std::error::Error, + %range, + "DMA mapper failed to map sub-mapping during region enable" + ); + } + } + } + // Map the region into the partitions. for partition in &mut self.partitions { partition @@ -486,7 +725,15 @@ impl RegionManagerTaskInner { "disabling region" ); + // Unmap DMA mappers first — IOMMU entries must be removed before + // the VA mappings are torn down (type1's pin_user_pages pins are + // released by unmap_dma, and the underlying pages must still be + // valid at that point). let region_range = region.params.range; + for dma_mapper in &mut self.dma_mappers { + dma_mapper.unmap_dma(region_range); + } + for partition in &mut self.partitions { partition.unmap_region(region_range); } @@ -562,6 +809,78 @@ impl RegionManagerClient { } } +/// Client for registering DMA mappers with the region manager. +/// +/// This is the public-facing handle for IOMMU consumers (VFIO, iommufd) +/// to register themselves. It exposes only `add_dma_mapper`, hiding the +/// rest of the region manager API. +#[derive(Clone)] +pub struct DmaMapperClient { + req_send: mesh::Sender, +} + +impl DmaMapperClient { + pub(crate) fn new(region_manager: &RegionManagerClient) -> Self { + Self { + req_send: region_manager.req_send.clone(), + } + } + + /// Register a DMA target to receive sub-mapping events. + /// + /// This may only be called in the same process as the region manager. + /// + /// If `needs_va` is `true`, the region manager will maintain a `VaMapper` + /// and pass a host VA to [`DmaTarget::map_dma`] for each sub-mapping. + /// Use this for backends that program the IOMMU via host VAs (VFIO type1). + /// + /// If `needs_va` is `false`, no `VaMapper` is created and `host_va` will + /// be `None`. Use this for backends that map from the fd directly (iommufd). + /// + /// The replay loop maps all existing active sub-mappings into the new + /// consumer. On failure, already-mapped entries are **not** rolled back; + /// the caller must clean up by dropping the [`DmaTarget`] (e.g., closing + /// the VFIO container fd). + /// + /// Returns a [`DmaMapperHandle`] that removes the mapper when dropped. + pub async fn add_dma_mapper( + &self, + target: Arc, + needs_va: bool, + ) -> anyhow::Result { + let id = self + .req_send + .call( + |x| RegionRequest::AddDmaMapper(LocalOnly(x)), + (target, needs_va), + ) + .await + .unwrap()?; + Ok(DmaMapperHandle { + id: Some(id), + req_send: self.req_send.clone(), + }) + } +} + +/// Handle to a registered DMA mapper. +/// +/// Removes the mapper from the region manager on drop, unmapping all +/// active IOMMU entries. +pub struct DmaMapperHandle { + id: Option, + req_send: mesh::Sender, +} + +impl Drop for DmaMapperHandle { + fn drop(&mut self) { + if let Some(id) = self.id { + self.req_send + .send(RegionRequest::RemoveDmaMapper(LocalOnly(id))); + } + } +} + /// A handle to a region. /// /// Removes the region on drop. @@ -648,14 +967,62 @@ impl Drop for RegionHandle { mod tests { use super::MapParams; use super::RegionManagerTask; + use crate::mapping_manager::Mappable; use crate::mapping_manager::MappingManager; use crate::region_manager::AddRegionError; + use crate::region_manager::DmaTarget; use crate::region_manager::RegionId; + use crate::region_manager::RegionMappingParams; use crate::region_manager::RegionParams; use memory_range::MemoryRange; use pal_async::async_test; use pal_async::task::Spawn; + use parking_lot::Mutex; use std::ops::Range; + use std::sync::Arc; + + /// Records map/unmap calls for test assertions. + #[derive(Default)] + struct RecordingDmaTarget { + events: Mutex>, + } + + #[derive(Debug, Clone, PartialEq, Eq)] + enum DmaEvent { + Map(MemoryRange), + Unmap(MemoryRange), + } + + impl DmaTarget for RecordingDmaTarget { + unsafe fn map_dma( + &self, + range: MemoryRange, + _host_va: Option<*const u8>, + _mappable: &Mappable, + _file_offset: u64, + ) -> anyhow::Result<()> { + self.events.lock().push(DmaEvent::Map(range)); + Ok(()) + } + + fn unmap_dma(&self, range: MemoryRange) -> anyhow::Result<()> { + self.events.lock().push(DmaEvent::Unmap(range)); + Ok(()) + } + } + + impl RecordingDmaTarget { + fn take_events(&self) -> Vec { + std::mem::take(&mut self.events.lock()) + } + } + + /// Create a dummy Mappable for tests (cross-platform). + fn test_mappable() -> Mappable { + sparse_mmap::alloc_shared_memory(0x10000, "test-dma") + .unwrap() + .into() + } #[async_test] async fn test_region_overlap(spawn: impl Spawn) { @@ -712,4 +1079,167 @@ mod tests { let _low = task.add(0, 0x1000..0x8000).await.unwrap(); } + + /// Helper that wraps RegionManagerTask for DMA tests. + struct DmaTestTask { + task: RegionManagerTask, + mappable: Mappable, + } + + impl DmaTestTask { + fn new(spawn: impl Spawn) -> Self { + let mm = MappingManager::new(spawn, 0x200000, false); + Self { + task: RegionManagerTask::new(mm.client().clone()), + mappable: test_mappable(), + } + } + + async fn add_region(&mut self, range: Range) -> RegionId { + let id = self + .task + .add_region(RegionParams { + priority: 0, + name: format!("{range:x?}"), + range: MemoryRange::new(range), + dma_target: false, + }) + .unwrap(); + self.task + .map_region( + id, + MapParams { + executable: true, + writable: true, + prefetch: false, + }, + ) + .await; + id + } + + async fn add_mapping(&mut self, id: RegionId, range_in_region: Range) { + self.task + .add_mapping( + id, + RegionMappingParams { + range_in_region: MemoryRange::new(range_in_region), + mappable: self.mappable.clone(), + file_offset: 0, + writable: true, + }, + ) + .await; + } + } + + #[async_test] + async fn test_dma_replay_on_registration(spawn: impl Spawn) { + let mut t = DmaTestTask::new(&spawn); + let r = t.add_region(0x0..0x10000).await; + t.add_mapping(r, 0x0..0x4000).await; + t.add_mapping(r, 0x8000..0xC000).await; + + // Register a DMA mapper — it should replay the two active mappings. + let target = Arc::new(RecordingDmaTarget::default()); + let id = t.task.add_dma_mapper(target.clone(), false).await.unwrap(); + + assert_eq!( + target.take_events(), + vec![ + DmaEvent::Map(MemoryRange::new(0x0..0x4000)), + DmaEvent::Map(MemoryRange::new(0x8000..0xC000)), + ] + ); + + // Clean up. + t.task.remove_dma_mapper(id); + } + + #[async_test] + async fn test_dma_live_map_unmap(spawn: impl Spawn) { + let mut t = DmaTestTask::new(&spawn); + let r = t.add_region(0x0..0x10000).await; + + let target = Arc::new(RecordingDmaTarget::default()); + let _id = t.task.add_dma_mapper(target.clone(), false).await.unwrap(); + target.take_events(); // discard empty replay + + // Adding a mapping to an active region should notify the DMA mapper. + t.add_mapping(r, 0x0..0x4000).await; + assert_eq!( + target.take_events(), + vec![DmaEvent::Map(MemoryRange::new(0x0..0x4000))] + ); + + // Removing the mapping should unmap it. + t.task + .remove_mappings(r, MemoryRange::new(0x0..0x4000)) + .await; + assert_eq!( + target.take_events(), + vec![DmaEvent::Unmap(MemoryRange::new(0x0..0x4000))] + ); + } + + #[async_test] + async fn test_dma_disable_region_unmaps(spawn: impl Spawn) { + let mut t = DmaTestTask::new(&spawn); + let r = t.add_region(0x0..0x10000).await; + t.add_mapping(r, 0x0..0x4000).await; + t.add_mapping(r, 0x8000..0xC000).await; + + let target = Arc::new(RecordingDmaTarget::default()); + let _id = t.task.add_dma_mapper(target.clone(), false).await.unwrap(); + target.take_events(); // discard replay + + // Disabling the region should unmap the entire region range. + t.task.unmap_region(r, false).await; + assert_eq!( + target.take_events(), + vec![DmaEvent::Unmap(MemoryRange::new(0x0..0x10000))] + ); + } + + #[async_test] + async fn test_dma_remove_mapper_unmaps_all(spawn: impl Spawn) { + let mut t = DmaTestTask::new(&spawn); + let r = t.add_region(0x0..0x10000).await; + t.add_mapping(r, 0x0..0x4000).await; + t.add_mapping(r, 0x8000..0xC000).await; + + let target = Arc::new(RecordingDmaTarget::default()); + let id = t.task.add_dma_mapper(target.clone(), false).await.unwrap(); + target.take_events(); // discard replay + + // Removing the mapper should unmap each active sub-mapping. + t.task.remove_dma_mapper(id); + assert_eq!( + target.take_events(), + vec![ + DmaEvent::Unmap(MemoryRange::new(0x0..0x4000)), + DmaEvent::Unmap(MemoryRange::new(0x8000..0xC000)), + ] + ); + } + + #[async_test] + async fn test_dma_inactive_region_no_notifications(spawn: impl Spawn) { + let mut t = DmaTestTask::new(&spawn); + let r = t.add_region(0x0..0x10000).await; + t.add_mapping(r, 0x0..0x4000).await; + + // Disable the region before registering the mapper. + t.task.unmap_region(r, false).await; + + let target = Arc::new(RecordingDmaTarget::default()); + let _id = t.task.add_dma_mapper(target.clone(), false).await.unwrap(); + + // No replay for inactive regions. + assert_eq!(target.take_events(), vec![]); + + // Adding a mapping while inactive should also not notify. + t.add_mapping(r, 0x8000..0xC000).await; + assert_eq!(target.take_events(), vec![]); + } } diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 0535edb655..615740c8d1 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -1798,7 +1798,7 @@ impl InitializedVm { let vfio_inspect = { let vfio_resolver = vfio_assigned_device::resolver::VfioDeviceResolver::new( driver_source.builder().build("vfio-container-mgr"), - gm.clone(), + memory_manager.dma_mapper_client(), ); let handle = vfio_resolver.inspect_handle(); resolver.add_async_resolver::< diff --git a/vm/devices/pci/vfio_assigned_device/Cargo.toml b/vm/devices/pci/vfio_assigned_device/Cargo.toml index 7e7587eea8..e05b52c754 100644 --- a/vm/devices/pci/vfio_assigned_device/Cargo.toml +++ b/vm/devices/pci/vfio_assigned_device/Cargo.toml @@ -11,6 +11,7 @@ chipset_device.workspace = true guestmem.workspace = true inspect.workspace = true memory_range.workspace = true +membacking.workspace = true mesh.workspace = true pal_async.workspace = true pci_core.workspace = true diff --git a/vm/devices/pci/vfio_assigned_device/src/lib.rs b/vm/devices/pci/vfio_assigned_device/src/lib.rs index dc819e72d7..77dc5200f7 100644 --- a/vm/devices/pci/vfio_assigned_device/src/lib.rs +++ b/vm/devices/pci/vfio_assigned_device/src/lib.rs @@ -13,7 +13,6 @@ //! fatal. #![cfg(target_os = "linux")] -#![forbid(unsafe_code)] pub mod manager; pub mod resolver; diff --git a/vm/devices/pci/vfio_assigned_device/src/manager.rs b/vm/devices/pci/vfio_assigned_device/src/manager.rs index 1432bcc460..e80d48f8cb 100644 --- a/vm/devices/pci/vfio_assigned_device/src/manager.rs +++ b/vm/devices/pci/vfio_assigned_device/src/manager.rs @@ -7,15 +7,52 @@ //! tables) for every assigned device, this module manages a pool of containers //! and reuses them across devices whose IOMMU groups are compatible. +// UNSAFETY: Implementing unsafe DmaTarget::map_dma for VFIO type1 IOMMU. +#![expect(unsafe_code)] + use anyhow::Context as _; -use guestmem::GuestMemory; use inspect::{Inspect, InspectMut}; +use membacking::DmaMapperClient; use mesh::rpc::FailableRpc; use mesh::rpc::RpcSend as _; use std::collections::HashMap; use std::fs::File; use std::sync::Arc; +/// Implements [`membacking::DmaTarget`] for VFIO type1 IOMMU containers. +/// +/// Translates sub-mapping events from the region manager into VFIO +/// `map_dma`/`unmap_dma` ioctls. The host VA needed for `pin_user_pages` +/// is provided by the region manager's `DmaMapper` wrapper. +struct VfioType1DmaTarget { + container: Arc, +} + +impl membacking::DmaTarget for VfioType1DmaTarget { + unsafe fn map_dma( + &self, + range: memory_range::MemoryRange, + host_va: Option<*const u8>, + _mappable: &membacking::Mappable, + _file_offset: u64, + ) -> anyhow::Result<()> { + let vaddr = host_va.expect("VFIO type1 requires host VA (registered with needs_va=true)"); + // SAFETY: The caller (DmaMapper in membacking) guarantees that the + // host VA is backed and stable via ensure_mapped + VaMapper lifetime. + unsafe { + self.container + .map_dma(range.start(), vaddr, range.len()) + .context("VFIO DMA map failed") + } + } + + fn unmap_dma(&self, range: memory_range::MemoryRange) -> anyhow::Result<()> { + self.container + .unmap_dma(range.start(), range.len()) + .context("VFIO DMA unmap failed") + } +} + /// RPC messages for the container manager task. enum VfioManagerRpc { /// Prepare a container and group for a device, creating or reusing @@ -69,6 +106,9 @@ impl VfioDeviceBinding { struct ContainerEntry { id: u64, container: Arc, + /// Handle to the DMA mapper registration — removes the mapper from + /// the region manager when dropped, unmapping all IOMMU entries. + _dma_handle: membacking::DmaMapperHandle, } /// Manages VFIO containers and groups, sharing containers across devices. @@ -90,11 +130,9 @@ pub(crate) struct VfioContainerManager { /// Next container ID to assign. #[inspect(skip)] next_container_id: u64, - /// Guest memory handle for DMA mapping. - guest_memory: GuestMemory, - /// Cached guest memory regions + base VA, fetched lazily. + /// Client for registering VFIO containers as DMA mappers. #[inspect(skip)] - dma_info: Option, + dma_mapper_client: DmaMapperClient, #[inspect(skip)] recv: mesh::Receiver, } @@ -140,23 +178,16 @@ struct GroupEntry { container_id: u64, } -struct DmaInfo { - regions: Vec, - base_va: u64, - va_size: u64, -} - impl VfioContainerManager { /// Create a new container manager. - pub fn new(guest_memory: GuestMemory) -> Self { + pub fn new(dma_mapper_client: DmaMapperClient) -> Self { Self { containers: Vec::new(), groups: HashMap::new(), devices: Vec::new(), next_device_id: 0, next_container_id: 0, - guest_memory, - dma_info: None, + dma_mapper_client, recv: mesh::Receiver::new(), } } @@ -365,8 +396,9 @@ impl VfioContainerManager { .map(|c| &c.container) } - /// Create a new container, set IOMMU type, map guest RAM, and attach the - /// group. Returns the container ID. + /// Create a new container, set IOMMU type, register with the region + /// manager for dynamic DMA mapping, and attach the group. Returns the + /// container ID. async fn create_container_for_group( &mut self, group: &vfio_sys::Group, @@ -383,35 +415,21 @@ impl VfioContainerManager { .set_iommu(vfio_sys::IommuType::Type1v2) .context("failed to set VFIO IOMMU type to Type1v2 (IOMMU required)")?; - // Lazily fetch guest memory regions on first use. - if self.dma_info.is_none() { - self.dma_info = Some(self.fetch_dma_info().await?); - } - let dma_info = self.dma_info.as_ref().unwrap(); - - // Identity-map guest RAM (IOVA == GPA) for device DMA access. - for region in &dma_info.regions { - let gpa = region.guest_address; - let size = region.size; - let gpa_end = gpa - .checked_add(size) - .context("guest memory region overflows u64")?; - anyhow::ensure!( - gpa_end <= dma_info.va_size, - "guest memory region {:#x}..{:#x} exceeds mapping size {:#x}", - gpa, - gpa_end, - dma_info.va_size - ); - let vaddr = dma_info.base_va + gpa; - container.map_dma(gpa, vaddr, size).with_context(|| { - format!( - "failed to map DMA for guest memory region {:#x}..{:#x}", - gpa, gpa_end - ) - })?; - tracing::debug!(gpa, size, vaddr, "mapped guest RAM for VFIO DMA"); - } + let container = Arc::new(container); + + let dma_target: Arc = Arc::new(VfioType1DmaTarget { + container: container.clone(), + }); + + // Register as a DMA mapper — the region manager will create a + // VaMapper internally (since needs_va is true) and replay all + // existing active sub-mappings (guest RAM + any active device + // BARs) into this container's IOMMU. + let dma_handle = self + .dma_mapper_client + .add_dma_mapper(dma_target, true) + .await + .context("failed to register VFIO container with region manager")?; tracing::info!( pci_id, @@ -420,37 +438,16 @@ impl VfioContainerManager { "created new VFIO container" ); - let container = Arc::new(container); let id = self.next_container_id; self.next_container_id += 1; - self.containers.push(ContainerEntry { id, container }); + self.containers.push(ContainerEntry { + id, + container, + _dma_handle: dma_handle, + }); Ok(id) } - async fn fetch_dma_info(&self) -> anyhow::Result { - let sharing = self - .guest_memory - .sharing() - .context("VFIO requires shareable guest memory")?; - - let regions = sharing - .get_regions() - .await - .map_err(|e| anyhow::anyhow!(e)) - .context("failed to get shareable guest memory regions")?; - - let (base_va, va_size) = self - .guest_memory - .full_mapping() - .context("VFIO DMA mapping requires linearly mapped guest memory")?; - - Ok(DmaInfo { - regions, - base_va: base_va as u64, - va_size: va_size as u64, - }) - } - pub(crate) fn client(&mut self) -> VfioManagerClient { VfioManagerClient { sender: self.recv.sender(), diff --git a/vm/devices/pci/vfio_assigned_device/src/resolver.rs b/vm/devices/pci/vfio_assigned_device/src/resolver.rs index 49965919ee..6c09dac7b2 100644 --- a/vm/devices/pci/vfio_assigned_device/src/resolver.rs +++ b/vm/devices/pci/vfio_assigned_device/src/resolver.rs @@ -8,6 +8,7 @@ use crate::manager::VfioContainerManager; use crate::manager::VfioManagerClient; use anyhow::Context as _; use async_trait::async_trait; +use membacking::DmaMapperClient; use pci_resources::ResolvePciDeviceHandleParams; use pci_resources::ResolvedPciDevice; use vfio_assigned_device_resources::VfioDeviceHandle; @@ -27,10 +28,10 @@ pub struct VfioDeviceResolver { impl VfioDeviceResolver { /// Create a new resolver, spawning the container manager task. /// - /// The manager lazily initializes DMA mappings on first use, so creating - /// this is cheap for VMs that have no VFIO devices. - pub fn new(spawner: impl pal_async::task::Spawn, guest_memory: guestmem::GuestMemory) -> Self { - let mut manager = VfioContainerManager::new(guest_memory); + /// The manager registers each new VFIO container with the region manager + /// so that DMA mappings are kept in sync with the VM's memory map. + pub fn new(spawner: impl pal_async::task::Spawn, dma_mapper_client: DmaMapperClient) -> Self { + let mut manager = VfioContainerManager::new(dma_mapper_client); let client = manager.client(); let task = spawner.spawn("vfio-container-mgr", manager.run()); Self { diff --git a/vm/devices/user_driver/vfio_sys/src/lib.rs b/vm/devices/user_driver/vfio_sys/src/lib.rs index 1a70ba3038..fe1579d5db 100644 --- a/vm/devices/user_driver/vfio_sys/src/lib.rs +++ b/vm/devices/user_driver/vfio_sys/src/lib.rs @@ -36,8 +36,20 @@ use vfio_bindings::bindings::vfio::vfio_region_sparse_mmap_area; /// Returns the host page size. pub fn host_page_size() -> u64 { - // SAFETY: sysconf(_SC_PAGESIZE) is always safe and always succeeds on Linux. - unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 } + use std::sync::atomic::{AtomicU64, Ordering::Relaxed}; + static PAGE_SIZE: AtomicU64 = const { AtomicU64::new(0) }; + + let page_size = PAGE_SIZE.load(Relaxed); + if page_size == 0 { + // SAFETY: sysconf(_SC_PAGESIZE) is always safe to call on Linux. + let raw = unsafe { libc::sysconf(libc::_SC_PAGESIZE) }; + assert!(raw > 0, "sysconf(_SC_PAGESIZE) failed: {raw}"); + let page_size = raw as u64; + PAGE_SIZE.store(page_size, Relaxed); + page_size + } else { + page_size + } } mod ioctl { @@ -147,16 +159,18 @@ impl Container { /// page-aligned. /// /// Only valid when the container uses a Type1v2 IOMMU. - pub fn map_dma(&self, iova: u64, vaddr: u64, size: u64) -> anyhow::Result<()> { + /// + /// # Safety + /// `vaddr` must point to valid, backed memory for `size` bytes. The + /// memory must not be unmapped while the IOMMU mapping is live (until + /// a corresponding `unmap_dma` call). + pub unsafe fn map_dma(&self, iova: u64, vaddr: *const u8, size: u64) -> anyhow::Result<()> { use vfio_bindings::bindings::vfio::VFIO_DMA_MAP_FLAG_READ; use vfio_bindings::bindings::vfio::VFIO_DMA_MAP_FLAG_WRITE; - // SAFETY: sysconf(_SC_PAGESIZE) is always safe and returns the - // host page size. - let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) }; - anyhow::ensure!(page_size > 0, "sysconf(_SC_PAGESIZE) failed"); - let page_size = page_size as u64; + let page_size = host_page_size(); let page_mask = page_size - 1; + let vaddr = vaddr as u64; anyhow::ensure!( iova & page_mask == 0 && vaddr & page_mask == 0 && size & page_mask == 0, "VFIO DMA mapping requires page-aligned iova ({iova:#x}), vaddr ({vaddr:#x}), and size ({size:#x}), page size {page_size:#x}" @@ -180,7 +194,11 @@ impl Container { /// Unmap a previously mapped IOVA range from the IOMMU. /// - /// `iova` and `size` must match a previous `map_dma` call. + /// For Type1v2, the unmap range must not bisect any previous mapping: + /// if a mapping exists at `iova`, it must start exactly at `iova`, and + /// if a mapping exists at `iova + size - 1`, it must end there. + /// Multiple mappings may be unmapped in one call as long as these + /// boundary conditions hold. Gaps within the range are fine. pub fn unmap_dma(&self, iova: u64, size: u64) -> anyhow::Result<()> { let mut dma_unmap = vfio_bindings::bindings::vfio::vfio_iommu_type1_dma_unmap { argsz: size_of::() as u32,