diff --git a/src/firecracker/src/api_server/request/snapshot.rs b/src/firecracker/src/api_server/request/snapshot.rs index 04c20de6b2d..111078297a9 100644 --- a/src/firecracker/src/api_server/request/snapshot.rs +++ b/src/firecracker/src/api_server/request/snapshot.rs @@ -111,6 +111,7 @@ fn parse_put_snapshot_load(body: &Body) -> Result { resume_vm: snapshot_config.resume_vm, network_overrides: snapshot_config.network_overrides, vsock_override: snapshot_config.vsock_override, + block_delta_dir: snapshot_config.block_delta_dir, }; // Construct the `ParsedRequest` object. diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index 8d87eefc11a..8b383740a3d 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -1220,10 +1220,16 @@ definitions: type: string description: Type of the IO engine used by the device. "Async" is supported on - host kernels newer than 5.10.51. + host kernels newer than 5.10.51. "Overlay" enables copy-on-write + with dirty bitmap tracking for fast snapshots (requires base_path). This field is optional for virtio-block config and should be omitted for vhost-user-block configuration. - enum: ["Sync", "Async"] + enum: ["Sync", "Async", "Overlay"] default: "Sync" + base_path: + type: string + description: + Read-only base image path for overlay mode. Required when io_engine + is "Overlay". The path_on_host field becomes the overlay file path. # VhostUserBlock specific parameters socket: @@ -1572,6 +1578,12 @@ definitions: description: Type of snapshot to create. It is optional and by default, a full snapshot is created. + block_delta_dir: + type: string + description: + Directory for block device delta files. When set, overlay block + devices write delta files (containing only dirty blocks) into this + directory, named {drive_id}.delta. 
NetworkOverride: type: object @@ -1650,6 +1662,12 @@ definitions: for restoring a snapshot with a different socket path than the one used when the snapshot was created. For example, when the original socket path is no longer available or when deploying to a different environment. + block_delta_dir: + type: string + description: + Directory containing block device delta files for cloning. Each + overlay device looks for {drive_id}.delta in this directory and + applies it to a fresh overlay on restore. TokenBucket: diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 9f9b71a30c3..456a61eec32 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -905,6 +905,7 @@ pub(crate) mod tests { file_engine_type: None, socket: None, + base_path: None, }; block_dev_configs diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index b5052bc5aba..96877ed9c16 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -331,6 +331,43 @@ impl DeviceManager { } } + /// Write delta files for all overlay block devices into the given directory. + /// Each delta file is named `{drive_id}.delta`. + /// Returns the first error encountered, if any. 
+ pub fn write_block_deltas( + &self, + delta_dir: &std::path::Path, + ) -> Result<(), crate::devices::virtio::block::virtio::io::delta::DeltaError> { + use crate::devices::virtio::block::device::Block; + use crate::devices::virtio::device::VirtioDeviceType; + + let mut first_error: Option = + None; + + let _: Result<(), Infallible> = + self.mmio_devices + .for_each_virtio_mmio_device(|_, _, device| { + let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); + let mut locked_device = mmio_transport_locked.locked_device(); + if locked_device.device_type() == VirtioDeviceType::Block { + let block = locked_device.as_mut_any().downcast_mut::().unwrap(); + let delta_path = delta_dir.join(format!("{}.delta", block.id())); + if let Err(e) = block.write_delta(&delta_path) { + error!("Failed to write block delta for {}: {:?}", block.id(), e); + if first_error.is_none() { + first_error = Some(e); + } + } + } + Ok(()) + }); + + match first_error { + Some(e) => Err(e), + None => Ok(()), + } + } + /// Mark queue memory dirty for activated VirtIO devices pub fn mark_virtio_queue_memory_dirty(&self, mem: &GuestMemoryMmap) { // Go through MMIO VirtIO devices diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index 46264dd9ff4..0830e7c66bb 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -7,10 +7,13 @@ use event_manager::{EventOps, Events, MutEventSubscriber}; use log::info; use vmm_sys_util::eventfd::EventFd; +use std::path::Path; + use super::BlockError; use super::persist::{BlockConstructorArgs, BlockState}; use super::vhost_user::device::{VhostUserBlock, VhostUserBlockConfig}; use super::virtio::device::{VirtioBlock, VirtioBlockConfig}; +use super::virtio::io::delta; use crate::devices::virtio::ActivateError; use crate::devices::virtio::device::{VirtioDevice, VirtioDeviceType}; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue}; @@ -115,6 
+118,17 @@ impl Block { Self::VhostUser(_) => true, } } + + /// Write a delta file if this block device uses an overlay engine. + pub fn write_delta( + &mut self, + delta_path: &Path, + ) -> Result, delta::DeltaError> { + match self { + Self::Virtio(b) => b.write_delta(delta_path), + Self::VhostUser(_) => Ok(None), + } + } } impl VirtioDevice for Block { diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index 1f8bb2ae88c..99424e6788f 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -104,6 +104,7 @@ impl From for BlockDeviceConfig { file_engine_type: None, socket: Some(value.socket), + base_path: None, } } } @@ -416,6 +417,7 @@ mod tests { file_engine_type: None, socket: Some("sock".to_string()), + base_path: None, }; VhostUserBlockConfig::try_from(&block_config).unwrap(); @@ -431,6 +433,7 @@ mod tests { file_engine_type: Some(FileEngineType::Sync), socket: None, + base_path: None, }; VhostUserBlockConfig::try_from(&block_config).unwrap_err(); @@ -446,6 +449,7 @@ mod tests { file_engine_type: Some(FileEngineType::Sync), socket: Some("sock".to_string()), + base_path: None, }; VhostUserBlockConfig::try_from(&block_config).unwrap_err(); } diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index ec7ec468505..f89913458c6 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -14,7 +14,10 @@ use std::os::linux::fs::MetadataExt; use std::path::PathBuf; use std::sync::Arc; +use std::path::Path; + use block_io::FileEngine; +use block_io::dirty_bitmap::DEFAULT_BLOCK_SIZE; use serde::{Deserialize, Serialize}; use vm_memory::ByteValued; use vmm_sys_util::eventfd::EventFd; @@ -27,7 +30,7 @@ use crate::devices::virtio::block::CacheType; use 
crate::devices::virtio::block::virtio::metrics::{BlockDeviceMetrics, BlockMetricsPerDevice}; use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice, VirtioDeviceType}; use crate::devices::virtio::generated::virtio_blk::{ - VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, + VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, }; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; @@ -41,7 +44,7 @@ use crate::vmm_config::RateLimiterConfig; use crate::vmm_config::drive::BlockDeviceConfig; use crate::vstate::memory::GuestMemoryMmap; -/// The engine file type, either Sync or Async (through io_uring). +/// The engine file type, either Sync, Async (through io_uring), or Overlay (COW). #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Deserialize, Serialize)] pub enum FileEngineType { /// Use an Async engine, based on io_uring. @@ -49,12 +52,15 @@ pub enum FileEngineType { /// Use a Sync engine, based on blocking system calls. #[default] Sync, + /// Use a sync COW overlay engine with dirty bitmap tracking. + Overlay, } /// Helper object for setting up all `Block` fields derived from its backing file. #[derive(Debug)] pub struct DiskProperties { pub file_path: String, + pub base_path: Option, pub file_engine: FileEngine, pub nsectors: u64, pub image_id: [u8; VIRTIO_BLK_ID_BYTES as usize], @@ -101,6 +107,7 @@ impl DiskProperties { Ok(Self { file_path: disk_image_path, + base_path: None, file_engine: FileEngine::from_file(disk_image, file_engine_type) .map_err(VirtioBlockError::FileEngine)?, nsectors: disk_size >> SECTOR_SHIFT, @@ -108,6 +115,157 @@ impl DiskProperties { }) } + /// Create a new overlay file engine with a read-only base and a writable overlay. 
+ pub fn new_overlay( + base_image_path: String, + overlay_path: String, + block_size: u32, + ) -> Result { + let mut base_file = OpenOptions::new() + .read(true) + .open(PathBuf::from(&base_image_path)) + .map_err(|x| VirtioBlockError::BackingFile(x, base_image_path.clone()))?; + + let disk_size = Self::file_size(&base_image_path, &mut base_file)?; + let image_id = Self::build_disk_image_id(&base_file); + + let overlay_file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(PathBuf::from(&overlay_path)) + .map_err(|x| VirtioBlockError::BackingFile(x, overlay_path.clone()))?; + + let overlay_size = overlay_file + .metadata() + .map_err(VirtioBlockError::GetFileMetadata)? + .len(); + + if overlay_size == 0 { + // Fresh overlay — set length to match base. + overlay_file + .set_len(disk_size) + .map_err(|x| VirtioBlockError::BackingFile(x, overlay_path.clone()))?; + } else if overlay_size != disk_size { + return Err(VirtioBlockError::BackingFile( + std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!( + "overlay size ({overlay_size}) does not match base size ({disk_size})" + ), + ), + overlay_path, + )); + } + + let overlay_engine = + block_io::OverlayFileEngine::from_files(base_file, overlay_file, disk_size, block_size, None) + .map_err(|e| VirtioBlockError::FileEngine(block_io::BlockIoError::Overlay(e)))?; + + Ok(Self { + file_path: overlay_path, + base_path: Some(base_image_path), + file_engine: FileEngine::Overlay(overlay_engine), + nsectors: disk_size >> SECTOR_SHIFT, + image_id, + }) + } + + /// Create an overlay file engine restoring from a previously saved bitmap. 
+ pub fn new_overlay_with_bitmap( + base_image_path: String, + overlay_path: String, + block_size: u32, + bitmap: block_io::dirty_bitmap::DirtyBitmap, + ) -> Result { + let mut base_file = OpenOptions::new() + .read(true) + .open(PathBuf::from(&base_image_path)) + .map_err(|x| VirtioBlockError::BackingFile(x, base_image_path.clone()))?; + + let disk_size = Self::file_size(&base_image_path, &mut base_file)?; + let image_id = Self::build_disk_image_id(&base_file); + + let overlay_file = OpenOptions::new() + .read(true) + .write(true) + .open(PathBuf::from(&overlay_path)) + .map_err(|x| VirtioBlockError::BackingFile(x, overlay_path.clone()))?; + + let overlay_engine = block_io::OverlayFileEngine::from_files( + base_file, + overlay_file, + disk_size, + block_size, + Some(bitmap), + ) + .map_err(|e| VirtioBlockError::FileEngine(block_io::BlockIoError::Overlay(e)))?; + + Ok(Self { + file_path: overlay_path, + base_path: Some(base_image_path), + file_engine: FileEngine::Overlay(overlay_engine), + nsectors: disk_size >> SECTOR_SHIFT, + image_id, + }) + } + + /// Create an overlay file engine by applying a delta to a fresh overlay. + /// Used for VM cloning: the delta contains only the dirty blocks from a source VM. + pub fn new_overlay_from_delta( + base_image_path: String, + overlay_path: String, + delta_path: &Path, + ) -> Result { + let mut base_file = OpenOptions::new() + .read(true) + .open(PathBuf::from(&base_image_path)) + .map_err(|x| VirtioBlockError::BackingFile(x, base_image_path.clone()))?; + + let disk_size = Self::file_size(&base_image_path, &mut base_file)?; + let image_id = Self::build_disk_image_id(&base_file); + + // Create a fresh sparse overlay. 
+ let overlay_file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .truncate(true) + .open(PathBuf::from(&overlay_path)) + .map_err(|x| VirtioBlockError::BackingFile(x, overlay_path.clone()))?; + overlay_file + .set_len(disk_size) + .map_err(|x| VirtioBlockError::BackingFile(x, overlay_path.clone()))?; + + let mut overlay_file = overlay_file; + + // Apply the delta to populate the overlay and get the bitmap. + let (bitmap, _stats) = block_io::delta::apply_delta(&mut overlay_file, delta_path) + .map_err(|e| { + VirtioBlockError::BackingFile( + std::io::Error::new(std::io::ErrorKind::Other, format!("{:?}", e)), + delta_path.display().to_string(), + ) + })?; + + let overlay_engine = block_io::OverlayFileEngine::from_files( + base_file, + overlay_file, + disk_size, + bitmap.block_size(), + Some(bitmap), + ) + .map_err(|e| VirtioBlockError::FileEngine(block_io::BlockIoError::Overlay(e)))?; + + Ok(Self { + file_path: overlay_path, + base_path: Some(base_image_path), + file_engine: FileEngine::Overlay(overlay_engine), + nsectors: disk_size >> SECTOR_SHIFT, + image_id, + }) + } + /// Update the path to the file backing the block device pub fn update( &mut self, @@ -189,7 +347,7 @@ pub struct VirtioBlockConfig { /// If set to true, the drive is opened in read-only mode. Otherwise, the /// drive is opened as read-write. pub is_read_only: bool, - /// Path of the backing file on the host + /// Path of the backing file on the host (overlay path when io_engine is Overlay). pub path_on_host: String, /// Rate Limiter for I/O operations. pub rate_limiter: Option, @@ -197,6 +355,9 @@ pub struct VirtioBlockConfig { #[serde(default)] #[serde(rename = "io_engine")] pub file_engine_type: FileEngineType, + /// Read-only base image path for overlay mode. 
+ #[serde(default)] + pub base_path: Option, } impl TryFrom<&BlockDeviceConfig> for VirtioBlockConfig { @@ -204,6 +365,16 @@ impl TryFrom<&BlockDeviceConfig> for VirtioBlockConfig { fn try_from(value: &BlockDeviceConfig) -> Result { if let (Some(path_on_host), None) = (&value.path_on_host, &value.socket) { + let engine_type = value.file_engine_type.unwrap_or_default(); + + // Validate overlay config: base_path required when engine is Overlay. + if engine_type == FileEngineType::Overlay && value.base_path.is_none() { + return Err(VirtioBlockError::Config); + } + if engine_type != FileEngineType::Overlay && value.base_path.is_some() { + return Err(VirtioBlockError::Config); + } + Ok(Self { drive_id: value.drive_id.clone(), partuuid: value.partuuid.clone(), @@ -213,7 +384,8 @@ impl TryFrom<&BlockDeviceConfig> for VirtioBlockConfig { is_read_only: value.is_read_only.unwrap_or(false), path_on_host: path_on_host.clone(), rate_limiter: value.rate_limiter, - file_engine_type: value.file_engine_type.unwrap_or_default(), + file_engine_type: engine_type, + base_path: value.base_path.clone(), }) } else { Err(VirtioBlockError::Config) @@ -235,6 +407,7 @@ impl From for BlockDeviceConfig { file_engine_type: Some(value.file_engine_type), socket: None, + base_path: value.base_path, } } } @@ -271,7 +444,7 @@ macro_rules! unwrap_async_file_engine_or_return { ($file_engine: expr) => { match $file_engine { FileEngine::Async(engine) => engine, - FileEngine::Sync(_) => { + FileEngine::Sync(_) | FileEngine::Overlay(_) => { error!("The block device doesn't use an async IO engine"); return; } @@ -284,11 +457,20 @@ impl VirtioBlock { /// /// The given file must be seekable and sizable. 
pub fn new(config: VirtioBlockConfig) -> Result { - let disk_properties = DiskProperties::new( - config.path_on_host, - config.is_read_only, - config.file_engine_type, - )?; + let disk_properties = if config.file_engine_type == FileEngineType::Overlay { + let base_path = config + .base_path + .as_ref() + .ok_or(VirtioBlockError::Config)? + .clone(); + DiskProperties::new_overlay(base_path, config.path_on_host.clone(), DEFAULT_BLOCK_SIZE)? + } else { + DiskProperties::new( + config.path_on_host.clone(), + config.is_read_only, + config.file_engine_type, + )? + }; let rate_limiter = config .rate_limiter @@ -307,6 +489,10 @@ impl VirtioBlock { avail_features |= 1u64 << VIRTIO_BLK_F_RO; }; + if config.file_engine_type == FileEngineType::Overlay { + avail_features |= 1u64 << VIRTIO_BLK_F_DISCARD; + } + let queue_evts = [EventFd::new(libc::EFD_NONBLOCK).map_err(VirtioBlockError::EventFd)?]; let queues = BLOCK_QUEUE_SIZES.iter().map(|&s| Queue::new(s)).collect(); @@ -350,6 +536,7 @@ impl VirtioBlock { cache_type: self.cache_type, rate_limiter: rl.into_option(), file_engine_type: self.file_engine_type(), + base_path: None, } } @@ -534,7 +721,18 @@ impl VirtioBlock { } /// Update the backing file and the config space of the block device. + /// Not supported for overlay devices — the bitmap would become inconsistent + /// with the new file, leading to silent data corruption. 
pub fn update_disk_image(&mut self, disk_image_path: String) -> Result<(), VirtioBlockError> { + if matches!(self.disk.file_engine, FileEngine::Overlay(_)) { + return Err(VirtioBlockError::BackingFile( + std::io::Error::new( + std::io::ErrorKind::Unsupported, + "hot-update is not supported for overlay block devices", + ), + disk_image_path, + )); + } self.disk.update(disk_image_path, self.read_only)?; self.config_space.capacity = self.disk.nsectors.to_le(); // virtio_block_config_space(); @@ -559,6 +757,7 @@ impl VirtioBlock { match self.disk.file_engine { FileEngine::Sync(_) => FileEngineType::Sync, FileEngine::Async(_) => FileEngineType::Async, + FileEngine::Overlay(_) => FileEngineType::Overlay, } } @@ -579,6 +778,20 @@ impl VirtioBlock { self.process_async_completion_queue(); } } + + /// Write a delta file for this device if it uses an overlay engine. + /// Returns `Ok(Some(stats))` if a delta was written, `Ok(None)` if not an overlay device. + pub fn write_delta( + &mut self, + delta_path: &Path, + ) -> Result, block_io::delta::DeltaError> { + if let FileEngine::Overlay(ref mut engine) = self.disk.file_engine { + let stats = engine.write_delta(delta_path)?; + Ok(Some(stats)) + } else { + Ok(None) + } + } } impl VirtioDevice for VirtioBlock { @@ -728,6 +941,7 @@ mod tests { file_engine_type: Default::default(), socket: None, + base_path: None, }; VirtioBlockConfig::try_from(&block_config).unwrap(); @@ -743,6 +957,7 @@ mod tests { file_engine_type: Default::default(), socket: Some("sock".to_string()), + base_path: None, }; VirtioBlockConfig::try_from(&block_config).unwrap_err(); @@ -758,6 +973,7 @@ mod tests { file_engine_type: Default::default(), socket: Some("sock".to_string()), + base_path: None, }; VirtioBlockConfig::try_from(&block_config).unwrap_err(); } diff --git a/src/vmm/src/devices/virtio/block/virtio/io/delta.rs b/src/vmm/src/devices/virtio/block/virtio/io/delta.rs new file mode 100644 index 00000000000..c934f5a8de0 --- /dev/null +++ 
b/src/vmm/src/devices/virtio/block/virtio/io/delta.rs @@ -0,0 +1,438 @@ +// Copyright 2026 Superserve AI. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Delta file format for efficient snapshot persistence of overlay block devices. +//! +//! A delta file captures only the dirty blocks from an overlay, enabling O(dirty) +//! snapshot sizes instead of O(disk). The format includes CRC64 checksums for +//! integrity validation. +//! +//! Format: +//! ```text +//! [Header: 32 bytes] +//! magic: u64 = 0x4F564C5944454C54 ("OVLYDELT") +//! version: u32 = 1 +//! block_size: u32 +//! total_blocks: u64 +//! dirty_count: u64 +//! [Bitmap section] +//! bitmap_len: u32 +//! bitmap_data: [u8; bitmap_len] +//! bitmap_crc: u64 +//! [Data section] +//! For each dirty block (in index order): +//! block_data: [u8; block_size] +//! data_crc: u64 +//! ``` + +use std::fs::File; +use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write}; +use std::path::Path; +use std::time::Instant; + +use crc64::crc64; + +use super::dirty_bitmap::DirtyBitmap; + +const DELTA_MAGIC: u64 = 0x4F56_4C59_4445_4C54; // "OVLYDELT" +const DELTA_VERSION: u32 = 1; + +/// Statistics from a delta write or apply operation. 
+#[derive(Debug)] +pub struct DeltaStats { + pub dirty_blocks: u64, + pub bytes_written: u64, + pub duration_us: u64, +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum DeltaError { + /// I/O error: {0} + Io(std::io::Error), + /// Invalid magic number: expected 0x{expected:016X}, got 0x{actual:016X} + InvalidMagic { expected: u64, actual: u64 }, + /// Unsupported delta version: {0} + UnsupportedVersion(u32), + /// Bitmap CRC mismatch: expected 0x{expected:016X}, got 0x{computed:016X} + BitmapCrcMismatch { expected: u64, computed: u64 }, + /// Data CRC mismatch: expected 0x{expected:016X}, got 0x{computed:016X} + DataCrcMismatch { expected: u64, computed: u64 }, + /// Dirty count mismatch: header says {header}, bitmap has {bitmap} + DirtyCountMismatch { header: u64, bitmap: u64 }, + /// Bitmap deserialization failed: {0} + Bitmap(super::dirty_bitmap::DirtyBitmapError), + /// Delta too large: {dirty_count} dirty blocks * {block_size} bytes exceeds limit + TooLarge { dirty_count: u64, block_size: u32 }, +} + +impl From for DeltaError { + fn from(e: std::io::Error) -> Self { + DeltaError::Io(e) + } +} + +/// Write a delta file containing only the dirty blocks from the overlay. +pub fn write_delta( + overlay: &mut File, + bitmap: &DirtyBitmap, + delta_path: &Path, +) -> Result { + let start = Instant::now(); + let dirty_count = bitmap.dirty_count(); + let block_size = bitmap.block_size(); + let total_blocks = bitmap.total_blocks(); + + let delta_file = File::create(delta_path)?; + let mut writer = BufWriter::new(delta_file); + + // Write header. + writer.write_all(&DELTA_MAGIC.to_le_bytes())?; + writer.write_all(&DELTA_VERSION.to_le_bytes())?; + writer.write_all(&block_size.to_le_bytes())?; + writer.write_all(&total_blocks.to_le_bytes())?; + writer.write_all(&dirty_count.to_le_bytes())?; + + // Write bitmap section. 
+ let bitmap_bytes = bitmap.serialize(); + let bitmap_len = bitmap_bytes.len() as u32; + writer.write_all(&bitmap_len.to_le_bytes())?; + writer.write_all(&bitmap_bytes)?; + let bitmap_crc = crc64(0, &bitmap_bytes); + writer.write_all(&bitmap_crc.to_le_bytes())?; + + // Write data section: read each dirty block from overlay and write it. + let mut data_crc: u64 = 0; + let mut block_buf = vec![0u8; block_size as usize]; + let mut bytes_written: u64 = 0; + + for block_idx in bitmap.iter_dirty() { + let offset = block_idx * u64::from(block_size); + overlay.seek(SeekFrom::Start(offset))?; + overlay.read_exact(&mut block_buf)?; + + data_crc = crc64(data_crc, &block_buf); + writer.write_all(&block_buf)?; + bytes_written += u64::from(block_size); + } + + writer.write_all(&data_crc.to_le_bytes())?; + writer.flush()?; + writer.get_ref().sync_all()?; + + Ok(DeltaStats { + dirty_blocks: dirty_count, + bytes_written, + duration_us: start.elapsed().as_micros() as u64, + }) +} + +/// Apply a delta file to an overlay, restoring the dirty blocks and bitmap. +pub fn apply_delta( + overlay: &mut File, + delta_path: &Path, +) -> Result<(DirtyBitmap, DeltaStats), DeltaError> { + let start = Instant::now(); + + let delta_file = File::open(delta_path)?; + let mut reader = BufReader::new(delta_file); + + // Read and validate header. + let magic = read_u64(&mut reader)?; + if magic != DELTA_MAGIC { + return Err(DeltaError::InvalidMagic { + expected: DELTA_MAGIC, + actual: magic, + }); + } + + let version = read_u32(&mut reader)?; + if version != DELTA_VERSION { + return Err(DeltaError::UnsupportedVersion(version)); + } + + let block_size = read_u32(&mut reader)?; + let total_blocks = read_u64(&mut reader)?; + let dirty_count = read_u64(&mut reader)?; + + // Bound check: prevent OOM from malicious delta files. 
+ let max_data_size = total_blocks * u64::from(block_size); + let delta_data_size = dirty_count * u64::from(block_size); + if delta_data_size > max_data_size { + return Err(DeltaError::TooLarge { + dirty_count, + block_size, + }); + } + + // Read bitmap section. + let bitmap_len = read_u32(&mut reader)? as usize; + let mut bitmap_bytes = vec![0u8; bitmap_len]; + reader.read_exact(&mut bitmap_bytes)?; + let stored_bitmap_crc = read_u64(&mut reader)?; + + // Validate bitmap CRC. + let computed_bitmap_crc = crc64(0, &bitmap_bytes); + if stored_bitmap_crc != computed_bitmap_crc { + return Err(DeltaError::BitmapCrcMismatch { + expected: stored_bitmap_crc, + computed: computed_bitmap_crc, + }); + } + + // Deserialize bitmap. + let bitmap = + DirtyBitmap::deserialize(&bitmap_bytes, block_size, total_blocks).map_err(DeltaError::Bitmap)?; + + // Validate dirty count matches bitmap. + if bitmap.dirty_count() != dirty_count { + return Err(DeltaError::DirtyCountMismatch { + header: dirty_count, + bitmap: bitmap.dirty_count(), + }); + } + + // Read data section: write each dirty block to the overlay. + let mut data_crc: u64 = 0; + let mut block_buf = vec![0u8; block_size as usize]; + let mut bytes_written: u64 = 0; + + for block_idx in bitmap.iter_dirty() { + reader.read_exact(&mut block_buf)?; + data_crc = crc64(data_crc, &block_buf); + + let offset = block_idx * u64::from(block_size); + overlay.seek(SeekFrom::Start(offset))?; + overlay.write_all(&block_buf)?; + bytes_written += u64::from(block_size); + } + + // Validate data CRC. 
+ let stored_data_crc = read_u64(&mut reader)?; + if stored_data_crc != data_crc { + return Err(DeltaError::DataCrcMismatch { + expected: stored_data_crc, + computed: data_crc, + }); + } + + overlay.flush()?; + + Ok(( + bitmap, + DeltaStats { + dirty_blocks: dirty_count, + bytes_written, + duration_us: start.elapsed().as_micros() as u64, + }, + )) +} + +fn read_u64(reader: &mut impl Read) -> Result { + let mut buf = [0u8; 8]; + reader.read_exact(&mut buf)?; + Ok(u64::from_le_bytes(buf)) +} + +fn read_u32(reader: &mut impl Read) -> Result { + let mut buf = [0u8; 4]; + reader.read_exact(&mut buf)?; + Ok(u32::from_le_bytes(buf)) +} + +#[cfg(test)] +mod tests { + use vmm_sys_util::tempfile::TempFile; + + use super::*; + use crate::devices::virtio::block::virtio::io::dirty_bitmap::DEFAULT_BLOCK_SIZE; + + fn create_test_overlay(size: u64, dirty_offsets: &[(u64, &[u8])]) -> (File, DirtyBitmap) { + let f = TempFile::new().unwrap().into_file(); + f.set_len(size).unwrap(); + + let mut bitmap = DirtyBitmap::new(size, DEFAULT_BLOCK_SIZE).unwrap(); + let mut file = f; + + for (offset, data) in dirty_offsets { + file.seek(SeekFrom::Start(*offset)).unwrap(); + file.write_all(data).unwrap(); + bitmap.set(*offset, data.len() as u32); + } + file.flush().unwrap(); + + (file, bitmap) + } + + #[test] + fn test_write_and_apply_delta() { + let disk_size: u64 = 4 * 4096; // 4 blocks + + // Create overlay with 2 dirty blocks. + let block0_data = vec![0xAA_u8; 4096]; + let block2_data = vec![0xBB_u8; 4096]; + let (mut overlay, bitmap) = create_test_overlay( + disk_size, + &[(0, &block0_data), (8192, &block2_data)], + ); + + // Write delta. + let delta_file = TempFile::new().unwrap(); + let delta_path = delta_file.as_path().to_path_buf(); + let write_stats = write_delta(&mut overlay, &bitmap, &delta_path).unwrap(); + + assert_eq!(write_stats.dirty_blocks, 2); + assert_eq!(write_stats.bytes_written, 2 * 4096); + + // Create a fresh overlay and apply the delta. 
+ let fresh_overlay = TempFile::new().unwrap().into_file(); + fresh_overlay.set_len(disk_size).unwrap(); + let mut fresh_overlay = fresh_overlay; + + let (restored_bitmap, apply_stats) = + apply_delta(&mut fresh_overlay, &delta_path).unwrap(); + + assert_eq!(apply_stats.dirty_blocks, 2); + assert_eq!(apply_stats.bytes_written, 2 * 4096); + + // Verify bitmap matches. + assert_eq!(restored_bitmap.dirty_count(), 2); + assert!(restored_bitmap.is_set(0)); + assert!(!restored_bitmap.is_set(1)); + assert!(restored_bitmap.is_set(2)); + assert!(!restored_bitmap.is_set(3)); + + // Verify data matches. + let mut buf = vec![0u8; 4096]; + fresh_overlay.seek(SeekFrom::Start(0)).unwrap(); + fresh_overlay.read_exact(&mut buf).unwrap(); + assert_eq!(buf, block0_data); + + fresh_overlay.seek(SeekFrom::Start(8192)).unwrap(); + fresh_overlay.read_exact(&mut buf).unwrap(); + assert_eq!(buf, block2_data); + } + + #[test] + fn test_empty_delta() { + let disk_size: u64 = 4 * 4096; + let (mut overlay, bitmap) = create_test_overlay(disk_size, &[]); + + let delta_file = TempFile::new().unwrap(); + let delta_path = delta_file.as_path().to_path_buf(); + let stats = write_delta(&mut overlay, &bitmap, &delta_path).unwrap(); + + assert_eq!(stats.dirty_blocks, 0); + assert_eq!(stats.bytes_written, 0); + + let fresh = TempFile::new().unwrap().into_file(); + fresh.set_len(disk_size).unwrap(); + let mut fresh = fresh; + let (restored_bitmap, _) = apply_delta(&mut fresh, &delta_path).unwrap(); + assert_eq!(restored_bitmap.dirty_count(), 0); + } + + #[test] + fn test_full_dirty_delta() { + let disk_size: u64 = 2 * 4096; + let data0 = vec![0xCC_u8; 4096]; + let data1 = vec![0xDD_u8; 4096]; + let (mut overlay, bitmap) = + create_test_overlay(disk_size, &[(0, &data0), (4096, &data1)]); + + let delta_file = TempFile::new().unwrap(); + let delta_path = delta_file.as_path().to_path_buf(); + let stats = write_delta(&mut overlay, &bitmap, &delta_path).unwrap(); + + assert_eq!(stats.dirty_blocks, 2); + + let 
fresh = TempFile::new().unwrap().into_file(); + fresh.set_len(disk_size).unwrap(); + let mut fresh = fresh; + let (restored_bitmap, _) = apply_delta(&mut fresh, &delta_path).unwrap(); + assert_eq!(restored_bitmap.dirty_count(), 2); + } + + #[test] + fn test_corrupted_magic() { + let disk_size: u64 = 4096; + let (mut overlay, bitmap) = + create_test_overlay(disk_size, &[(0, &vec![0xFF_u8; 4096])]); + + let delta_file = TempFile::new().unwrap(); + let delta_path = delta_file.as_path().to_path_buf(); + write_delta(&mut overlay, &bitmap, &delta_path).unwrap(); + + // Corrupt the magic bytes. + { + let mut f = std::fs::OpenOptions::new() + .write(true) + .open(&delta_path) + .unwrap(); + f.write_all(&0u64.to_le_bytes()).unwrap(); + } + + let fresh = TempFile::new().unwrap().into_file(); + fresh.set_len(disk_size).unwrap(); + let mut fresh = fresh; + let err = apply_delta(&mut fresh, &delta_path).unwrap_err(); + assert!(matches!(err, DeltaError::InvalidMagic { .. })); + } + + #[test] + fn test_corrupted_bitmap_crc() { + let disk_size: u64 = 4096; + let (mut overlay, bitmap) = + create_test_overlay(disk_size, &[(0, &vec![0xFF_u8; 4096])]); + + let delta_file = TempFile::new().unwrap(); + let delta_path = delta_file.as_path().to_path_buf(); + write_delta(&mut overlay, &bitmap, &delta_path).unwrap(); + + // Corrupt a byte in the bitmap section (after header 32 bytes + bitmap_len 4 bytes). + { + let mut f = std::fs::OpenOptions::new() + .write(true) + .open(&delta_path) + .unwrap(); + f.seek(SeekFrom::Start(36)).unwrap(); + f.write_all(&[0xFF]).unwrap(); + } + + let fresh = TempFile::new().unwrap().into_file(); + fresh.set_len(disk_size).unwrap(); + let mut fresh = fresh; + let err = apply_delta(&mut fresh, &delta_path).unwrap_err(); + assert!(matches!(err, DeltaError::BitmapCrcMismatch { .. 
})); + } + + #[test] + fn test_corrupted_data_crc() { + let disk_size: u64 = 4096; + let (mut overlay, bitmap) = + create_test_overlay(disk_size, &[(0, &vec![0xFF_u8; 4096])]); + + let delta_file = TempFile::new().unwrap(); + let delta_path = delta_file.as_path().to_path_buf(); + write_delta(&mut overlay, &bitmap, &delta_path).unwrap(); + + // Read file to find data section offset, then corrupt a data byte. + let file_len = std::fs::metadata(&delta_path).unwrap().len(); + // Data CRC is at the end (last 8 bytes), data block is just before it. + // Corrupt a byte in the data block. + { + let mut f = std::fs::OpenOptions::new() + .write(true) + .open(&delta_path) + .unwrap(); + // Seek to somewhere in the data block (well past the bitmap section). + f.seek(SeekFrom::End(-100)).unwrap(); + f.write_all(&[0x00]).unwrap(); + } + + let fresh = TempFile::new().unwrap().into_file(); + fresh.set_len(disk_size).unwrap(); + let mut fresh = fresh; + let err = apply_delta(&mut fresh, &delta_path).unwrap_err(); + assert!(matches!(err, DeltaError::DataCrcMismatch { .. })); + } +} diff --git a/src/vmm/src/devices/virtio/block/virtio/io/dirty_bitmap.rs b/src/vmm/src/devices/virtio/block/virtio/io/dirty_bitmap.rs new file mode 100644 index 00000000000..37ace6f561b --- /dev/null +++ b/src/vmm/src/devices/virtio/block/virtio/io/dirty_bitmap.rs @@ -0,0 +1,409 @@ +// Copyright 2026 Superserve AI. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Dirty bitmap for tracking modified blocks in an overlay filesystem. +//! +//! Tracks which blocks have been written to, enabling efficient delta snapshots +//! that only persist modified blocks. + +use bitvec::vec::BitVec; + +/// Default block size for dirty tracking (4KB — matches host page size). +pub const DEFAULT_BLOCK_SIZE: u32 = 4096; + +/// Minimum allowed block size (must be at least one sector). 
+const MIN_BLOCK_SIZE: u32 = 512; + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum DirtyBitmapError { + /// Invalid block size: {0} (must be power of 2, >= 512) + InvalidBlockSize(u32), + /// Bitmap data length mismatch: expected {expected}, got {actual} + LengthMismatch { expected: usize, actual: usize }, + /// Disk size is zero + ZeroDiskSize, + /// Block index {index} out of bounds (total_blocks={total_blocks}) + OutOfBounds { index: u64, total_blocks: u64 }, +} + +#[derive(Debug, Clone)] +pub struct DirtyBitmap { + bits: BitVec, + block_size: u32, + total_blocks: u64, +} + +impl DirtyBitmap { + /// Create a new dirty bitmap for a disk of `disk_size_bytes` with the given block granularity. + pub fn new(disk_size_bytes: u64, block_size: u32) -> Result { + if disk_size_bytes == 0 { + return Err(DirtyBitmapError::ZeroDiskSize); + } + if block_size < MIN_BLOCK_SIZE || !block_size.is_power_of_two() { + return Err(DirtyBitmapError::InvalidBlockSize(block_size)); + } + + let block_size_u64 = u64::from(block_size); + // Ceiling division: number of blocks needed to cover the entire disk. + let total_blocks = disk_size_bytes + .checked_add(block_size_u64 - 1) + .expect("disk size overflow") + / block_size_u64; + + Ok(Self { + bits: BitVec::repeat(false, total_blocks as usize), + block_size, + total_blocks, + }) + } + + /// Mark all blocks covering the byte range `[offset, offset + len)` as dirty. + pub fn set(&mut self, offset: u64, len: u32) { + if len == 0 { + return; + } + let start_block = self.offset_to_block(offset); + let end_offset = offset.saturating_add(u64::from(len)).saturating_sub(1); + let end_block = self.offset_to_block(end_offset); + + let clamped_end = end_block.min(self.total_blocks - 1); + for block in start_block..=clamped_end { + self.bits.set(block as usize, true); + } + } + + /// Mark a specific block as clean. 
+ pub fn unset(&mut self, block_idx: u64) { + if block_idx < self.total_blocks { + self.bits.set(block_idx as usize, false); + } + } + + /// Check whether a specific block index is dirty. + pub fn is_set(&self, block_idx: u64) -> bool { + if block_idx >= self.total_blocks { + return false; + } + self.bits[block_idx as usize] + } + + /// Check whether all blocks in a byte range have the same dirty state. + /// + /// Returns: + /// - `Some(true)` if all blocks in the range are dirty + /// - `Some(false)` if all blocks in the range are clean + /// - `None` if the range contains a mix of dirty and clean blocks + pub fn is_range_uniform(&self, offset: u64, len: u32) -> Option { + if len == 0 { + return Some(false); + } + let start_block = self.offset_to_block(offset); + let end_offset = offset.saturating_add(u64::from(len)).saturating_sub(1); + let end_block = self.offset_to_block(end_offset).min(self.total_blocks - 1); + + let first = self.is_set(start_block); + for block in (start_block + 1)..=end_block { + if self.is_set(block) != first { + return None; + } + } + Some(first) + } + + /// Clear all dirty bits. + pub fn clear(&mut self) { + self.bits.fill(false); + } + + /// Return the number of dirty blocks. + pub fn dirty_count(&self) -> u64 { + self.bits.count_ones() as u64 + } + + /// Iterate over the indices of all dirty blocks. + pub fn iter_dirty(&self) -> impl Iterator + '_ { + self.bits + .iter() + .enumerate() + .filter(|(_, bit)| **bit) + .map(|(idx, _)| idx as u64) + } + + /// Serialize the bitmap to bytes for snapshot persistence. + pub fn serialize(&self) -> Vec { + let raw_slice = self.bits.as_raw_slice(); + let mut bytes = Vec::with_capacity(raw_slice.len() * std::mem::size_of::()); + for &word in raw_slice { + bytes.extend_from_slice(&word.to_le_bytes()); + } + bytes + } + + /// Deserialize a bitmap from bytes, validating the data. 
+ pub fn deserialize( + bytes: &[u8], + block_size: u32, + total_blocks: u64, + ) -> Result { + if total_blocks == 0 { + return Err(DirtyBitmapError::ZeroDiskSize); + } + if block_size < MIN_BLOCK_SIZE || !block_size.is_power_of_two() { + return Err(DirtyBitmapError::InvalidBlockSize(block_size)); + } + + let bits_needed = total_blocks as usize; + let words_needed = (bits_needed + (usize::BITS as usize - 1)) / usize::BITS as usize; + let expected_len = words_needed * std::mem::size_of::(); + + if bytes.len() != expected_len { + return Err(DirtyBitmapError::LengthMismatch { + expected: expected_len, + actual: bytes.len(), + }); + } + + let word_size = std::mem::size_of::(); + let mut raw: Vec = Vec::with_capacity(words_needed); + for chunk in bytes.chunks_exact(word_size) { + let mut word_bytes = [0u8; std::mem::size_of::()]; + word_bytes.copy_from_slice(chunk); + raw.push(usize::from_le_bytes(word_bytes)); + } + + let mut bits = BitVec::from_vec(raw); + bits.truncate(bits_needed); + + Ok(Self { + bits, + block_size, + total_blocks, + }) + } + + /// Get the block size in bytes. + pub fn block_size(&self) -> u32 { + self.block_size + } + + /// Get the total number of blocks. + pub fn total_blocks(&self) -> u64 { + self.total_blocks + } + + /// Convert a byte offset to a block index. + fn offset_to_block(&self, offset: u64) -> u64 { + offset / u64::from(self.block_size) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + const DISK_SIZE: u64 = 1024 * 1024; // 1MB + const BLOCK_SIZE: u32 = 4096; + + #[test] + fn test_new_valid() { + let bm = DirtyBitmap::new(DISK_SIZE, BLOCK_SIZE).unwrap(); + assert_eq!(bm.total_blocks(), 256); // 1MB / 4KB + assert_eq!(bm.block_size(), BLOCK_SIZE); + assert_eq!(bm.dirty_count(), 0); + } + + #[test] + fn test_new_non_aligned_disk_size() { + // Disk size not a multiple of block size — should round up. 
+ let bm = DirtyBitmap::new(4097, 4096).unwrap(); + assert_eq!(bm.total_blocks(), 2); + } + + #[test] + fn test_new_zero_disk_size() { + assert!(matches!( + DirtyBitmap::new(0, BLOCK_SIZE), + Err(DirtyBitmapError::ZeroDiskSize) + )); + } + + #[test] + fn test_new_invalid_block_size() { + // Not power of 2. + assert!(matches!( + DirtyBitmap::new(DISK_SIZE, 1000), + Err(DirtyBitmapError::InvalidBlockSize(1000)) + )); + // Too small. + assert!(matches!( + DirtyBitmap::new(DISK_SIZE, 256), + Err(DirtyBitmapError::InvalidBlockSize(256)) + )); + } + + #[test] + fn test_set_and_is_set() { + let mut bm = DirtyBitmap::new(DISK_SIZE, BLOCK_SIZE).unwrap(); + + // Write at offset 0, length 512 — should dirty block 0. + bm.set(0, 512); + assert!(bm.is_set(0)); + assert!(!bm.is_set(1)); + assert_eq!(bm.dirty_count(), 1); + + // Write spanning two blocks (offset 4000, length 200) — blocks 0 and 1. + bm.set(4000, 200); + assert!(bm.is_set(0)); + assert!(bm.is_set(1)); + assert_eq!(bm.dirty_count(), 2); + } + + #[test] + fn test_set_last_block() { + let mut bm = DirtyBitmap::new(DISK_SIZE, BLOCK_SIZE).unwrap(); + let last_offset = DISK_SIZE - 512; + bm.set(last_offset, 512); + assert!(bm.is_set(bm.total_blocks() - 1)); + assert_eq!(bm.dirty_count(), 1); + } + + #[test] + fn test_set_zero_len() { + let mut bm = DirtyBitmap::new(DISK_SIZE, BLOCK_SIZE).unwrap(); + bm.set(0, 0); + assert_eq!(bm.dirty_count(), 0); + } + + #[test] + fn test_is_set_out_of_bounds() { + let bm = DirtyBitmap::new(DISK_SIZE, BLOCK_SIZE).unwrap(); + assert!(!bm.is_set(bm.total_blocks())); + assert!(!bm.is_set(u64::MAX)); + } + + #[test] + fn test_is_range_uniform() { + let mut bm = DirtyBitmap::new(DISK_SIZE, BLOCK_SIZE).unwrap(); + + // All clean. + assert_eq!(bm.is_range_uniform(0, 8192), Some(false)); + + // Dirty blocks 0 and 1. + bm.set(0, 8192); + assert_eq!(bm.is_range_uniform(0, 8192), Some(true)); + + // Mixed: blocks 0-1 dirty, block 2 clean. 
+ assert_eq!(bm.is_range_uniform(0, 12288), None); + + // Zero length. + assert_eq!(bm.is_range_uniform(0, 0), Some(false)); + } + + #[test] + fn test_clear() { + let mut bm = DirtyBitmap::new(DISK_SIZE, BLOCK_SIZE).unwrap(); + bm.set(0, 4096); + bm.set(8192, 4096); + assert_eq!(bm.dirty_count(), 2); + + bm.clear(); + assert_eq!(bm.dirty_count(), 0); + assert!(!bm.is_set(0)); + assert!(!bm.is_set(2)); + } + + #[test] + fn test_iter_dirty() { + let mut bm = DirtyBitmap::new(DISK_SIZE, BLOCK_SIZE).unwrap(); + bm.set(0, 4096); // block 0 + bm.set(8192, 4096); // block 2 + bm.set(16384, 4096); // block 4 + + let dirty: Vec = bm.iter_dirty().collect(); + assert_eq!(dirty, vec![0, 2, 4]); + } + + #[test] + fn test_serialize_deserialize_roundtrip() { + let mut bm = DirtyBitmap::new(DISK_SIZE, BLOCK_SIZE).unwrap(); + bm.set(0, 4096); + bm.set(8192, 4096); + bm.set(DISK_SIZE - 4096, 4096); + + let serialized = bm.serialize(); + let restored = + DirtyBitmap::deserialize(&serialized, BLOCK_SIZE, bm.total_blocks()).unwrap(); + + assert_eq!(restored.total_blocks(), bm.total_blocks()); + assert_eq!(restored.block_size(), bm.block_size()); + assert_eq!(restored.dirty_count(), bm.dirty_count()); + + let orig_dirty: Vec = bm.iter_dirty().collect(); + let restored_dirty: Vec = restored.iter_dirty().collect(); + assert_eq!(orig_dirty, restored_dirty); + } + + #[test] + fn test_deserialize_invalid_length() { + assert!(matches!( + DirtyBitmap::deserialize(&[0u8; 3], BLOCK_SIZE, 256), + Err(DirtyBitmapError::LengthMismatch { .. 
}) + )); + } + + #[test] + fn test_deserialize_invalid_block_size() { + assert!(matches!( + DirtyBitmap::deserialize(&[], 100, 10), + Err(DirtyBitmapError::InvalidBlockSize(100)) + )); + } + + #[test] + fn test_deserialize_zero_total_blocks() { + assert!(matches!( + DirtyBitmap::deserialize(&[], BLOCK_SIZE, 0), + Err(DirtyBitmapError::ZeroDiskSize) + )); + } + + #[test] + fn test_sector_sized_block() { + let mut bm = DirtyBitmap::new(4096, 512).unwrap(); + assert_eq!(bm.total_blocks(), 8); + + bm.set(512, 512); // block 1 + assert!(!bm.is_set(0)); + assert!(bm.is_set(1)); + assert!(!bm.is_set(2)); + } + + #[test] + fn test_large_disk() { + // 10GB disk with 4KB blocks = 2,621,440 blocks. + let bm = DirtyBitmap::new(10 * 1024 * 1024 * 1024, BLOCK_SIZE).unwrap(); + assert_eq!(bm.total_blocks(), 2_621_440); + assert_eq!(bm.dirty_count(), 0); + } + + #[test] + fn test_serialize_empty_bitmap() { + let bm = DirtyBitmap::new(DISK_SIZE, BLOCK_SIZE).unwrap(); + let serialized = bm.serialize(); + let restored = + DirtyBitmap::deserialize(&serialized, BLOCK_SIZE, bm.total_blocks()).unwrap(); + assert_eq!(restored.dirty_count(), 0); + } + + #[test] + fn test_serialize_full_bitmap() { + let mut bm = DirtyBitmap::new(DISK_SIZE, BLOCK_SIZE).unwrap(); + bm.set(0, DISK_SIZE as u32); + assert_eq!(bm.dirty_count(), bm.total_blocks()); + + let serialized = bm.serialize(); + let restored = + DirtyBitmap::deserialize(&serialized, BLOCK_SIZE, bm.total_blocks()).unwrap(); + assert_eq!(restored.dirty_count(), bm.total_blocks()); + } +} diff --git a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs index b7aa8061d76..d0cc68d9f99 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs @@ -2,12 +2,16 @@ // SPDX-License-Identifier: Apache-2.0 pub mod async_io; +pub mod delta; +pub mod dirty_bitmap; +pub mod overlay_io; pub mod sync_io; use std::fmt::Debug; use std::fs::File; pub 
use self::async_io::{AsyncFileEngine, AsyncIoError}; +pub use self::overlay_io::{OverlayFileEngine, OverlayIoError}; pub use self::sync_io::{SyncFileEngine, SyncIoError}; use crate::devices::virtio::block::virtio::PendingRequest; use crate::devices::virtio::block::virtio::device::FileEngineType; @@ -31,6 +35,8 @@ pub enum BlockIoError { Sync(SyncIoError), /// Async error: {0} Async(AsyncIoError), + /// Overlay error: {0} + Overlay(OverlayIoError), } impl BlockIoError { @@ -54,6 +60,7 @@ pub enum FileEngine { #[allow(unused)] Async(AsyncFileEngine), Sync(SyncFileEngine), + Overlay(OverlayFileEngine), } impl FileEngine { @@ -63,6 +70,11 @@ impl FileEngine { AsyncFileEngine::from_file(file).map_err(BlockIoError::Async)?, )), FileEngineType::Sync => Ok(FileEngine::Sync(SyncFileEngine::from_file(file))), + FileEngineType::Overlay => { + Err(BlockIoError::Overlay( + overlay_io::OverlayIoError::NotConstructibleFromFile, + )) + } } } @@ -70,6 +82,7 @@ impl FileEngine { match self { FileEngine::Async(engine) => engine.update_file(file).map_err(BlockIoError::Async)?, FileEngine::Sync(engine) => engine.update_file(file), + FileEngine::Overlay(engine) => engine.update_overlay(file), }; Ok(()) @@ -80,6 +93,8 @@ impl FileEngine { match self { FileEngine::Async(engine) => engine.file(), FileEngine::Sync(engine) => engine.file(), + // Overlay has two files — return the overlay (writable) file for test compatibility. 
+ FileEngine::Overlay(engine) => engine.overlay_file(), } } @@ -106,6 +121,13 @@ impl FileEngine { error: BlockIoError::Sync(err), }), }, + FileEngine::Overlay(engine) => match engine.read(offset, mem, addr, count) { + Ok(count) => Ok(FileEngineOk::Executed(RequestOk { req, count })), + Err(err) => Err(RequestError { + req, + error: BlockIoError::Overlay(err), + }), + }, } } @@ -132,6 +154,13 @@ impl FileEngine { error: BlockIoError::Sync(err), }), }, + FileEngine::Overlay(engine) => match engine.write(offset, mem, addr, count) { + Ok(count) => Ok(FileEngineOk::Executed(RequestOk { req, count })), + Err(err) => Err(RequestError { + req, + error: BlockIoError::Overlay(err), + }), + }, } } @@ -154,6 +183,21 @@ impl FileEngine { error: BlockIoError::Sync(err), }), }, + FileEngine::Overlay(engine) => match engine.flush() { + Ok(_) => Ok(FileEngineOk::Executed(RequestOk { req, count: 0 })), + Err(err) => Err(RequestError { + req, + error: BlockIoError::Overlay(err), + }), + }, + } + } + + pub fn discard(&mut self, offset: u64, len: u64) -> Result<(), BlockIoError> { + match self { + FileEngine::Overlay(engine) => engine.discard(offset, len).map_err(BlockIoError::Overlay), + // Non-overlay engines don't support discard — this is a no-op. 
+ FileEngine::Sync(_) | FileEngine::Async(_) => Ok(()), } } @@ -161,6 +205,7 @@ impl FileEngine { match self { FileEngine::Async(engine) => engine.drain(discard).map_err(BlockIoError::Async), FileEngine::Sync(_engine) => Ok(()), + FileEngine::Overlay(_engine) => Ok(()), } } @@ -170,6 +215,7 @@ impl FileEngine { engine.drain_and_flush(discard).map_err(BlockIoError::Async) } FileEngine::Sync(engine) => engine.flush().map_err(BlockIoError::Sync), + FileEngine::Overlay(engine) => engine.flush().map_err(BlockIoError::Overlay), } } } diff --git a/src/vmm/src/devices/virtio/block/virtio/io/overlay_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/overlay_io.rs new file mode 100644 index 00000000000..69baab91a5f --- /dev/null +++ b/src/vmm/src/devices/virtio/block/virtio/io/overlay_io.rs @@ -0,0 +1,515 @@ +// Copyright 2026 Superserve AI. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Synchronous copy-on-write overlay file engine. +//! +//! Routes reads between a shared read-only base image and a per-VM sparse overlay +//! file using a dirty bitmap. Writes always go to the overlay. 
+ +use std::fs::File; +use std::io::{Seek, SeekFrom, Write}; + +use vm_memory::{GuestMemoryError, ReadVolatile, WriteVolatile}; + +use super::delta; +use super::dirty_bitmap::{DirtyBitmap, DirtyBitmapError}; +use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryMmap}; + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum OverlayIoError { + /// Base read seek: {0} + BaseSeek(std::io::Error), + /// Base read transfer: {0} + BaseTransfer(GuestMemoryError), + /// Overlay seek: {0} + OverlaySeek(std::io::Error), + /// Overlay read transfer: {0} + OverlayReadTransfer(GuestMemoryError), + /// Overlay write transfer: {0} + OverlayWriteTransfer(GuestMemoryError), + /// Overlay flush: {0} + OverlayFlush(std::io::Error), + /// Overlay sync: {0} + OverlaySync(std::io::Error), + /// Size mismatch: base={base_size}, overlay={overlay_size} + SizeMismatch { base_size: u64, overlay_size: u64 }, + /// Overlay engine cannot be created via from_file — use DiskProperties::new_overlay() + NotConstructibleFromFile, + /// Bitmap error: {0} + Bitmap(DirtyBitmapError), +} + +#[derive(Debug)] +pub struct OverlayFileEngine { + base: File, + overlay: File, + bitmap: DirtyBitmap, +} + +// OverlayFileEngine contains File and DirtyBitmap (Vec-backed), both of which are Send. +// No manual unsafe impl needed — derived automatically. + +impl OverlayFileEngine { + /// Create a new overlay engine from a read-only base file and a writable overlay file. + /// + /// The overlay file must have the same logical size as the base file (sparse is fine). + /// If `bitmap` is `None`, a fresh empty bitmap is created. + pub fn from_files( + base: File, + overlay: File, + disk_size: u64, + block_size: u32, + bitmap: Option, + ) -> Result { + let bitmap = match bitmap { + Some(bm) => bm, + None => DirtyBitmap::new(disk_size, block_size).map_err(OverlayIoError::Bitmap)?, + }; + + Ok(Self { + base, + overlay, + bitmap, + }) + } + + /// Update the overlay file handle. 
+ /// Note: this does NOT reset the bitmap. Callers must ensure the new overlay + /// is consistent with the current bitmap state. Hot-update of overlay devices + /// is rejected at the VirtioBlock level to prevent data corruption. + pub(crate) fn update_overlay(&mut self, overlay: File) { + self.overlay = overlay; + } + + /// Get a reference to the dirty bitmap. + pub fn bitmap(&self) -> &DirtyBitmap { + &self.bitmap + } + + /// Get a reference to the overlay file. + #[cfg(test)] + pub fn overlay_file(&self) -> &File { + &self.overlay + } + + /// Get a mutable reference to the overlay file (for delta export). + pub fn overlay_file_mut(&mut self) -> &mut File { + &mut self.overlay + } + + /// Discard blocks in the overlay: clear bitmap bits and punch holes in the overlay file. + pub fn discard(&mut self, offset: u64, len: u64) -> Result<(), OverlayIoError> { + if len == 0 { + return Ok(()); + } + + // Clear bitmap bits for the discarded range. + let block_size = u64::from(self.bitmap.block_size()); + let start_block = offset / block_size; + let end_offset = offset.saturating_add(len).saturating_sub(1); + let end_block = end_offset / block_size; + let clamped_end = end_block.min(self.bitmap.total_blocks() - 1); + + for block in start_block..=clamped_end { + self.bitmap.unset(block); + } + + // Punch a hole in the overlay file to reclaim host disk space. + #[cfg(target_os = "linux")] + { + use std::os::unix::io::AsRawFd; + let fd = self.overlay.as_raw_fd(); + // SAFETY: fallocate with FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE is safe + // on a valid fd with valid offset/len. + let ret = unsafe { + libc::fallocate( + fd, + libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, + offset as i64, + len as i64, + ) + }; + if ret != 0 { + // Hole punching failure is non-fatal — the bitmap is already cleared, + // so reads will go to base. We just don't reclaim the space. 
+ let _ = std::io::Error::last_os_error(); + } + } + + Ok(()) + } + + /// Write a delta file containing only dirty blocks from this overlay. + pub fn write_delta( + &mut self, + delta_path: &std::path::Path, + ) -> Result { + delta::write_delta(&mut self.overlay, &self.bitmap, delta_path) + } + + /// Read from the appropriate source (base or overlay) based on the dirty bitmap. + pub fn read( + &mut self, + offset: u64, + mem: &GuestMemoryMmap, + addr: GuestAddress, + count: u32, + ) -> Result { + match self.bitmap.is_range_uniform(offset, count) { + Some(false) => { + // All clean — read entirely from base. + self.read_from_base(offset, mem, addr, count) + } + Some(true) => { + // All dirty — read entirely from overlay. + self.read_from_overlay(offset, mem, addr, count) + } + None => { + // Mixed — split into contiguous runs from the same source. + self.read_mixed(offset, mem, addr, count) + } + } + } + + /// Write to the overlay and mark blocks as dirty. + pub fn write( + &mut self, + offset: u64, + mem: &GuestMemoryMmap, + addr: GuestAddress, + count: u32, + ) -> Result { + self.overlay + .seek(SeekFrom::Start(offset)) + .map_err(OverlayIoError::OverlaySeek)?; + mem.get_slice(addr, count as usize) + .and_then(|slice| Ok(self.overlay.write_all_volatile(&slice)?)) + .map_err(OverlayIoError::OverlayWriteTransfer)?; + + self.bitmap.set(offset, count); + Ok(count) + } + + /// Flush the overlay file to disk. Base is read-only and never needs flushing. + pub fn flush(&mut self) -> Result<(), OverlayIoError> { + self.overlay + .flush() + .map_err(OverlayIoError::OverlayFlush)?; + self.overlay + .sync_all() + .map_err(OverlayIoError::OverlaySync) + } + + /// Read a contiguous range from the base file. 
+ fn read_from_base( + &mut self, + offset: u64, + mem: &GuestMemoryMmap, + addr: GuestAddress, + count: u32, + ) -> Result { + self.base + .seek(SeekFrom::Start(offset)) + .map_err(OverlayIoError::BaseSeek)?; + mem.get_slice(addr, count as usize) + .and_then(|mut slice| Ok(self.base.read_exact_volatile(&mut slice)?)) + .map_err(OverlayIoError::BaseTransfer)?; + Ok(count) + } + + /// Read a contiguous range from the overlay file. + fn read_from_overlay( + &mut self, + offset: u64, + mem: &GuestMemoryMmap, + addr: GuestAddress, + count: u32, + ) -> Result { + self.overlay + .seek(SeekFrom::Start(offset)) + .map_err(OverlayIoError::OverlaySeek)?; + mem.get_slice(addr, count as usize) + .and_then(|mut slice| Ok(self.overlay.read_exact_volatile(&mut slice)?)) + .map_err(OverlayIoError::OverlayReadTransfer)?; + Ok(count) + } + + /// Handle a read that spans both dirty and clean blocks. + /// + /// Splits the range into contiguous runs from the same source and reads each + /// run separately. This is the slow path — most reads hit the fast path above. + fn read_mixed( + &mut self, + offset: u64, + mem: &GuestMemoryMmap, + addr: GuestAddress, + count: u32, + ) -> Result { + let block_size = u64::from(self.bitmap.block_size()); + let end_offset = offset + u64::from(count); + + let mut current_offset = offset; + let mut current_addr = addr; + + while current_offset < end_offset { + let current_block = current_offset / block_size; + let is_dirty = self.bitmap.is_set(current_block); + + // Find the end of this contiguous run of same-source blocks. + let mut run_end_block = current_block + 1; + while run_end_block * block_size < end_offset { + if self.bitmap.is_set(run_end_block) != is_dirty { + break; + } + run_end_block += 1; + } + + // Calculate the byte range for this run, clamped to the request bounds. 
+ let run_byte_start = current_offset; + let run_byte_end = (run_end_block * block_size).min(end_offset); + let run_len = (run_byte_end - run_byte_start) as u32; + + if is_dirty { + self.read_from_overlay(run_byte_start, mem, current_addr, run_len)?; + } else { + self.read_from_base(run_byte_start, mem, current_addr, run_len)?; + } + + current_offset = run_byte_end; + current_addr = GuestAddress(current_addr.0 + u64::from(run_len)); + } + + Ok(count) + } +} + +#[cfg(test)] +mod tests { + use vmm_sys_util::tempfile::TempFile; + + use super::*; + use crate::devices::virtio::block::virtio::io::dirty_bitmap::DEFAULT_BLOCK_SIZE; + use crate::vmm_config::machine_config::HugePageConfig; + use crate::vstate::memory; + use crate::vstate::memory::{Bytes, GuestRegionMmapExt}; + + const FILE_LEN: u64 = 16384; // 4 blocks of 4KB + const MEM_LEN: usize = 16384; + + fn create_mem() -> GuestMemoryMmap { + GuestMemoryMmap::from_regions( + memory::anonymous( + [(GuestAddress(0), MEM_LEN)].into_iter(), + true, + HugePageConfig::None, + ) + .unwrap() + .into_iter() + .map(|region| GuestRegionMmapExt::dram_from_mmap_region(region, 0)) + .collect(), + ) + .unwrap() + } + + fn create_base_file(data: &[u8]) -> File { + let f = TempFile::new().unwrap().into_file(); + use std::io::Write; + (&f).write_all(data).unwrap(); + f + } + + fn create_overlay_file(size: u64) -> File { + let f = TempFile::new().unwrap().into_file(); + f.set_len(size).unwrap(); + f + } + + fn create_engine(base_data: &[u8]) -> OverlayFileEngine { + let base = create_base_file(base_data); + let overlay = create_overlay_file(base_data.len() as u64); + OverlayFileEngine::from_files( + base, + overlay, + base_data.len() as u64, + DEFAULT_BLOCK_SIZE, + None, + ) + .unwrap() + } + + #[test] + fn test_read_from_base_only() { + let base_data: Vec = (0..FILE_LEN).map(|i| (i % 251) as u8).collect(); + let mut engine = create_engine(&base_data); + let mem = create_mem(); + + // Read first 512 bytes — should come from base. 
+ engine.read(0, &mem, GuestAddress(0), 512).unwrap(); + + let mut buf = vec![0u8; 512]; + mem.read_slice(&mut buf, GuestAddress(0)).unwrap(); + assert_eq!(buf, &base_data[..512]); + } + + #[test] + fn test_write_then_read_from_overlay() { + let base_data = vec![0xAA_u8; FILE_LEN as usize]; + let mut engine = create_engine(&base_data); + + // Write different data to overlay. + let write_data = vec![0xBB_u8; 512]; + let mem = create_mem(); + mem.write(&write_data, GuestAddress(0)).unwrap(); + engine.write(0, &mem, GuestAddress(0), 512).unwrap(); + + // Read back — should get overlay data, not base. + let mem = create_mem(); + engine.read(0, &mem, GuestAddress(0), 512).unwrap(); + + let mut buf = vec![0u8; 512]; + mem.read_slice(&mut buf, GuestAddress(0)).unwrap(); + assert_eq!(buf, write_data); + } + + #[test] + fn test_base_not_modified_after_write() { + let base_data = vec![0xAA_u8; FILE_LEN as usize]; + let mut engine = create_engine(&base_data); + + // Write to overlay. + let write_data = vec![0xBB_u8; 4096]; + let mem = create_mem(); + mem.write(&write_data, GuestAddress(0)).unwrap(); + engine.write(0, &mem, GuestAddress(0), 4096).unwrap(); + + // Read from base directly — should still be original data. + let mem = create_mem(); + engine.read_from_base(0, &mem, GuestAddress(0), 512).unwrap(); + + let mut buf = vec![0u8; 512]; + mem.read_slice(&mut buf, GuestAddress(0)).unwrap(); + assert_eq!(buf, vec![0xAA_u8; 512]); + } + + #[test] + fn test_mixed_read() { + let base_data = vec![0xAA_u8; FILE_LEN as usize]; + let mut engine = create_engine(&base_data); + + // Write to block 0 only (first 4KB). + let write_data = vec![0xBB_u8; 4096]; + let mem = create_mem(); + mem.write(&write_data, GuestAddress(0)).unwrap(); + engine.write(0, &mem, GuestAddress(0), 4096).unwrap(); + + // Read 8KB spanning block 0 (dirty) and block 1 (clean). 
+ let mem = create_mem(); + engine.read(0, &mem, GuestAddress(0), 8192).unwrap(); + + let mut buf = vec![0u8; 8192]; + mem.read_slice(&mut buf, GuestAddress(0)).unwrap(); + + // First 4KB should be overlay data (0xBB). + assert_eq!(&buf[..4096], &vec![0xBB_u8; 4096]); + // Second 4KB should be base data (0xAA). + assert_eq!(&buf[4096..8192], &vec![0xAA_u8; 4096]); + } + + #[test] + fn test_write_updates_bitmap() { + let base_data = vec![0u8; FILE_LEN as usize]; + let mut engine = create_engine(&base_data); + + assert_eq!(engine.bitmap().dirty_count(), 0); + + let mem = create_mem(); + engine.write(0, &mem, GuestAddress(0), 512).unwrap(); + assert_eq!(engine.bitmap().dirty_count(), 1); + + // Write to a different block. + engine.write(4096, &mem, GuestAddress(0), 512).unwrap(); + assert_eq!(engine.bitmap().dirty_count(), 2); + } + + #[test] + fn test_flush() { + let base_data = vec![0u8; FILE_LEN as usize]; + let mut engine = create_engine(&base_data); + // Flush should succeed even with no writes. + engine.flush().unwrap(); + } + + #[test] + fn test_read_at_offset() { + let base_data: Vec = (0..FILE_LEN).map(|i| (i % 251) as u8).collect(); + let mut engine = create_engine(&base_data); + let mem = create_mem(); + + let offset = 4096u64; + let count = 512u32; + engine + .read(offset, &mem, GuestAddress(0), count) + .unwrap(); + + let mut buf = vec![0u8; count as usize]; + mem.read_slice(&mut buf, GuestAddress(0)).unwrap(); + assert_eq!(buf, &base_data[offset as usize..(offset as usize + count as usize)]); + } + + #[test] + fn test_write_at_offset_then_read() { + let base_data = vec![0xAA_u8; FILE_LEN as usize]; + let mut engine = create_engine(&base_data); + + // Write at offset 4096 (block 1). + let write_data = vec![0xCC_u8; 512]; + let mem = create_mem(); + mem.write(&write_data, GuestAddress(0)).unwrap(); + engine.write(4096, &mem, GuestAddress(0), 512).unwrap(); + + // Read back from same offset. 
+ let mem = create_mem(); + engine.read(4096, &mem, GuestAddress(0), 512).unwrap(); + + let mut buf = vec![0u8; 512]; + mem.read_slice(&mut buf, GuestAddress(0)).unwrap(); + assert_eq!(buf, write_data); + + // Block 0 should still be base data. + let mem = create_mem(); + engine.read(0, &mem, GuestAddress(0), 512).unwrap(); + + let mut buf = vec![0u8; 512]; + mem.read_slice(&mut buf, GuestAddress(0)).unwrap(); + assert_eq!(buf, vec![0xAA_u8; 512]); + } + + #[test] + fn test_overwrite_same_block() { + let base_data = vec![0xAA_u8; FILE_LEN as usize]; + let mut engine = create_engine(&base_data); + + // First write. + let data1 = vec![0xBB_u8; 512]; + let mem = create_mem(); + mem.write(&data1, GuestAddress(0)).unwrap(); + engine.write(0, &mem, GuestAddress(0), 512).unwrap(); + + // Overwrite same location. + let data2 = vec![0xCC_u8; 512]; + let mem = create_mem(); + mem.write(&data2, GuestAddress(0)).unwrap(); + engine.write(0, &mem, GuestAddress(0), 512).unwrap(); + + // Should get the second write. + let mem = create_mem(); + engine.read(0, &mem, GuestAddress(0), 512).unwrap(); + + let mut buf = vec![0u8; 512]; + mem.read_slice(&mut buf, GuestAddress(0)).unwrap(); + assert_eq!(buf, data2); + + // Bitmap should still show 1 dirty block. 
+ assert_eq!(engine.bitmap().dirty_count(), 1); + } +} diff --git a/src/vmm/src/devices/virtio/block/virtio/mod.rs b/src/vmm/src/devices/virtio/block/virtio/mod.rs index 9e97d6d3897..530741cde08 100644 --- a/src/vmm/src/devices/virtio/block/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/mod.rs @@ -5,7 +5,7 @@ pub mod device; mod event_handler; -mod io; +pub mod io; pub mod metrics; pub mod persist; pub mod request; diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index c4288460a56..a9d8a58374b 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -11,6 +11,8 @@ use super::device::DiskProperties; use super::*; use crate::devices::virtio::block::persist::BlockConstructorArgs; use crate::devices::virtio::block::virtio::device::FileEngineType; +use crate::devices::virtio::block::virtio::io::dirty_bitmap::DEFAULT_BLOCK_SIZE; +use crate::devices::virtio::block::virtio::io::FileEngine; use crate::devices::virtio::block::virtio::metrics::BlockMetricsPerDevice; use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDeviceType}; use crate::devices::virtio::generated::virtio_blk::VIRTIO_BLK_F_RO; @@ -29,6 +31,8 @@ pub enum FileEngineTypeState { Sync, /// Async File Engine. Async, + /// Overlay File Engine (sync COW with dirty bitmap). 
+ Overlay, } impl From for FileEngineTypeState { @@ -36,6 +40,7 @@ impl From for FileEngineTypeState { match file_engine_type { FileEngineType::Sync => FileEngineTypeState::Sync, FileEngineType::Async => FileEngineTypeState::Async, + FileEngineType::Overlay => FileEngineTypeState::Overlay, } } } @@ -45,21 +50,69 @@ impl From for FileEngineType { match file_engine_type_state { FileEngineTypeState::Sync => FileEngineType::Sync, FileEngineTypeState::Async => FileEngineType::Async, + FileEngineTypeState::Overlay => FileEngineType::Overlay, } } } +/// Overlay-specific state persisted in snapshots. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct OverlayState { + /// Path to the read-only base image. + pub base_path: String, + /// Path to the writable overlay file. + pub overlay_path: String, + /// Serialized dirty bitmap bytes. + pub dirty_bitmap: Vec, + /// Block size used for dirty tracking. + pub block_size: u32, + /// Total number of blocks in the bitmap. + pub total_blocks: u64, + /// Optional delta directory for cloning. Not serialized — set at restore time. + #[serde(skip)] + pub delta_dir: Option, +} + /// Holds info about the block device. Gets saved in snapshot. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct VirtioBlockState { - id: String, + pub id: String, partuuid: Option, cache_type: CacheType, root_device: bool, - disk_path: String, + pub disk_path: String, pub virtio_state: VirtioDeviceState, rate_limiter_state: RateLimiterState, file_engine_type: FileEngineTypeState, + /// Overlay state, present only for overlay-backed block devices. 
+ #[serde(default)] + pub overlay_state: Option<OverlayState>, +} + +impl VirtioBlock { + fn restore_overlay_from_bitmap( + overlay: &OverlayState, + ) -> Result<DiskProperties, VirtioBlockError> { + use crate::devices::virtio::block::virtio::io::dirty_bitmap::DirtyBitmap; + + let bitmap = DirtyBitmap::deserialize( + &overlay.dirty_bitmap, + overlay.block_size, + overlay.total_blocks, + ) + .map_err(|e| { + VirtioBlockError::FileEngine(io::BlockIoError::Overlay( + io::OverlayIoError::Bitmap(e), + )) + })?; + + DiskProperties::new_overlay_with_bitmap( + overlay.base_path.clone(), + overlay.overlay_path.clone(), + overlay.block_size, + bitmap, + ) + } } impl Persist<'_> for VirtioBlock { @@ -68,7 +121,20 @@ type Error = VirtioBlockError; fn save(&self) -> Self::State { - // Save device state. + let overlay_state = if let FileEngine::Overlay(ref engine) = self.disk.file_engine { + let bitmap = engine.bitmap(); + Some(OverlayState { + base_path: self.disk.base_path.clone().unwrap_or_default(), + overlay_path: self.disk.file_path.clone(), + dirty_bitmap: bitmap.serialize(), + block_size: bitmap.block_size(), + total_blocks: bitmap.total_blocks(), + delta_dir: None, + }) + } else { + None + }; + VirtioBlockState { id: self.id.clone(), partuuid: self.partuuid.clone(), @@ -78,6 +144,7 @@ virtio_state: VirtioDeviceState::from_device(self), rate_limiter_state: self.rate_limiter.save(), file_engine_type: FileEngineTypeState::from(self.file_engine_type()), + overlay_state, } } @@ -89,11 +156,36 @@ let rate_limiter = RateLimiter::restore((), &state.rate_limiter_state) .map_err(VirtioBlockError::RateLimiter)?; - let disk_properties = DiskProperties::new( - state.disk_path.clone(), - is_read_only, - state.file_engine_type.into(), - )?; + let disk_properties = if let Some(ref overlay) = state.overlay_state { + // Check if a delta directory was set (for cloning).
+ let delta_path = overlay + .delta_dir + .as_ref() + .map(|dir| dir.join(format!("{}.delta", state.id))); + + if let Some(ref delta_path) = delta_path { + if delta_path.exists() { + // Clone path: apply delta to a fresh overlay. + DiskProperties::new_overlay_from_delta( + overlay.base_path.clone(), + overlay.overlay_path.clone(), + delta_path, + )? + } else { + // No delta file — fall through to bitmap restore. + Self::restore_overlay_from_bitmap(overlay)? + } + } else { + // Normal restore: use the serialized bitmap. + Self::restore_overlay_from_bitmap(overlay)? + } + } else { + DiskProperties::new( + state.disk_path.clone(), + is_read_only, + state.file_engine_type.into(), + )? + }; let queue_evts = [EventFd::new(libc::EFD_NONBLOCK).map_err(VirtioBlockError::EventFd)?]; @@ -162,6 +254,7 @@ mod tests { cache_type: CacheType::Writeback, rate_limiter: None, file_engine_type: FileEngineType::default(), + base_path: None, }; let block = VirtioBlock::new(config).unwrap(); @@ -203,6 +296,7 @@ mod tests { cache_type: CacheType::Unsafe, rate_limiter: None, file_engine_type: FileEngineType::default(), + base_path: None, }; let block = VirtioBlock::new(config).unwrap(); @@ -228,4 +322,100 @@ mod tests { // Test that block specific fields are the same. assert_eq!(restored_block.disk.file_path, block.disk.file_path); } + + #[test] + fn test_overlay_persistence() { + use std::io::Write; + + // Create base image with known data. + let base_file = TempFile::new().unwrap(); + let base_data = vec![0xAA_u8; 0x1000]; + base_file.as_file().write_all(&base_data).unwrap(); + + // Create overlay file (empty, will be sized to match base). 
+ let overlay_file = TempFile::new().unwrap(); + + let base_path = base_file.as_path().to_str().unwrap().to_string(); + let overlay_path = overlay_file.as_path().to_str().unwrap().to_string(); + + let config = VirtioBlockConfig { + drive_id: "overlay_test".to_string(), + path_on_host: overlay_path.clone(), + is_root_device: false, + partuuid: None, + is_read_only: false, + cache_type: CacheType::Unsafe, + rate_limiter: None, + file_engine_type: FileEngineType::Overlay, + base_path: Some(base_path.clone()), + }; + + let block = VirtioBlock::new(config).unwrap(); + + // Verify overlay state is present in save. + let block_state = block.save(); + assert!(block_state.overlay_state.is_some()); + + let overlay_state = block_state.overlay_state.as_ref().unwrap(); + assert_eq!(overlay_state.base_path, base_path); + assert_eq!(overlay_state.overlay_path, overlay_path); + assert_eq!(overlay_state.block_size, 4096); + + // Serialize and deserialize. + let serialized = bitcode::serialize(&block_state).unwrap(); + let restored_state: VirtioBlockState = bitcode::deserialize(&serialized).unwrap(); + + // Verify overlay state survived serialization. + assert!(restored_state.overlay_state.is_some()); + let restored_overlay = restored_state.overlay_state.as_ref().unwrap(); + assert_eq!(restored_overlay.base_path, base_path); + assert_eq!(restored_overlay.block_size, overlay_state.block_size); + assert_eq!(restored_overlay.total_blocks, overlay_state.total_blocks); + assert_eq!(restored_overlay.dirty_bitmap, overlay_state.dirty_bitmap); + + // Restore the block device. + let guest_mem = default_mem(); + let restored_block = + VirtioBlock::restore(BlockConstructorArgs { mem: guest_mem }, &restored_state).unwrap(); + + // Verify restored device is overlay type. 
+ assert_eq!(restored_block.file_engine_type(), FileEngineType::Overlay); + assert_eq!(restored_block.disk.file_path, overlay_path); + assert_eq!(restored_block.disk.base_path.as_deref(), Some(base_path.as_str())); + } + + #[test] + fn test_old_snapshot_without_overlay_state() { + // Simulate restoring an old snapshot that has no overlay_state field. + let f = TempFile::new().unwrap(); + f.as_file().set_len(0x1000).unwrap(); + + let config = VirtioBlockConfig { + drive_id: "test".to_string(), + path_on_host: f.as_path().to_str().unwrap().to_string(), + is_root_device: false, + partuuid: None, + is_read_only: false, + cache_type: CacheType::Unsafe, + rate_limiter: None, + file_engine_type: FileEngineType::default(), + base_path: None, + }; + + let block = VirtioBlock::new(config).unwrap(); + let block_state = block.save(); + + // overlay_state should be None for non-overlay devices. + assert!(block_state.overlay_state.is_none()); + + // Serialize, deserialize, restore — should work fine. 
+ let serialized = bitcode::serialize(&block_state).unwrap(); + let restored_state: VirtioBlockState = bitcode::deserialize(&serialized).unwrap(); + assert!(restored_state.overlay_state.is_none()); + + let guest_mem = default_mem(); + let restored_block = + VirtioBlock::restore(BlockConstructorArgs { mem: guest_mem }, &restored_state).unwrap(); + assert_eq!(restored_block.file_engine_type(), FileEngineType::Sync); + } } diff --git a/src/vmm/src/devices/virtio/block/virtio/request.rs b/src/vmm/src/devices/virtio/block/virtio/request.rs index 68857fa9444..fffc7f62115 100644 --- a/src/vmm/src/devices/virtio/block/virtio/request.rs +++ b/src/vmm/src/devices/virtio/block/virtio/request.rs @@ -14,7 +14,8 @@ use crate::devices::virtio::block::virtio::device::DiskProperties; use crate::devices::virtio::block::virtio::metrics::BlockDeviceMetrics; pub use crate::devices::virtio::generated::virtio_blk::{ VIRTIO_BLK_ID_BYTES, VIRTIO_BLK_S_IOERR, VIRTIO_BLK_S_OK, VIRTIO_BLK_S_UNSUPP, - VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID, VIRTIO_BLK_T_IN, VIRTIO_BLK_T_OUT, + VIRTIO_BLK_T_DISCARD, VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID, VIRTIO_BLK_T_IN, + VIRTIO_BLK_T_OUT, }; use crate::devices::virtio::queue::DescriptorChain; use crate::logger::{IncMetric, error}; @@ -34,6 +35,7 @@ pub enum RequestType { Out, Flush, GetDeviceID, + Discard, Unsupported(u32), } @@ -44,6 +46,7 @@ impl From for RequestType { VIRTIO_BLK_T_OUT => RequestType::Out, VIRTIO_BLK_T_FLUSH => RequestType::Flush, VIRTIO_BLK_T_GET_ID => RequestType::GetDeviceID, + VIRTIO_BLK_T_DISCARD => RequestType::Discard, t => RequestType::Unsupported(t), } } @@ -176,6 +179,9 @@ impl PendingRequest { (Ok(transferred_data_len), RequestType::GetDeviceID) => { Status::from_data(self.data_len, transferred_data_len, true) } + (Ok(_), RequestType::Discard) => Status::Ok { + num_bytes_to_mem: 0, + }, (_, RequestType::Unsupported(op)) => Status::Unsupported { op }, (Err(err), _) => Status::IoErr { num_bytes_to_mem: 0, @@ -308,6 +314,12 @@ 
impl Request { return Err(VirtioBlockError::InvalidOffset); } } + RequestType::Discard => { + // Discard segments are 16 bytes each (sector u64, num_sectors u32, flags u32). + if req.data_len == 0 || !req.data_len.is_multiple_of(16) { + return Err(VirtioBlockError::InvalidDataLength); + } + } RequestType::GetDeviceID => { if req.data_len < VIRTIO_BLK_ID_BYTES { return Err(VirtioBlockError::InvalidDataLength); @@ -383,6 +395,55 @@ impl Request { .write(self.offset(), mem, self.data_addr, self.data_len, pending) } RequestType::Flush => disk.file_engine.flush(pending), + RequestType::Discard => { + // The discard data descriptor contains virtio_blk_discard_write_zeroes + // segments (16 bytes each: sector u64, num_sectors u32, flags u32). + // Parse and discard each segment. + let segment_size = 16u32; + let num_segments = self.data_len / segment_size; + let mut discard_err = None; + + for i in 0..num_segments { + let seg_addr = GuestAddress(self.data_addr.0 + u64::from(i * segment_size)); + let sector: u64 = match mem.read_obj(seg_addr) { + Ok(v) => v, + Err(_) => { + discard_err = Some(block_io::BlockIoError::Overlay( + block_io::OverlayIoError::OverlaySeek(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "failed to read discard segment sector", + )), + )); + break; + } + }; + let num_sectors: u32 = match mem.read_obj(GuestAddress(seg_addr.0 + 8)) { + Ok(v) => v, + Err(_) => { + discard_err = Some(block_io::BlockIoError::Overlay( + block_io::OverlayIoError::OverlaySeek(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "failed to read discard segment num_sectors", + )), + )); + break; + } + }; + let offset = sector << SECTOR_SHIFT; + let len = u64::from(num_sectors) << SECTOR_SHIFT; + + if let Err(e) = disk.file_engine.discard(offset, len) { + discard_err = Some(e); + break; + } + } + + let res = match discard_err { + Some(e) => Err(IoErr::FileEngine(e)), + None => Ok(0), + }; + return ProcessingResult::Executed(pending.finish(mem, res, 
block_metrics)); + } RequestType::GetDeviceID => { let res = mem .write_slice(&disk.image_id, self.data_addr) @@ -731,6 +792,7 @@ mod tests { RequestType::Out => VIRTIO_BLK_T_OUT, RequestType::Flush => VIRTIO_BLK_T_FLUSH, RequestType::GetDeviceID => VIRTIO_BLK_T_GET_ID, + RequestType::Discard => VIRTIO_BLK_T_DISCARD, RequestType::Unsupported(id) => id, } } @@ -740,7 +802,7 @@ mod tests { fn request_type_flags(request_type: RequestType) -> u16 { match request_type { RequestType::In => VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, - RequestType::Out => VIRTQ_DESC_F_NEXT, + RequestType::Out | RequestType::Discard => VIRTQ_DESC_F_NEXT, RequestType::Flush => VIRTQ_DESC_F_NEXT, RequestType::GetDeviceID => VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, RequestType::Unsupported(_) => VIRTQ_DESC_F_NEXT, diff --git a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs index e4f23c6a038..295cd295f12 100644 --- a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs @@ -58,6 +58,7 @@ pub fn default_block_with_path(path: String, file_engine_type: FileEngineType) - }), }), file_engine_type, + base_path: None, }; // The default block device is read-write and non-root. @@ -119,7 +120,7 @@ pub fn simulate_queue_and_async_completion_events(b: &mut VirtioBlock, expected_ simulate_queue_event(b, None); simulate_async_completion_event(b, expected_irq); } - FileEngine::Sync(_) => { + FileEngine::Sync(_) | FileEngine::Overlay(_) => { simulate_queue_event(b, Some(expected_irq)); } } diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 21e0cc1c0fc..c65ca9b7fbf 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -160,7 +160,7 @@ pub enum CreateSnapshotError { } /// Snapshot version -pub const SNAPSHOT_VERSION: Version = Version::new(9, 0, 0); +pub const SNAPSHOT_VERSION: Version = Version::new(9, 1, 0); /// Creates a Microvm snapshot. 
pub fn create_snapshot( @@ -177,6 +177,18 @@ pub fn create_snapshot( vmm.vm .snapshot_memory_to_file(¶ms.mem_file_path, params.snapshot_type)?; + // Write delta files for overlay block devices if a delta directory is specified. + if let Some(ref delta_dir) = params.block_delta_dir { + vmm.device_manager + .write_block_deltas(delta_dir) + .map_err(|e| { + CreateSnapshotError::SnapshotBackingFile( + "write_block_deltas", + std::io::Error::new(std::io::ErrorKind::Other, format!("{:?}", e)), + ) + })?; + } + // We need to mark queues as dirty again for all activated devices. The reason we // do it here is that we don't mark pages as dirty during runtime // for queue objects. @@ -408,6 +420,24 @@ pub fn restore_from_snapshot( .clone_from(&vsock_override.uds_path); } + // If block_delta_dir is set, stamp it onto overlay block device states + // so they apply deltas during restore (for cloning). + if let Some(ref delta_dir) = params.block_delta_dir { + use crate::devices::virtio::block::persist::BlockState; + for block_state in microvm_state + .device_states + .mmio_state + .block_devices + .iter_mut() + { + if let BlockState::Virtio(ref mut vs) = block_state.device_state { + if let Some(ref mut overlay) = vs.overlay_state { + overlay.delta_dir = Some(delta_dir.clone()); + } + } + } + } + let track_dirty_pages = params.track_dirty_pages; let vcpu_count = microvm_state diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index cc45aafe16f..48270710e9d 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -610,6 +610,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }, tmp_file, ) diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 4617890a0e4..4124ae535ac 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -1173,6 +1173,7 @@ mod tests { snapshot_type: SnapshotType::Full, snapshot_path: PathBuf::new(), mem_file_path: PathBuf::new(), + block_delta_dir: None, }, ))); 
#[cfg(target_arch = "x86_64")] @@ -1236,6 +1237,7 @@ file_engine_type: None, socket: None, + base_path: None, }, ))); check_unsupported(runtime_request(VmmAction::InsertNetworkDevice( @@ -1286,6 +1288,7 @@ resume_vm: false, network_overrides: vec![], vsock_override: None, + block_delta_dir: None, }, ))); check_unsupported(runtime_request(VmmAction::SetEntropyDevice( diff --git a/src/vmm/src/vmm_config/drive.rs b/src/vmm/src/vmm_config/drive.rs index 2d3fddac830..05f770c3457 100644 --- a/src/vmm/src/vmm_config/drive.rs +++ b/src/vmm/src/vmm_config/drive.rs @@ -65,6 +65,9 @@ pub struct BlockDeviceConfig { // VhostUserBlock specific fields /// Path to the vhost-user socket. pub socket: Option<String>, + + /// Read-only base image path for overlay mode. Required when io_engine is "Overlay". + pub base_path: Option<String>, } /// Only provided fields will be updated. I.e. if any optional fields @@ -215,6 +218,7 @@ mod tests { file_engine_type: self.file_engine_type, socket: self.socket.clone(), + base_path: self.base_path.clone(), } } } @@ -242,6 +246,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }; let mut block_devs = BlockBuilder::new(); @@ -276,6 +281,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }; let mut block_devs = BlockBuilder::new(); @@ -308,6 +314,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }; let mut block_devs = BlockBuilder::new(); @@ -337,6 +344,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }; let dummy_file_2 = TempFile::new().unwrap(); @@ -353,6 +361,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }; let mut block_devs = BlockBuilder::new(); @@ -380,6 +389,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }; let dummy_file_2 = TempFile::new().unwrap(); @@ -396,6 +406,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }; let dummy_file_3 =
TempFile::new().unwrap(); @@ -412,6 +423,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }; let mut block_devs = BlockBuilder::new(); @@ -453,6 +465,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }; let dummy_file_2 = TempFile::new().unwrap(); @@ -469,6 +482,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }; let dummy_file_3 = TempFile::new().unwrap(); @@ -485,6 +499,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }; let mut block_devs = BlockBuilder::new(); @@ -527,6 +542,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }; let dummy_file_2 = TempFile::new().unwrap(); @@ -543,6 +559,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }; let mut block_devs = BlockBuilder::new(); @@ -615,6 +632,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }; // Switch roots and add a PARTUUID for the new one. let mut root_block_device_old = root_block_device; @@ -631,6 +649,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }; block_devs.insert(root_block_device_old, false).unwrap(); @@ -657,6 +676,7 @@ mod tests { file_engine_type: Some(FileEngineType::Sync), socket: None, + base_path: None, }; let mut block_devs = BlockBuilder::new(); @@ -687,6 +707,7 @@ mod tests { file_engine_type: None, socket: None, + base_path: None, }; let block = Block::new(config).unwrap(); diff --git a/src/vmm/src/vmm_config/snapshot.rs b/src/vmm/src/vmm_config/snapshot.rs index 1da084887f3..d97f33f9757 100644 --- a/src/vmm/src/vmm_config/snapshot.rs +++ b/src/vmm/src/vmm_config/snapshot.rs @@ -45,6 +45,11 @@ pub struct CreateSnapshotParams { pub snapshot_path: PathBuf, /// Path to the file that will contain the guest memory. pub mem_file_path: PathBuf, + /// Optional directory for block device delta files. 
When set, overlay block + /// devices will write delta files (containing only dirty blocks) into this + /// directory, named `{drive_id}.delta`. + #[serde(default)] + pub block_delta_dir: Option<PathBuf>, } /// Allows for changing the mapping between tap devices and host devices @@ -81,6 +86,10 @@ pub struct LoadSnapshotParams { pub network_overrides: Vec<NetworkOverride>, /// When set, the vsock backend UDS path will be overridden pub vsock_override: Option, + /// Optional directory containing block device delta files for cloning. + /// Each overlay device will look for `{drive_id}.delta` in this directory + /// and apply it to a fresh overlay, enabling fast VM cloning. + pub block_delta_dir: Option<PathBuf>, } /// Stores the configuration for loading a snapshot that is provided by the user. @@ -113,6 +122,9 @@ pub struct LoadSnapshotConfig { /// Whether or not to override the vsock backend UDS path. #[serde(skip_serializing_if = "Option::is_none")] pub vsock_override: Option, + /// Optional directory containing block device delta files for cloning. + #[serde(default)] + pub block_delta_dir: Option<PathBuf>, } /// Stores the configuration used for managing snapshot memory.
diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 4d58b95a426..d8cf28a2f55 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -229,6 +229,7 @@ fn verify_create_snapshot( snapshot_type, snapshot_path: snapshot_file.as_path().to_path_buf(), mem_file_path: memory_file.as_path().to_path_buf(), + block_delta_dir: None, }; controller @@ -296,6 +297,7 @@ fn verify_load_snapshot(snapshot_file: TempFile, memory_file: TempFile) { resume_vm: true, network_overrides: vec![], vsock_override: None, + block_delta_dir: None, })) .unwrap(); @@ -381,6 +383,7 @@ fn verify_load_snap_disallowed_after_boot_resources(res: VmmAction, res_name: &s resume_vm: false, network_overrides: vec![], vsock_override: None, + block_delta_dir: None, }); let err = preboot_api_controller.handle_preboot_request(req); assert!( @@ -416,6 +419,7 @@ fn test_preboot_load_snap_disallowed_after_boot_resources() { file_engine_type: None, socket: None, + base_path: None, }; let req = VmmAction::InsertBlockDevice(config);