4 changes: 4 additions & 0 deletions kvm-bindings/src/lib.rs
@@ -39,3 +39,7 @@ pub use self::arm64::*;
mod riscv64;
#[cfg(target_arch = "riscv64")]
pub use self::riscv64::*;

// Linux defines these using the _BITUL macro, which bindgen fails to generate, so define them manually here.
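// KVM sets KVM_DIRTY_GFN_F_DIRTY when it publishes a ring entry; userspace sets
// KVM_DIRTY_GFN_F_RESET once it has harvested the entry, so that KVM can recycle
// the slot on the next KVM_RESET_DIRTY_RINGS ioctl.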
pub const KVM_DIRTY_GFN_F_DIRTY: u32 = 0b1;
pub const KVM_DIRTY_GFN_F_RESET: u32 = 0b10;
20 changes: 20 additions & 0 deletions kvm-ioctls/CHANGELOG.md
@@ -2,6 +2,26 @@

## Upcoming Release

### Fixed

- Fixed `VmFd::enable_cap` to be available on all architectures.

### Added

- Added `KvmDirtyLogRing` structure to mmap the dirty log ring.
- Added `KVM_DIRTY_GFN_F_DIRTY` and `KVM_DIRTY_GFN_F_RESET` bitflags.
- Implemented `Iterator` for `KvmDirtyLogRing` so dirty log entries can be read as `(slot, offset)` pairs.
- Added `dirty_log_ring` field to `VcpuFd` to access per-vCPU dirty rings.
- Inserted Acquire/Release fences in the `KvmDirtyLogRing` iterator's `next` for
  architectures with weak memory ordering that require them.
- Added `DirtyLogRingInfo` struct and `dirty_log_ring_info` field to `VmFd` to
track dirty ring configuration.
- Added `enable_dirty_log_ring` function on `VmFd` to check corresponding
capabilities and enable KVM's dirty log ring.
- Added `VcpuFd::dirty_log_ring_iter()` to iterate over dirty guest frame numbers.
- Added `VmFd::reset_dirty_rings()` to reset all dirty rings for the VM.
- Added `Cap::DirtyLogRing`, exposing `KVM_CAP_DIRTY_LOG_RING`.
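
Taken together, the new API is expected to be used roughly as follows. This is a
minimal sketch assembled from the entries above; the exact shape of
`reset_dirty_rings()` (assumed here to take no arguments) may differ:

```rust
use kvm_ioctls::{Cap, Kvm};

let kvm = Kvm::new().unwrap();
let mut vm = kvm.create_vm().unwrap();
if kvm.check_extension(Cap::DirtyLogRing) {
    // The ring must be enabled before any vCPU is created.
    vm.enable_dirty_log_ring(None).unwrap();
    let mut vcpu = vm.create_vcpu(0).unwrap();
    // ... run the vCPU and let the guest dirty some memory ...
    if let Some(iter) = vcpu.dirty_log_ring_iter() {
        for (slot, offset) in iter {
            println!("Dirty page in slot {} at offset {}", slot, offset);
        }
    }
    // Allow KVM to recycle the harvested ring entries.
    vm.reset_dirty_rings().unwrap();
}
```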

## v0.24.0

### Added
1 change: 1 addition & 0 deletions kvm-ioctls/src/cap.rs
@@ -169,4 +169,5 @@ pub enum Cap {
NestedState = KVM_CAP_NESTED_STATE,
#[cfg(target_arch = "x86_64")]
X2ApicApi = KVM_CAP_X2APIC_API,
DirtyLogRing = KVM_CAP_DIRTY_LOG_RING,
}
119 changes: 118 additions & 1 deletion kvm-ioctls/src/ioctls/mod.rs
@@ -8,9 +8,11 @@
use std::mem::size_of;
use std::os::unix::io::AsRawFd;
use std::ptr::{NonNull, null_mut};
use std::sync::atomic::{Ordering, fence};

use kvm_bindings::{
KVM_COALESCED_MMIO_PAGE_OFFSET, kvm_coalesced_mmio, kvm_coalesced_mmio_ring, kvm_run,
KVM_COALESCED_MMIO_PAGE_OFFSET, KVM_DIRTY_GFN_F_DIRTY, KVM_DIRTY_GFN_F_RESET,
KVM_DIRTY_LOG_PAGE_OFFSET, kvm_coalesced_mmio, kvm_coalesced_mmio_ring, kvm_dirty_gfn, kvm_run,
};
use vmm_sys_util::errno;

@@ -29,6 +31,121 @@ pub mod vm;
/// is otherwise a direct mapping to Result.
pub type Result<T> = std::result::Result<T, errno::Error>;

/// A wrapper around the KVM dirty log ring page.
#[derive(Debug)]
pub(crate) struct KvmDirtyLogRing {
/// Index of the next ring slot that may contain a dirty guest frame number
next_dirty: u64,
/// Memory-mapped array of dirty guest frame number entries
gfns: NonNull<kvm_dirty_gfn>,
/// Ring size mask (size-1) for efficient modulo operations
mask: u64,
/// `true` if we need to use Acquire/Release memory ordering
use_acq_rel: bool,
}

impl KvmDirtyLogRing {
/// Maps the KVM dirty log ring from the vCPU file descriptor.
///
/// # Arguments
/// * `fd` - vCPU file descriptor to mmap from.
/// * `bytes` - Size of the dirty ring mapping in bytes.
/// * `use_acq_rel` - `true` if Acquire/Release fences must be used when accessing ring entries.
pub(crate) fn mmap_from_fd<F: AsRawFd>(
fd: &F,
bytes: usize,
use_acq_rel: bool,
) -> Result<Self> {
// SAFETY: We trust the sysconf libc function and we're calling it
// with a correct parameter.
let page_size = match unsafe { libc::sysconf(libc::_SC_PAGESIZE) } {
-1 => return Err(errno::Error::last()),
ps => ps as usize,
};

let offset = page_size * KVM_DIRTY_LOG_PAGE_OFFSET as usize;

if bytes % std::mem::size_of::<kvm_dirty_gfn>() != 0 {
// The size of the dirty ring in bytes must be a multiple of the slot size
return Err(errno::Error::new(libc::EINVAL));
}
let slots = bytes / std::mem::size_of::<kvm_dirty_gfn>();
if !slots.is_power_of_two() {
// The number of slots must be a power of two
return Err(errno::Error::new(libc::EINVAL));
}

// SAFETY: KVM guarantees that there is a page at offset
// KVM_DIRTY_LOG_PAGE_OFFSET * PAGE_SIZE if the appropriate
// capability is available. If it is not, the call will simply
// fail.
let gfns = unsafe {
NonNull::<kvm_dirty_gfn>::new(libc::mmap(
null_mut(),
bytes,
libc::PROT_READ | libc::PROT_WRITE,
libc::MAP_SHARED,
fd.as_raw_fd(),
offset as i64,
) as *mut kvm_dirty_gfn)
.filter(|addr| addr.as_ptr() != libc::MAP_FAILED as *mut kvm_dirty_gfn)
.ok_or_else(errno::Error::last)?
};
Ok(Self {
next_dirty: 0,
gfns,
mask: (slots - 1) as u64,
use_acq_rel,
})
}
}

impl Drop for KvmDirtyLogRing {
fn drop(&mut self) {
// SAFETY: This is safe because we mmap the page ourselves, and nobody
// else is holding a reference to it.
unsafe {
libc::munmap(
self.gfns.as_ptr().cast(),
(self.mask + 1) as usize * std::mem::size_of::<kvm_dirty_gfn>(),
);
}
}
}

impl Iterator for KvmDirtyLogRing {
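// Items are (slot, offset) pairs identifying a guest frame that KVM has reported as dirty.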
type Item = (u32, u64);
fn next(&mut self) -> Option<Self::Item> {
let i = self.next_dirty & self.mask;
// SAFETY: i is not larger than mask, thus is a valid offset into self.gfns,
// therefore this operation produces a valid pointer to a kvm_dirty_gfn
let gfn_ptr = unsafe { self.gfns.add(i as usize).as_ptr() };

if self.use_acq_rel {
fence(Ordering::Acquire);
}

// SAFETY: gfn_ptr points to a valid kvm_dirty_gfn within the mapped ring
let gfn = unsafe { gfn_ptr.read_volatile() };

if gfn.flags & KVM_DIRTY_GFN_F_DIRTY == 0 {
// next_dirty stays the same; this slot will be re-checked once KVM marks it dirty
None
} else {
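// Harvest the entry: set the RESET flag and write it back so that KVM can
// recycle this slot after the next KVM_RESET_DIRTY_RINGS ioctl.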
self.next_dirty += 1;
let mut updated_gfn = gfn;
updated_gfn.flags ^= KVM_DIRTY_GFN_F_RESET;
// SAFETY: gfn_ptr points to a valid, writable kvm_dirty_gfn within the mapped ring
unsafe {
gfn_ptr.write_volatile(updated_gfn);
};
if self.use_acq_rel {
fence(Ordering::Release);
}
Some((gfn.slot, gfn.offset))
}
}
}

/// A wrapper around the coalesced MMIO ring page.
#[derive(Debug)]
pub(crate) struct KvmCoalescedIoRing {
181 changes: 179 additions & 2 deletions kvm-ioctls/src/ioctls/vcpu.rs
@@ -16,7 +16,7 @@ use libc::EINVAL;
use std::fs::File;
use std::os::unix::io::{AsRawFd, RawFd};

use crate::ioctls::{KvmCoalescedIoRing, KvmRunWrapper, Result};
use crate::ioctls::{KvmCoalescedIoRing, KvmDirtyLogRing, KvmRunWrapper, Result};
use crate::kvm_ioctls::*;
use vmm_sys_util::errno;
use vmm_sys_util::ioctl::{ioctl, ioctl_with_mut_ref, ioctl_with_ref};
@@ -197,6 +197,9 @@ pub struct VcpuFd {
kvm_run_ptr: KvmRunWrapper,
/// A pointer to the coalesced MMIO page
coalesced_mmio_ring: Option<KvmCoalescedIoRing>,
/// A pointer to the dirty log ring
#[allow(unused)]
dirty_log_ring: Option<KvmDirtyLogRing>,
}

/// KVM Sync Registers used to tell KVM which registers to sync
@@ -2104,6 +2107,37 @@
}
}

/// Gets the dirty log ring iterator if one is mapped.
///
/// Returns an iterator over dirty guest frame numbers as (slot, offset) tuples.
/// Returns `None` if no dirty log ring has been mapped.
///
/// # Returns
///
/// An optional iterator over the dirty log ring entries.
///
/// # Example
///
/// ```no_run
/// # use kvm_ioctls::Kvm;
/// # use kvm_ioctls::Cap;
/// let kvm = Kvm::new().unwrap();
/// let mut vm = kvm.create_vm().unwrap();
/// if kvm.check_extension(Cap::DirtyLogRing) {
/// vm.enable_dirty_log_ring(None).unwrap();
/// }
/// let mut vcpu = vm.create_vcpu(0).unwrap();
/// if let Some(iter) = vcpu.dirty_log_ring_iter() {
/// for (slot, offset) in iter {
/// println!("Dirty page in slot {} at offset {}", slot, offset);
/// }
/// }
/// ```
#[cfg(target_arch = "x86_64")]
pub fn dirty_log_ring_iter(&mut self) -> Option<impl Iterator<Item = (u32, u64)>> {
self.dirty_log_ring.as_mut()
}

/// Maps the coalesced MMIO ring page. This allows reading entries from
/// the ring via [`coalesced_mmio_read()`](VcpuFd::coalesced_mmio_read).
///
@@ -2159,11 +2193,16 @@
/// This should not be exported as a public function because the preferred way is to use
/// `create_vcpu` from `VmFd`. The function cannot be part of the `VcpuFd` implementation because
/// then it would be exported with the public `VcpuFd` interface.
pub fn new_vcpu(vcpu: File, kvm_run_ptr: KvmRunWrapper) -> VcpuFd {
pub fn new_vcpu(
vcpu: File,
kvm_run_ptr: KvmRunWrapper,
dirty_log_ring: Option<KvmDirtyLogRing>,
) -> VcpuFd {
VcpuFd {
vcpu,
kvm_run_ptr,
coalesced_mmio_ring: None,
dirty_log_ring,
}
}

@@ -2835,6 +2874,144 @@ mod tests {
}
}

#[cfg(target_arch = "x86_64")]
#[test]
fn test_run_code_dirty_log_ring() {
use std::io::Write;

let kvm = Kvm::new().unwrap();
let mut vm = kvm.create_vm().unwrap();

// Enable dirty log ring
let need_bitmap = vm.enable_dirty_log_ring(None).unwrap();

// This example is based on https://lwn.net/Articles/658511/
#[rustfmt::skip]
let code = [
0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
0x00, 0xd8, /* add %bl, %al */
0x04, b'0', /* add $'0', %al */
0xee, /* out %al, %dx */
0xec, /* in %dx, %al */
0xc6, 0x06, 0x00, 0x80, 0x00, /* movb $0, (0x8000); This generates a MMIO Write.*/
0x8a, 0x16, 0x00, 0x80, /* movb (0x8000), %dl; This generates a MMIO Read.*/
0xc6, 0x06, 0x00, 0x20, 0x00, /* movb $0, (0x2000); Dirty one page in guest mem. */
0xf4, /* hlt */
];
let expected_rips: [u64; 3] = [0x1003, 0x1005, 0x1007];

let mem_size = 0x4000;
let load_addr = mmap_anonymous(mem_size).as_ptr();
let guest_addr: u64 = 0x1000;
let slot: u32 = 0;
let mem_region = kvm_userspace_memory_region {
slot,
guest_phys_addr: guest_addr,
memory_size: mem_size as u64,
userspace_addr: load_addr as u64,
flags: KVM_MEM_LOG_DIRTY_PAGES,
};
unsafe {
vm.set_user_memory_region(mem_region).unwrap();
}

unsafe {
// Get a mutable slice of `mem_size` from `load_addr`.
// This is safe because we mapped it before.
let mut slice = std::slice::from_raw_parts_mut(load_addr, mem_size);
slice.write_all(&code).unwrap();
}

let mut vcpu_fd = vm.create_vcpu(0).unwrap();

let mut vcpu_sregs = vcpu_fd.get_sregs().unwrap();
assert_ne!(vcpu_sregs.cs.base, 0);
assert_ne!(vcpu_sregs.cs.selector, 0);
vcpu_sregs.cs.base = 0;
vcpu_sregs.cs.selector = 0;
vcpu_fd.set_sregs(&vcpu_sregs).unwrap();

let mut vcpu_regs = vcpu_fd.get_regs().unwrap();
// Set the Instruction Pointer to the guest address where we loaded the code.
vcpu_regs.rip = guest_addr;
vcpu_regs.rax = 2;
vcpu_regs.rbx = 3;
vcpu_regs.rflags = 2;
vcpu_fd.set_regs(&vcpu_regs).unwrap();

let mut debug_struct = kvm_guest_debug {
control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP,
pad: 0,
arch: kvm_guest_debug_arch {
debugreg: [0, 0, 0, 0, 0, 0, 0, 0],
},
};
vcpu_fd.set_guest_debug(&debug_struct).unwrap();

let mut instr_idx = 0;
loop {
match vcpu_fd.run().expect("run failed") {
VcpuExit::IoIn(addr, data) => {
assert_eq!(addr, 0x3f8);
assert_eq!(data.len(), 1);
}
VcpuExit::IoOut(addr, data) => {
assert_eq!(addr, 0x3f8);
assert_eq!(data.len(), 1);
assert_eq!(data[0], b'5');
}
VcpuExit::MmioRead(addr, data) => {
assert_eq!(addr, 0x8000);
assert_eq!(data.len(), 1);
}
VcpuExit::MmioWrite(addr, data) => {
assert_eq!(addr, 0x8000);
assert_eq!(data.len(), 1);
assert_eq!(data[0], 0);
}
VcpuExit::Debug(debug) => {
if instr_idx == expected_rips.len() - 1 {
// Disabling debugging/single-stepping
debug_struct.control = 0;
vcpu_fd.set_guest_debug(&debug_struct).unwrap();
} else if instr_idx >= expected_rips.len() {
unreachable!();
}
let vcpu_regs = vcpu_fd.get_regs().unwrap();
assert_eq!(vcpu_regs.rip, expected_rips[instr_idx]);
assert_eq!(debug.exception, 1);
assert_eq!(debug.pc, expected_rips[instr_idx]);
// Check first 15 bits of DR6
let mask = (1 << 16) - 1;
assert_eq!(debug.dr6 & mask, 0b100111111110000);
// Bit 10 in DR7 is always 1
assert_eq!(debug.dr7, 1 << 10);
instr_idx += 1;
}
VcpuExit::Hlt => {
// The code snippet dirties 2 pages:
// * one when the code itself is loaded in memory;
// * and one more from the write to address 0x2000 (the write to 0x8000
//   falls outside the memory slot and only triggers an MMIO exit).

let dirty_pages: u32 =
u32::try_from(vcpu_fd.dirty_log_ring_iter().unwrap().count()).unwrap()
+ if need_bitmap {
let dirty_pages_bitmap = vm.get_dirty_log(slot, mem_size).unwrap();
dirty_pages_bitmap
.into_iter()
.map(|page| page.count_ones())
.sum()
} else {
0
};
assert_eq!(dirty_pages, 2);
break;
}
r => panic!("unexpected exit reason: {:?}", r),
}
}
}

#[test]
#[cfg(target_arch = "aarch64")]
fn test_get_preferred_target() {