diff --git a/.gitignore b/.gitignore index 155e4cbd8a8..f56db437d09 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ test_results/* /resources/linux /resources/x86_64 /resources/aarch64 +.env \ No newline at end of file diff --git a/.tool-versions b/.tool-versions new file mode 100644 index 00000000000..ff8f8f8c879 --- /dev/null +++ b/.tool-versions @@ -0,0 +1,2 @@ +gcloud 534.0.0 +rust 1.79.0 diff --git a/Cargo.lock b/Cargo.lock index 9ad999e44d0..d515b776e19 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -125,46 +125,41 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "aws-lc-fips-sys" -version = "0.12.13" +version = "0.13.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf12b67bc9c5168f68655aadb2a12081689a58f1d9b1484705e4d1810ed6e4ac" +checksum = "2608e5a7965cc9d58c56234d346c9c89b824c4c8652b6f047b3bd0a777c0644f" dependencies = [ - "bindgen 0.69.4", + "bindgen 0.69.5", "cc", "cmake", "dunce", "fs_extra", - "libc", - "paste", + "regex", ] [[package]] name = "aws-lc-rs" -version = "1.10.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdd82dba44d209fddb11c190e0a94b78651f95299598e472215667417a03ff1d" +checksum = "93fcc8f365936c834db5514fc45aee5b1202d677e6b40e48468aaaa8183ca8c7" dependencies = [ "aws-lc-fips-sys", "aws-lc-sys", - "mirai-annotations", - "paste", "untrusted", "zeroize", ] [[package]] name = "aws-lc-sys" -version = "0.22.0" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df7a4168111d7eb622a31b214057b8509c0a7e1794f44c546d742330dc793972" +checksum = "61b1d86e7705efe1be1b569bab41d4fa1e14e220b60a160f78de2db687add079" dependencies = [ - "bindgen 0.69.4", + "bindgen 0.69.5", "cc", "cmake", "dunce", "fs_extra", - "libc", - "paste", ] [[package]] @@ -204,9 +199,9 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.69.4" +version = "0.69.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" dependencies = [ "bitflags 2.6.0", "cexpr", @@ -955,12 +950,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" -[[package]] -name = "mirai-annotations" -version = "1.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1" - [[package]] name = "nix" version = "0.27.1" diff --git a/Makefile b/Makefile new file mode 100644 index 00000000000..1fda2f26881 --- /dev/null +++ b/Makefile @@ -0,0 +1,12 @@ +-include .env + +.PHONY: build +build: + ./scripts/build.sh + +.PHONY: upload +upload: + ./scripts/upload.sh $(GCP_PROJECT_ID) + +.PHONY: build-and-upload +make build-and-upload: build upload diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index 48d94a0f050..b04d2886a35 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -220,6 +220,10 @@ "syscall": "madvise", "comment": "Used by the VirtIO balloon device and by musl for some customer workloads. It is also used by aws-lc during random number generation. They setup a memory page that mark with MADV_WIPEONFORK to be able to detect forks. They also call it with -1 to see if madvise is supported in certain platforms." 
}, + { + "syscall": "mincore", + "comment": "Used by get_memory_dirty_bitmap to check if memory pages are resident" + }, { "syscall": "mmap", "comment": "Used by the VirtIO balloon device", diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index 861b69c6b44..dea4b3ed83a 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -216,6 +216,10 @@ "syscall": "madvise", "comment": "Used by the VirtIO balloon device and by musl for some customer workloads. It is also used by aws-lc during random number generation. They setup a memory page that mark with MADV_WIPEONFORK to be able to detect forks. They also call it with -1 to see if madvise is supported in certain platforms." }, + { + "syscall": "mincore", + "comment": "Used by get_memory_dirty_bitmap to check if memory pages are resident" + }, { "syscall": "mmap", "comment": "Used by the VirtIO balloon device", @@ -524,8 +528,8 @@ "comment": "sigaltstack is used by Rust stdlib to remove alternative signal stack during thread teardown." }, { - "syscall": "getrandom", - "comment": "getrandom is used by `HttpServer` to reinialize `HashMap` after moving to the API thread" + "syscall": "getrandom", + "comment": "getrandom is used by `HttpServer` to reinialize `HashMap` after moving to the API thread" }, { "syscall": "accept4", @@ -1276,4 +1280,4 @@ } ] } -} +} \ No newline at end of file diff --git a/scripts/build.sh b/scripts/build.sh new file mode 100755 index 00000000000..6f459f63e07 --- /dev/null +++ b/scripts/build.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -euo pipefail + +# The format will be: v<major>.<minor>.<patch>_g<commit_hash> — e.g. 
v1.7.2_g8bb88311 +# Extract full version from src/firecracker/swagger/firecracker.yaml +FC_VERSION=$(awk '/^info:/{flag=1} flag && /^ version:/{print $2; exit}' src/firecracker/swagger/firecracker.yaml) +commit_hash=$(git rev-parse --short HEAD) +version_name="v${FC_VERSION}_g${commit_hash}" +echo "Version name: $version_name" + +echo "Starting to build Firecracker version: $version_name" +tools/devtool -y build --release + +mkdir -p "./build/fc/${version_name}" +cp ./build/cargo_target/x86_64-unknown-linux-musl/release/firecracker "./build/fc/${version_name}/firecracker" +echo "Finished building Firecracker version: $version_name and copied to ./build/fc/${version_name}/firecracker" diff --git a/scripts/upload.sh b/scripts/upload.sh new file mode 100755 index 00000000000..4227c642593 --- /dev/null +++ b/scripts/upload.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -euo pipefail + +GCP_PROJECT_ID=$1 + +gsutil -h "Cache-Control:no-cache, max-age=0" cp -r "build/fc/*" "gs://${GCP_PROJECT_ID}-fc-versions" +if [ "$GCP_PROJECT_ID" == "e2b-prod" ]; then + # Upload the Firecracker builds to the GCP public builds bucket + gsutil -h "Cache-Control:no-cache, max-age=0" cp -r "build/fc/*" "gs://${GCP_PROJECT_ID}-public-builds/firecrackers/" +fi + +rm -rf build/fc/* diff --git a/src/cpu-template-helper/src/utils/mod.rs b/src/cpu-template-helper/src/utils/mod.rs index bd570840fc5..f457ca0b872 100644 --- a/src/cpu-template-helper/src/utils/mod.rs +++ b/src/cpu-template-helper/src/utils/mod.rs @@ -125,6 +125,7 @@ pub fn build_microvm_from_config( state: VmState::NotStarted, vmm_version: CPU_TEMPLATE_HELPER_VERSION.to_string(), app_name: "cpu-template-helper".to_string(), + memory_regions: None, }; let mut vm_resources = VmResources::from_json(&config, &instance_info, HTTP_MAX_PAYLOAD_SIZE, None) diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index adc0cb1ff83..9395fc508ed 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -35,7 +35,10 @@ vmm-sys-util = { 
version = "0.12.1", features = ["with-serde"] } [dev-dependencies] cargo_toml = "0.20.5" libc = "0.2.161" -regex = { version = "1.11.1", default-features = false, features = ["std", "unicode-perl"] } +regex = { version = "1.11.1", default-features = false, features = [ + "std", + "unicode-perl", +] } # Dev-Dependencies for uffd examples serde = { version = "1.0.214", features = ["derive"] } @@ -48,7 +51,12 @@ serde = { version = "1.0.214" } serde_json = "1.0.132" [features] -tracing = ["log-instrument", "seccompiler/tracing", "utils/tracing", "vmm/tracing"] +tracing = [ + "log-instrument", + "seccompiler/tracing", + "utils/tracing", + "vmm/tracing", +] gdb = ["vmm/gdb"] [lints] diff --git a/src/firecracker/src/api_server/mod.rs b/src/firecracker/src/api_server/mod.rs index 6ac2955af8f..85b6358b871 100644 --- a/src/firecracker/src/api_server/mod.rs +++ b/src/firecracker/src/api_server/mod.rs @@ -274,7 +274,7 @@ mod tests { Box::new(VmmAction::CreateSnapshot(CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), })), start_time_us, ); @@ -287,7 +287,7 @@ mod tests { Box::new(VmmAction::CreateSnapshot(CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), })), start_time_us, ); diff --git a/src/firecracker/src/api_server/parsed_request.rs b/src/firecracker/src/api_server/parsed_request.rs index 125463d1d05..00c04a6a34b 100644 --- a/src/firecracker/src/api_server/parsed_request.rs +++ b/src/firecracker/src/api_server/parsed_request.rs @@ -20,6 +20,7 @@ use super::request::logger::parse_put_logger; use super::request::machine_configuration::{ parse_get_machine_config, parse_patch_machine_config, parse_put_machine_config, }; +use super::request::memory::{parse_get_memory, parse_get_memory_mappings}; use super::request::metrics::parse_put_metrics; use 
super::request::mmds::{parse_get_mmds, parse_patch_mmds, parse_put_mmds}; use super::request::net::{parse_patch_net, parse_put_net}; @@ -82,6 +83,14 @@ impl TryFrom<&Request> for ParsedRequest { Ok(ParsedRequest::new_sync(VmmAction::GetFullVmConfig)) } (Method::Get, "machine-config", None) => parse_get_machine_config(), + (Method::Get, "memory", None) => match path_tokens.next() { + Some("mappings") => parse_get_memory_mappings(), + None => parse_get_memory(), + _ => Err(RequestError::InvalidPathMethod( + request_uri.to_string(), + Method::Get, + )), + }, (Method::Get, "mmds", None) => parse_get_mmds(), (Method::Get, _, Some(_)) => method_to_error(Method::Get), (Method::Put, "actions", Some(body)) => parse_put_actions(body), @@ -172,6 +181,8 @@ impl ParsedRequest { } VmmData::BalloonStats(stats) => Self::success_response_with_data(stats), VmmData::InstanceInformation(info) => Self::success_response_with_data(info), + VmmData::MemoryMappings(mappings) => Self::success_response_with_data(mappings), + VmmData::Memory(memory) => Self::success_response_with_data(memory), VmmData::VmmVersion(version) => Self::success_response_with_data( &serde_json::json!({ "firecracker_version": version.as_str() }), ), @@ -568,6 +579,12 @@ pub mod tests { VmmData::InstanceInformation(info) => { http_response(&serde_json::to_string(info).unwrap(), 200) } + VmmData::MemoryMappings(mappings) => { + http_response(&serde_json::to_string(mappings).unwrap(), 200) + } + VmmData::Memory(memory) => { + http_response(&serde_json::to_string(memory).unwrap(), 200) + } VmmData::VmmVersion(version) => http_response( &serde_json::json!({ "firecracker_version": version.as_str() }).to_string(), 200, @@ -589,6 +606,15 @@ pub mod tests { verify_ok_response_with(VmmData::MachineConfiguration(MachineConfig::default())); verify_ok_response_with(VmmData::MmdsValue(serde_json::from_str("{}").unwrap())); verify_ok_response_with(VmmData::InstanceInformation(InstanceInfo::default())); + 
verify_ok_response_with(VmmData::MemoryMappings( + vmm::vmm_config::instance_info::MemoryMappingsResponse { mappings: vec![] }, + )); + verify_ok_response_with(VmmData::Memory( + vmm::vmm_config::instance_info::MemoryResponse { + resident: vec![], + empty: vec![], + }, + )); verify_ok_response_with(VmmData::VmmVersion(String::default())); // Error. @@ -662,6 +688,30 @@ pub mod tests { ParsedRequest::try_from(&req).unwrap(); } + #[test] + fn test_try_from_get_memory_mappings() { + let (mut sender, receiver) = UnixStream::pair().unwrap(); + let mut connection = HttpConnection::new(receiver); + sender + .write_all(http_request("GET", "/memory/mappings", None).as_bytes()) + .unwrap(); + connection.try_read().unwrap(); + let req = connection.pop_parsed_request().unwrap(); + ParsedRequest::try_from(&req).unwrap(); + } + + #[test] + fn test_try_from_get_memory() { + let (mut sender, receiver) = UnixStream::pair().unwrap(); + let mut connection = HttpConnection::new(receiver); + sender + .write_all(http_request("GET", "/memory", None).as_bytes()) + .unwrap(); + connection.try_read().unwrap(); + let req = connection.pop_parsed_request().unwrap(); + ParsedRequest::try_from(&req).unwrap(); + } + #[test] fn test_try_from_get_version() { let (mut sender, receiver) = UnixStream::pair().unwrap(); diff --git a/src/firecracker/src/api_server/request/memory.rs b/src/firecracker/src/api_server/request/memory.rs new file mode 100644 index 00000000000..e879d6b3b02 --- /dev/null +++ b/src/firecracker/src/api_server/request/memory.rs @@ -0,0 +1,39 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use vmm::logger::{IncMetric, METRICS}; +use vmm::rpc_interface::VmmAction; + +use super::super::parsed_request::{ParsedRequest, RequestError}; + +pub(crate) fn parse_get_memory_mappings() -> Result<ParsedRequest, RequestError> { + METRICS.get_api_requests.instance_info_count.inc(); + Ok(ParsedRequest::new_sync(VmmAction::GetMemoryMappings)) +} + +pub(crate) fn parse_get_memory() -> Result<ParsedRequest, RequestError> { + METRICS.get_api_requests.instance_info_count.inc(); + Ok(ParsedRequest::new_sync(VmmAction::GetMemory)) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::api_server::parsed_request::RequestAction; + + #[test] + fn test_parse_get_memory_mappings_request() { + match parse_get_memory_mappings().unwrap().into_parts() { + (RequestAction::Sync(action), _) if *action == VmmAction::GetMemoryMappings => {} + _ => panic!("Test failed."), + } + } + + #[test] + fn test_parse_get_memory_request() { + match parse_get_memory().unwrap().into_parts() { + (RequestAction::Sync(action), _) if *action == VmmAction::GetMemory => {} + _ => panic!("Test failed."), + } + } +} diff --git a/src/firecracker/src/api_server/request/mod.rs b/src/firecracker/src/api_server/request/mod.rs index 0c1622798f4..4442436986c 100644 --- a/src/firecracker/src/api_server/request/mod.rs +++ b/src/firecracker/src/api_server/request/mod.rs @@ -10,6 +10,7 @@ pub mod entropy; pub mod instance_info; pub mod logger; pub mod machine_configuration; +pub mod memory; pub mod metrics; pub mod mmds; pub mod net; diff --git a/src/firecracker/src/api_server/request/snapshot.rs b/src/firecracker/src/api_server/request/snapshot.rs index 8878c224b5c..448fa95ad48 100644 --- a/src/firecracker/src/api_server/request/snapshot.rs +++ b/src/firecracker/src/api_server/request/snapshot.rs @@ -139,7 +139,7 @@ mod tests { let expected_config = CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::from("foo"), - mem_file_path: PathBuf::from("bar"), + mem_file_path: Some(PathBuf::from("bar")), 
}; assert_eq!( vmm_action_from_request(parse_put_snapshot(&Body::new(body), Some("create")).unwrap()), @@ -153,7 +153,7 @@ mod tests { let expected_config = CreateSnapshotParams { snapshot_type: SnapshotType::Full, snapshot_path: PathBuf::from("foo"), - mem_file_path: PathBuf::from("bar"), + mem_file_path: Some(PathBuf::from("bar")), }; assert_eq!( vmm_action_from_request(parse_put_snapshot(&Body::new(body), Some("create")).unwrap()), diff --git a/src/firecracker/src/main.rs b/src/firecracker/src/main.rs index 8fb5392afcf..1f0ec961ae8 100644 --- a/src/firecracker/src/main.rs +++ b/src/firecracker/src/main.rs @@ -337,6 +337,7 @@ fn main_exec() -> Result<(), MainError> { state: VmState::NotStarted, vmm_version: FIRECRACKER_VERSION.to_string(), app_name: "Firecracker".to_string(), + memory_regions: None, }; if let Some(metrics_path) = arguments.single_value("metrics-path") { diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index 1f2edb714b8..4200d4d0fab 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -618,6 +618,35 @@ paths: schema: $ref: "#/definitions/Error" + /memory/mappings: + get: + summary: Gets the memory mappings with skippable pages bitmap. + operationId: getMemoryMappings + responses: + 200: + description: OK + schema: + $ref: "#/definitions/MemoryMappingsResponse" + default: + description: Internal server error + schema: + $ref: "#/definitions/Error" + + /memory: + get: + summary: Gets the memory info (resident and empty pages). + description: Returns an object with resident and empty bitmaps. The resident bitmap marks all pages that are resident. The empty bitmap marks zero pages (subset of resident pages). This is checked at the pageSize of each region. All regions must have the same page size. 
+ operationId: getMemory + responses: + 200: + description: OK + schema: + $ref: "#/definitions/MemoryResponse" + default: + description: Internal server error + schema: + $ref: "#/definitions/Error" + /version: get: summary: Gets the Firecracker version. @@ -991,6 +1020,59 @@ definitions: description: MicroVM hypervisor build version. type: string + GuestMemoryRegionMapping: + type: object + description: Describes the region of guest memory that can be used for creating the memfile. + required: + - base_host_virt_addr + - size + - offset + - page_size + properties: + base_host_virt_addr: + type: integer + size: + description: The size of the region in bytes. + type: integer + offset: + description: The offset of the region in bytes. + type: integer + page_size: + description: The page size in bytes. + type: integer + + MemoryMappingsResponse: + type: object + description: Response containing memory region mappings. + required: + - mappings + properties: + mappings: + type: array + description: The memory region mappings. + items: + $ref: "#/definitions/GuestMemoryRegionMapping" + + MemoryResponse: + type: object + description: Response containing the memory info (resident and empty pages). + required: + - resident + - empty + properties: + resident: + type: array + description: The resident bitmap as a vector of u64 values. Each bit represents if the page is resident. + items: + type: integer + format: uint64 + empty: + type: array + description: The empty bitmap as a vector of u64 values. Each bit represents if the page is zero (empty). This is a subset of the resident pages. 
+ items: + type: integer + format: uint64 + Logger: type: object description: @@ -1192,7 +1274,6 @@ definitions: SnapshotCreateParams: type: object required: - - mem_file_path - snapshot_path properties: mem_file_path: diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index c9a032edb95..6b22376c482 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -9,15 +9,18 @@ license = "Apache-2.0" bench = false [dependencies] -acpi_tables = { path = "../acpi-tables" } -aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] } +acpi_tables = { path = "../acpi-tables" } +aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] } arrayvec = { version = "0.7.6", optional = true } aws-lc-rs = { version = "1.10.0", features = ["bindgen"] } base64 = "0.22.1" bincode = "1.2.1" bitflags = "2.6.0" crc64 = "2.0.0" -derive_more = { version = "1.0.0", default-features = false, features = ["from", "display"] } +derive_more = { version = "1.0.0", default-features = false, features = [ + "from", + "display", +] } displaydoc = "0.2.5" event-manager = "0.4.0" gdbstub = { version = "0.7.3", optional = true } @@ -43,7 +46,10 @@ userfaultfd = "0.8.1" utils = { path = "../utils" } vhost = { version = "0.13.0", features = ["vhost-user-frontend"] } vm-allocator = "0.1.0" -vm-memory = { version = "0.16.0", features = ["backend-mmap", "backend-bitmap"] } +vm-memory = { version = "0.16.0", features = [ + "backend-mmap", + "backend-bitmap", +] } vm-superio = "0.8.0" vmm-sys-util = { version = "0.12.1", features = ["with-serde"] } zerocopy = { version = "0.8.8" } @@ -80,3 +86,8 @@ harness = false [lints] workspace = true + +[patch.crates-io] +aws-lc-sys = "=0.29.0" +aws-lc-rs = "=1.13.1" +aws-lc-fips-sys = "=0.13.7" diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index c80f004e789..3054d9ae30e 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -153,6 +153,7 @@ use crate::vstate::memory::{ use crate::vstate::vcpu::VcpuState; pub use 
crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse}; pub use crate::vstate::vm::Vm; +use serde::{Deserialize, Serialize}; /// Shorthand type for the EventManager flavour used by Firecracker. pub type EventManager = BaseEventManager>>; @@ -191,6 +192,20 @@ pub enum FcExitCode { ArgParsing = 153, } +/// Describes the region of guest memory that can be used for creating the memfile. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Deserialize, Serialize)] +pub struct GuestMemoryRegionMapping { + /// Base host virtual address where the guest memory contents for this region + /// should be copied/populated. + pub base_host_virt_addr: u64, + /// Region size. + pub size: usize, + /// Offset in the backend file/buffer where the region contents are. + pub offset: u64, + /// The configured page size for this memory region. + pub page_size: usize, +} + /// Timeout used in recv_timeout, when waiting for a vcpu response on /// Pause/Resume/Save/Restore. A high enough limit that should not be reached during normal usage, /// used to detect a potential vcpu deadlock. @@ -451,6 +466,22 @@ impl Vmm { &self.guest_memory } + /// Returns the memory mappings for the guest memory. 
+ pub fn guest_memory_mappings(&self, vm_info: &VmInfo) -> Vec<GuestMemoryRegionMapping> { + let mut offset = 0; + let mut mappings = Vec::new(); + for mem_region in self.guest_memory().iter() { + mappings.push(GuestMemoryRegionMapping { + base_host_virt_addr: mem_region.as_ptr() as u64, + size: mem_region.size(), + offset, + page_size: vm_info.huge_pages.page_size_kib(), + }); + offset += mem_region.size() as u64; + } + mappings + } + /// Sets RDA bit in serial console pub fn emulate_serial_init(&self) -> Result<(), EmulateSerialInitError> { // When restoring from a previously saved state, there is no serial diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 16d7ed72537..74c23bb4f75 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -171,7 +171,10 @@ pub fn create_snapshot( snapshot_state_to_file(&microvm_state, &params.snapshot_path)?; - snapshot_memory_to_file(vmm, &params.mem_file_path, params.snapshot_type)?; + // Dump memory to file only if mem_file_path is specified + if let Some(ref mem_file_path) = params.mem_file_path { + snapshot_memory_to_file(vmm, mem_file_path, params.snapshot_type)?; + } Ok(()) } diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 566228fd53a..60616b09f9a 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -25,8 +25,8 @@ use crate::vmm_config::balloon::{ use crate::vmm_config::boot_source::{BootSourceConfig, BootSourceConfigError}; use crate::vmm_config::drive::{BlockDeviceConfig, BlockDeviceUpdateConfig, DriveError}; use crate::vmm_config::entropy::{EntropyDeviceConfig, EntropyDeviceError}; -use crate::vmm_config::instance_info::InstanceInfo; -use crate::vmm_config::machine_config::{MachineConfig, MachineConfigUpdate, VmConfigError}; +use crate::vmm_config::instance_info::{InstanceInfo, MemoryMappingsResponse, MemoryResponse}; +use crate::vmm_config::machine_config::{MachineConfig, MachineConfigError, MachineConfigUpdate}; use crate::vmm_config::metrics::{MetricsConfig, 
MetricsConfigError}; use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::{ @@ -65,6 +65,10 @@ pub enum VmmAction { GetVmMachineConfig, /// Get microVM instance information. GetVmInstanceInfo, + /// Get memory mappings with skippable pages bitmap. + GetMemoryMappings, + /// Get memory info (resident and empty pages). + GetMemory, /// Get microVM version. GetVmmVersion, /// Flush the metrics. This action can only be called after the logger has been configured. @@ -189,6 +193,10 @@ pub enum VmmData { MmdsValue(serde_json::Value), /// The microVM instance information. InstanceInformation(InstanceInfo), + /// Memory mappings with skippable pages bitmap. + MemoryMappings(MemoryMappingsResponse), + /// Memory info (resident and empty pages). + Memory(MemoryResponse), /// The microVM version. VmmVersion(String), } @@ -419,6 +427,7 @@ impl<'a> PrebootApiController<'a> { &self.vm_resources.vm_config, ))), GetVmInstanceInfo => Ok(VmmData::InstanceInformation(self.instance_info.clone())), + GetMemoryMappings | GetMemory => Err(VmmActionError::OperationNotSupportedPreBoot), GetVmmVersion => Ok(VmmData::VmmVersion(self.instance_info.vmm_version.clone())), InsertBlockDevice(config) => self.insert_block_device(config), InsertNetworkDevice(config) => self.insert_net_device(config), @@ -646,9 +655,30 @@ impl RuntimeApiController { GetVmMachineConfig => Ok(VmmData::MachineConfiguration(MachineConfig::from( &self.vm_resources.vm_config, ))), - GetVmInstanceInfo => Ok(VmmData::InstanceInformation( - self.vmm.lock().expect("Poisoned lock").instance_info(), - )), + GetVmInstanceInfo => { + let locked_vmm = self.vmm.lock().expect("Poisoned lock"); + let instance_info = locked_vmm.instance_info(); + Ok(VmmData::InstanceInformation(instance_info)) + } + GetMemoryMappings => { + let locked_vmm = self.vmm.lock().expect("Poisoned lock"); + let mappings = locked_vmm + .vm + .guest_memory_mappings(&VmInfo::from(&self.vm_resources)); + + 
Ok(VmmData::MemoryMappings(MemoryMappingsResponse { mappings })) + } + GetMemory => { + let locked_vmm = self.vmm.lock().expect("Poisoned lock"); + let (resident_bitmap, empty_bitmap) = locked_vmm + .vm + .get_memory_info(&VmInfo::from(&self.vm_resources)) + .map_err(|e| VmmActionError::InternalVmm(VmmError::Vm(e)))?; + Ok(VmmData::Memory(MemoryResponse { + resident: resident_bitmap, + empty: empty_bitmap, + })) + } GetVmmVersion => Ok(VmmData::VmmVersion( self.vmm.lock().expect("Poisoned lock").version(), )), @@ -1150,7 +1180,7 @@ mod tests { CreateSnapshotParams { snapshot_type: SnapshotType::Full, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), }, ))); #[cfg(target_arch = "x86_64")] diff --git a/src/vmm/src/vmm_config/instance_info.rs b/src/vmm/src/vmm_config/instance_info.rs index 67fd335deaa..b945803cec9 100644 --- a/src/vmm/src/vmm_config/instance_info.rs +++ b/src/vmm/src/vmm_config/instance_info.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::fmt::{self, Display, Formatter}; +use crate::GuestMemoryRegionMapping; use serde::{ser, Serialize}; /// Enumerates microVM runtime states. @@ -46,4 +47,24 @@ pub struct InstanceInfo { pub vmm_version: String, /// The name of the application that runs the microVM. pub app_name: String, + /// The regions of the guest memory. + #[serde(skip_serializing_if = "Option::is_none")] + pub memory_regions: Option<Vec<GuestMemoryRegionMapping>>, +} + +/// Response structure for the memory mappings endpoint. +#[derive(Clone, Debug, PartialEq, Eq, Serialize)] +pub struct MemoryMappingsResponse { + /// The memory region mappings. + pub mappings: Vec<GuestMemoryRegionMapping>, +} + +/// Response structure for the memory endpoint. +#[derive(Clone, Debug, PartialEq, Eq, Serialize)] +pub struct MemoryResponse { + /// The resident bitmap as a vector of u64 values. Each bit represents if the page is resident. + pub resident: Vec<u64>, + /// The empty bitmap as a vector of u64 values. 
Each bit represents if the page is zero (empty). + /// This is a subset of the resident pages. + pub empty: Vec<u64>, } diff --git a/src/vmm/src/vmm_config/snapshot.rs b/src/vmm/src/vmm_config/snapshot.rs index e1850b74939..6ea0ee4a92a 100644 --- a/src/vmm/src/vmm_config/snapshot.rs +++ b/src/vmm/src/vmm_config/snapshot.rs @@ -44,7 +44,9 @@ pub struct CreateSnapshotParams { /// Path to the file that will contain the microVM state. pub snapshot_path: PathBuf, /// Path to the file that will contain the guest memory. - pub mem_file_path: PathBuf, + /// If not specified, the memory is not dumped to a file. + #[serde(skip_serializing_if = "Option::is_none")] + pub mem_file_path: Option<PathBuf>, } /// Stores the configuration that will be used for loading a snapshot. diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 0f72abcf68f..51b37354354 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -8,11 +8,19 @@ #[cfg(target_arch = "x86_64")] use std::fmt; -#[cfg(target_arch = "x86_64")] -use kvm_bindings::{ - kvm_clock_data, kvm_irqchip, kvm_pit_config, kvm_pit_state2, CpuId, MsrList, - KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, - KVM_MAX_CPUID_ENTRIES, KVM_PIT_SPEAKER_DUMMY, +use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region}; +use kvm_ioctls::VmFd; +use serde::{Deserialize, Serialize}; +use vmm_sys_util::eventfd::EventFd; + +use crate::arch::host_page_size; +pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; +use crate::logger::info; +use crate::persist::{CreateSnapshotError, VmInfo}; +use crate::utils::u64_to_usize; +use crate::vmm_config::snapshot::SnapshotType; +use crate::vstate::memory::{ + Address, GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, }; use kvm_bindings::{kvm_userspace_memory_region, KVM_API_VERSION, KVM_MEM_LOG_DIRTY_PAGES}; use kvm_ioctls::{Kvm, VmFd}; @@ -296,7 +304,139 @@ impl Vm { }) } - /// Restore the KVM VM 
state + /// Gets the memory info (resident and empty pages) for all memory regions. + /// Returns two bitmaps: resident (all resident pages) and empty (zero pages, subset of resident). + /// This checks at the pageSize of each region and requires all regions to have the same page size. + pub fn get_memory_info(&self, vm_info: &VmInfo) -> Result<(Vec<u64>, Vec<u64>), VmError> { + let mappings = self.guest_memory_mappings(vm_info); + + if mappings.is_empty() { + return Ok((Vec::new(), Vec::new())); + } + + // Check that all regions have the same page size + let page_size = mappings[0].page_size; + if mappings.iter().any(|m| m.page_size != page_size) { + return Err(VmError::InvalidMemoryConfiguration( + "All memory regions must have the same page size".to_string(), + )); + } + + // Calculate total number of pages across all regions + let total_pages: usize = mappings.iter().map(|m| m.size / page_size).sum(); + let bitmap_size = total_pages.div_ceil(64); + let mut resident_bitmap = vec![0u64; bitmap_size]; + let mut empty_bitmap = vec![0u64; bitmap_size]; + + let mut global_page_idx = 0; + + // SAFETY: We're reading from valid memory regions that we own + unsafe { + // Pre-allocate zero buffer once per page size (reused for all pages) + // This is the most important optimization - avoids repeated allocations + let zero_buf = vec![0u8; page_size]; + + for mapping in &mappings { + // Find the memory region that matches this mapping + let mem_region = self + .guest_memory() + .iter() + .find(|region| region.as_ptr() as u64 == mapping.base_host_virt_addr) + .expect("Memory region not found for mapping"); + + let region_ptr = mem_region.as_ptr(); + let region_size = mem_region.size(); + let num_pages = region_size / page_size; + + // Use mincore on the entire region to check residency + let sys_page_size = host_page_size(); + let mincore_pages = region_size.div_ceil(sys_page_size); + let mut mincore_vec = vec![0u8; mincore_pages]; + + let mincore_result = libc::mincore( + 
region_ptr.cast::(), + region_size, + mincore_vec.as_mut_ptr(), + ); + + // Check each page + for page_idx in 0..num_pages { + let page_offset = page_idx * page_size; + let page_ptr = region_ptr.add(page_offset); + + // Check if page is resident using mincore + let is_resident = if mincore_result == 0 { + let page_mincore_start = page_offset / sys_page_size; + let page_mincore_count = page_size.div_ceil(sys_page_size); + if page_mincore_start + page_mincore_count <= mincore_vec.len() { + // Page is resident if any 4KB sub-page is resident (check LSB only) + mincore_vec[page_mincore_start..page_mincore_start + page_mincore_count] + .iter() + .any(|&v| (v & 0x1) != 0) + } else { + false + } + } else { + // If mincore failed, assume resident (conservative approach) + true + }; + + let bitmap_idx = global_page_idx / 64; + let bit_idx = global_page_idx % 64; + + if is_resident { + // Set bit in resident bitmap + if bitmap_idx < resident_bitmap.len() { + resident_bitmap[bitmap_idx] |= 1u64 << bit_idx; + } + + // Check if page is zero (empty) + let is_zero = libc::memcmp( + page_ptr.cast::(), + zero_buf.as_ptr().cast::(), + page_size, + ) == 0; + + // Set bit in empty bitmap if page is zero + if is_zero && bitmap_idx < empty_bitmap.len() { + empty_bitmap[bitmap_idx] |= 1u64 << bit_idx; + } + } + + global_page_idx += 1; + } + } + } + + Ok((resident_bitmap, empty_bitmap)) + } + + /// Resets the KVM dirty bitmap for each of the guest's memory regions. + pub fn reset_dirty_bitmap(&self) { + self.guest_memory() + .iter() + .zip(0u32..) + .for_each(|(region, slot)| { + let _ = self.fd().get_dirty_log(slot, u64_to_usize(region.len())); + }); + } + + /// Retrieves the KVM dirty bitmap for each of the guest's memory regions. + pub fn get_dirty_bitmap(&self) -> Result { + let mut bitmap: DirtyBitmap = HashMap::new(); + self.guest_memory() + .iter() + .zip(0u32..) 
+ .try_for_each(|(region, slot)| { + self.fd() + .get_dirty_log(slot, u64_to_usize(region.len())) + .map(|bitmap_region| _ = bitmap.insert(slot, bitmap_region)) + })?; + Ok(bitmap) + } + + /// Takes a snapshot of the virtual machine running inside the given [`Vmm`] and saves it to + /// `mem_file_path`. /// /// # Errors /// diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 4312c6345db..2d8a7aed580 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -212,7 +212,7 @@ fn verify_create_snapshot(is_diff: bool) -> (TempFile, TempFile) { let snapshot_params = CreateSnapshotParams { snapshot_type, snapshot_path: snapshot_file.as_path().to_path_buf(), - mem_file_path: memory_file.as_path().to_path_buf(), + mem_file_path: Some(memory_file.as_path().to_path_buf()), }; controller diff --git a/tests/framework/http_api.py b/tests/framework/http_api.py index a1ee37174b0..1442a253b25 100644 --- a/tests/framework/http_api.py +++ b/tests/framework/http_api.py @@ -123,3 +123,5 @@ def __init__(self, api_usocket_full_name): self.snapshot_load = Resource(self, "/snapshot/load") self.cpu_config = Resource(self, "/cpu-config") self.entropy = Resource(self, "/entropy") + self.memory_mappings = Resource(self, "/memory/mappings") + self.memory = Resource(self, "/memory") diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index 5aebe7b5265..a03e8020098 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -18,6 +18,7 @@ import host_tools.network as net_tools from framework import utils_cpuid from framework.utils import get_firecracker_version_from_toml, is_io_uring_supported +from framework.microvm import HugePagesConfig MEM_LIMIT = 1000000000 @@ -1389,3 +1390,251 @@ def test_negative_snapshot_load_api(microvm_factory): # The snapshot/memory files above don't exist, but the request is otherwise 
syntactically valid.
     # In this case, Firecracker exits.
     vm.mark_killed()
+
+
+def test_memory_mappings_pre_boot(uvm_plain):
+    """Test that memory mappings endpoint is not available before boot."""
+    test_microvm = uvm_plain
+    test_microvm.spawn()
+    test_microvm.basic_config()
+
+    # Use session directly since get() asserts on 200
+    url = test_microvm.api.endpoint + "/memory/mappings"
+    res = test_microvm.api.session.get(url)
+    assert res.status_code == 400
+    assert NOT_SUPPORTED_BEFORE_START in res.json()["fault_message"]
+
+
+def test_memory_pre_boot(uvm_plain):
+    """Test that memory endpoint is not available before boot."""
+    test_microvm = uvm_plain
+    test_microvm.spawn()
+    test_microvm.basic_config()
+
+    # Use session directly since get() asserts on 200
+    url = test_microvm.api.endpoint + "/memory"
+    res = test_microvm.api.session.get(url)
+    assert res.status_code == 400
+    assert NOT_SUPPORTED_BEFORE_START in res.json()["fault_message"]
+
+
+def test_memory_mappings_post_boot(uvm_plain):
+    """Test that memory mappings endpoint works after boot with hugepages."""
+    test_microvm = uvm_plain
+    test_microvm.spawn()
+    test_microvm.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB)
+    test_microvm.start()
+
+    response = test_microvm.api.memory_mappings.get()
+    assert response.status_code == 200
+
+    data = response.json()
+    assert isinstance(data, dict)
+    assert "mappings" in data
+    mappings = data["mappings"]
+    assert isinstance(mappings, list)
+    assert len(mappings) > 0
+
+    # Verify structure of each mapping
+    for mapping in mappings:
+        assert "base_host_virt_addr" in mapping
+        assert "size" in mapping
+        assert "offset" in mapping
+        assert "page_size" in mapping
+        assert isinstance(mapping["base_host_virt_addr"], int)
+        assert isinstance(mapping["size"], int)
+        assert isinstance(mapping["offset"], int)
+        assert isinstance(mapping["page_size"], int)
+        assert mapping["size"] > 0
+        # Verify page size is 2MB (2097152 bytes) for hugepages
+        assert mapping["page_size"] == 2 * 1024 * 1024
+
+
+def test_memory_post_boot(uvm_plain):
+    """Test that memory endpoint works after boot with hugepages."""
+    test_microvm = uvm_plain
+    test_microvm.spawn()
+    test_microvm.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB)
+    test_microvm.start()
+
+    # Get memory mappings to determine page size and total memory
+    mappings_response = test_microvm.api.memory_mappings.get()
+    assert mappings_response.status_code == 200
+    mappings_data = mappings_response.json()
+    assert isinstance(mappings_data, dict)
+    assert "mappings" in mappings_data
+    mappings = mappings_data["mappings"]
+    assert len(mappings) > 0
+
+    # All regions should have the same page size (2MB for hugepages)
+    page_size = mappings[0]["page_size"]
+    assert page_size == 2 * 1024 * 1024, "Expected 2MB page size for hugepages"
+
+    # Verify all regions have the same page size
+    for mapping in mappings:
+        assert (
+            mapping["page_size"] == page_size
+        ), "All regions must have the same page size"
+
+    total_memory_size = sum(mapping["size"] for mapping in mappings)
+    total_pages = total_memory_size // page_size
+    expected_bitmap_size = (total_pages + 63) // 64  # ceil(total_pages / 64)
+
+    # Get memory info
+    response = test_microvm.api.memory.get()
+    assert response.status_code == 200
+
+    data = response.json()
+    assert isinstance(data, dict)
+    assert "resident" in data
+    assert "empty" in data
+    resident_bitmap = data["resident"]
+    empty_bitmap = data["empty"]
+    assert isinstance(resident_bitmap, list)
+    assert isinstance(empty_bitmap, list)
+    assert len(resident_bitmap) == expected_bitmap_size
+    assert len(empty_bitmap) == expected_bitmap_size
+
+    # Verify all values are valid u64 integers
+    for value in resident_bitmap:
+        assert isinstance(value, int)
+        assert value >= 0
+        assert value <= 0xFFFFFFFFFFFFFFFF  # Max u64 value
+
+    for value in empty_bitmap:
+        assert isinstance(value, int)
+        assert value >= 0
+        assert value <= 0xFFFFFFFFFFFFFFFF  # Max u64 value
+
+    # After boot, there should be at least one resident page
+    has_resident_page = any(value != 0 for value in resident_bitmap)
+    assert has_resident_page, "Expected at least one resident page after VM boot"
+
+    # Empty pages should be a subset of resident pages
+    # (empty_bitmap & resident_bitmap) == empty_bitmap
+    for empty_word, resident_word in zip(empty_bitmap, resident_bitmap):
+        assert (
+            empty_word & resident_word
+        ) == empty_word, "Empty pages must be a subset of resident pages"
+
+
+@pytest.mark.nonci
+def test_memory_benchmark(microvm_factory, guest_kernel_linux_6_1, rootfs):
+    """Benchmark the memory endpoint performance (resident + zero page checking).
+
+    Skipped when too few 2MB hugepages are free to back even the smallest guest.
+    """
+    test_microvm = microvm_factory.build(guest_kernel_linux_6_1, rootfs)
+
+    # Use larger memory size for benchmarking
+    # Check available hugepages and use a size that fits (need at least some headroom)
+    # Default to 256MB if we can't determine, or use available - 64MB headroom
+    free_hugepages_path = "/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages"
+    try:
+        with open(free_hugepages_path, encoding="utf-8") as f:
+            free_hugepages = int(f.read().strip())
+    except (OSError, ValueError):
+        # Fallback to 256MB if we can't read hugepage info
+        mem_size_mib = 256
+    else:
+        # The 128MB floor applied below cannot be backed by fewer than 64 free pages
+        if free_hugepages * 2 < 128:
+            pytest.skip("not enough free 2MB hugepages for the memory benchmark")
+        # Each hugepage is 2MB, reserve 32 pages (64MB) for system
+        available_mib = max(128, (free_hugepages - 32) * 2)
+        mem_size_mib = min(1024, available_mib)  # Cap at 1GB for proper benchmark
+    test_microvm.spawn()
+    test_microvm.basic_config(
+        mem_size_mib=mem_size_mib, huge_pages=HugePagesConfig.HUGETLBFS_2MB
+    )
+    # Add network interface for SSH access
+    test_microvm.add_net_iface()
+    test_microvm.start()
+
+    # Get memory mappings to determine actual memory size
+    mappings_response = test_microvm.api.memory_mappings.get()
+    assert mappings_response.status_code == 200
+    mappings_data = mappings_response.json()
+    mappings = mappings_data["mappings"]
+
+    # Calculate total memory size
+    total_memory_bytes = sum(mapping["size"] for mapping in mappings)
+    total_memory_mib = total_memory_bytes / (1024 * 1024)
+    page_size = mappings[0]["page_size"]
+
+    # Ensure memory is resident by writing zeros to it via guest
+    # This will fault in the pages and make them resident
+    # Using tmpfs (/dev/shm) ensures the memory is actually resident
+    # Allocate a reasonable portion (e.g., 256MB) to avoid freezing the sandbox
+    fault_memory_mib = min(256, int(total_memory_mib * 0.25))  # 25% or max 256MB
+    dd_cmd = f"dd if=/dev/zero of=/dev/shm/zero_mem bs=1M count={fault_memory_mib}"
+    test_microvm.ssh.run(f"{dd_cmd} 2>/dev/null || true")
+
+    # Give the system a moment to fault in pages
+    time.sleep(0.1)
+
+    # Benchmark the /memory endpoint call
+    start_time = time.perf_counter()
+    response = test_microvm.api.memory.get()
+    end_time = time.perf_counter()
+
+    assert response.status_code == 200
+    data = response.json()
+    assert "resident" in data
+    assert "empty" in data
+
+    # Verify the response is valid
+    resident_bitmap = data["resident"]
+    empty_bitmap = data["empty"]
+
+    # Calculate expected bitmap size (page_size was read from the mappings above)
+    total_pages = total_memory_bytes // page_size
+    expected_bitmap_size = (total_pages + 63) // 64
+
+    assert len(resident_bitmap) == expected_bitmap_size
+    assert len(empty_bitmap) == expected_bitmap_size
+
+    # Count actual resident pages (faulted-in memory)
+    # Each set bit in a u64 word marks one resident page
+    resident_page_count = sum(bin(word).count("1") for word in resident_bitmap)
+
+    # Calculate resident memory size (actual memory that was checked)
+    resident_memory_bytes = resident_page_count * page_size
+    resident_memory_mib = resident_memory_bytes / (1024 * 1024)
+
+    # Calculate elapsed time and throughput based on actual resident memory
+    elapsed_seconds = end_time - start_time
+
+    if resident_memory_bytes > 0:
+        throughput_mib_per_sec = resident_memory_mib / elapsed_seconds
+        time_per_mb_ms = (elapsed_seconds * 1000) / resident_memory_mib
+    else:
+        throughput_mib_per_sec = 0
+        time_per_mb_ms = 0
+
+    # Count empty pages
+    empty_page_count = sum(bin(word).count("1") for word in empty_bitmap)
+
+    # Print benchmark results
+    print(f"\n{'='*60}")
+    print("Memory Benchmark Results")
+    print(f"{'='*60}")
+    print(
+        f"Total Memory: {total_memory_mib:.2f} MiB ({total_memory_bytes / (1024**3):.3f} GB)"
+    )
+    print(
+        f"Resident Pages: {resident_page_count} / {total_pages} ({resident_page_count * 100 / total_pages:.1f}%)"
+    )
+    print(
+        f"Resident Memory: {resident_memory_mib:.2f} MiB ({resident_memory_bytes / (1024**3):.3f} GB)"
+    )
+    print(
+        f"Empty Pages: {empty_page_count} / {resident_page_count} ({empty_page_count * 100 / resident_page_count if resident_page_count > 0 else 0:.1f}% of resident)"
+    )
+    print(f"Elapsed Time: {elapsed_seconds*1000:.2f} ms")
+    print(f"Throughput (resident): {throughput_mib_per_sec:.2f} MiB/s")
+    print(f"Time per MB (resident): {time_per_mb_ms:.3f} ms/MB")
+    print(f"{'='*60}\n")
+
+    # Verify at least some pages are resident
+    assert resident_page_count > 0, "Expected at least one resident page"