Commit 1eed9e2

nvme_driver: don't flr nvme devices (#1714)
The default `vfio` device behavior is to issue a function level reset when attaching or detaching devices. It does so because the device is in an unknown or untrusted state. However, within the context of a trusted virtualization stack, OpenHCL can reasonably trust the state and behavior of the device. So, optimize performance by removing these function level resets for nvme devices. This follows the same model as already exists for MANA devices.

The `nvme_driver` already shuts down the device (see `NvmeDriver::reset()`) and waits for the device to become disabled. A well behaved nvme device will not issue DMA after this point. That same device should tolerate a graceful start without an FLR.

Pending work before this PR is ready to commit:

- [x] Initial poc
- [x] Parameterize via command line (provide a way to disable, and also easily run A/B tests)
- [x] Fix #1714 (comment)
- [x] Check no regressions on CI
- [x] Test servicing locally with real NVMe devices
1 parent 6aad428 commit 1eed9e2

4 files changed (+111 −16 lines)
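Before the per-file diffs, some background on the commit message's claim that a shut-down controller stops issuing DMA: disabling an NVMe controller means clearing CC.EN and then waiting for CSTS.RDY to fall, per the NVMe spec's controller registers. The sketch below illustrates only that contract; it is not the actual `NvmeDriver::reset()` code, and the register accessors and fake register file are stand-ins.

```rust
// Illustrative sketch of NVMe controller disable per the NVMe spec.
// Not the real NvmeDriver::reset(); `read32`/`write32` stand in for
// the driver's MMIO accessors.
const CC: usize = 0x14; // Controller Configuration register offset
const CSTS: usize = 0x1c; // Controller Status register offset

fn disable_controller(read32: impl Fn(usize) -> u32, write32: impl Fn(usize, u32)) {
    // Clear CC.EN (bit 0) to ask the controller to stop.
    let cc = read32(CC);
    write32(CC, cc & !1);
    // Wait for CSTS.RDY (bit 0) to clear; a well behaved controller
    // issues no further DMA once it reports not-ready.
    while read32(CSTS) & 1 != 0 {
        std::hint::spin_loop();
    }
}

fn main() {
    use std::cell::Cell;
    // Tiny fake register file ([CC, CSTS]) to exercise the sketch.
    let regs = [Cell::new(1u32), Cell::new(0u32)];
    let idx = |off: usize| if off == CC { 0 } else { 1 };
    disable_controller(|off| regs[idx(off)].get(), |off, v| regs[idx(off)].set(v));
    println!("CC after disable: {:#x}", regs[0].get());
}
```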

openhcl/underhill_core/src/lib.rs

Lines changed: 1 addition & 0 deletions
```diff
@@ -323,6 +323,7 @@ async fn launch_workers(
         gdbstub: opt.gdbstub,
         hide_isolation: opt.hide_isolation,
         nvme_keep_alive: opt.nvme_keep_alive,
+        nvme_always_flr: opt.nvme_always_flr,
         test_configuration: opt.test_configuration,
         disable_uefi_frontpage: opt.disable_uefi_frontpage,
     };
```

openhcl/underhill_core/src/nvme_manager.rs

Lines changed: 100 additions & 16 deletions
```diff
@@ -26,9 +26,12 @@ use pal_async::task::Spawn;
 use pal_async::task::Task;
 use std::collections::HashMap;
 use std::collections::hash_map;
+use std::sync::Arc;
 use thiserror::Error;
 use tracing::Instrument;
+use user_driver::vfio::PciDeviceResetMethod;
 use user_driver::vfio::VfioDevice;
+use user_driver::vfio::vfio_set_device_reset_method;
 use vm_resource::AsyncResolveResource;
 use vm_resource::ResourceId;
 use vm_resource::ResourceResolver;
@@ -92,6 +95,7 @@ impl NvmeManager {
         driver_source: &VmTaskDriverSource,
         vp_count: u32,
         save_restore_supported: bool,
+        nvme_always_flr: bool,
         is_isolated: bool,
         saved_state: Option<NvmeSavedState>,
         dma_client_spawner: DmaClientSpawner,
@@ -103,6 +107,7 @@ impl NvmeManager {
             devices: HashMap::new(),
             vp_count,
             save_restore_supported,
+            nvme_always_flr,
             is_isolated,
             dma_client_spawner,
         };
@@ -220,12 +225,99 @@ struct NvmeManagerWorker {
     vp_count: u32,
     /// Running environment (memory layout) allows save/restore.
     save_restore_supported: bool,
+    nvme_always_flr: bool,
     /// If this VM is isolated or not. This influences DMA client allocations.
     is_isolated: bool,
     #[inspect(skip)]
     dma_client_spawner: DmaClientSpawner,
 }
 
+async fn create_nvme_device(
+    driver_source: &VmTaskDriverSource,
+    pci_id: &str,
+    vp_count: u32,
+    nvme_always_flr: bool,
+    is_isolated: bool,
+    dma_client: Arc<dyn user_driver::DmaClient>,
+) -> Result<nvme_driver::NvmeDriver<VfioDevice>, InnerError> {
+    // Disable FLR on vfio attach/detach; this allows faster system
+    // startup/shutdown with the caveat that the device needs to be properly
+    // sent through the shutdown path during servicing operations, as that is
+    // the only cleanup performed. If the device fails to initialize, turn FLR
+    // on and try again, so that the reset is invoked on the next attach.
+    let update_reset = |method: PciDeviceResetMethod| {
+        if let Err(err) = vfio_set_device_reset_method(pci_id, method) {
+            tracing::warn!(
+                ?method,
+                err = &err as &dyn std::error::Error,
+                "Failed to update reset_method"
+            );
+        }
+    };
+    let mut last_err = None;
+    let reset_methods = if nvme_always_flr {
+        &[PciDeviceResetMethod::Flr][..]
+    } else {
+        // If this code can't create a device without resetting it, then still try to issue an FLR
+        // in case that unwedges something weird in the device state.
+        // (This is implicit when the code in [`try_create_nvme_device`] opens a handle to the
+        // Vfio device).
+        &[PciDeviceResetMethod::NoReset, PciDeviceResetMethod::Flr][..]
+    };
+    for reset_method in reset_methods {
+        update_reset(*reset_method);
+        match try_create_nvme_device(
+            driver_source,
+            pci_id,
+            vp_count,
+            is_isolated,
+            dma_client.clone(),
+        )
+        .await
+        {
+            Ok(device) => {
+                if !nvme_always_flr && !matches!(reset_method, PciDeviceResetMethod::NoReset) {
+                    update_reset(PciDeviceResetMethod::NoReset);
+                }
+                return Ok(device);
+            }
+            Err(err) => {
+                tracing::error!(
+                    pci_id,
+                    ?reset_method,
+                    err = &err as &dyn std::error::Error,
+                    "failed to create nvme device"
+                );
+                last_err = Some(err);
+            }
+        }
+    }
+    // Return the most reliable error (this code assumes that the reset methods are in increasing order
+    // of reliability).
+    Err(last_err.unwrap())
+}
+
+async fn try_create_nvme_device(
+    driver_source: &VmTaskDriverSource,
+    pci_id: &str,
+    vp_count: u32,
+    is_isolated: bool,
+    dma_client: Arc<dyn user_driver::DmaClient>,
+) -> Result<nvme_driver::NvmeDriver<VfioDevice>, InnerError> {
+    let device = VfioDevice::new(driver_source, pci_id, dma_client)
+        .instrument(tracing::info_span!("vfio_device_open", pci_id))
+        .await
+        .map_err(InnerError::Vfio)?;
+
+    // TODO: For now, any isolation means use bounce buffering. This
+    // needs to change when we have nvme devices that support DMA to
+    // confidential memory.
+    nvme_driver::NvmeDriver::new(driver_source, vp_count, device, is_isolated)
+        .instrument(tracing::info_span!("nvme_driver_init", pci_id))
+        .await
+        .map_err(InnerError::DeviceInitFailed)
+}
+
 impl NvmeManagerWorker {
     async fn run(&mut self, mut recv: mesh::Receiver<Request>) {
         let (join_span, nvme_keepalive) = loop {
@@ -315,26 +407,18 @@ impl NvmeManagerWorker {
                     })
                     .map_err(InnerError::DmaClient)?;
 
-                let device = VfioDevice::new(&self.driver_source, entry.key(), dma_client)
-                    .instrument(tracing::info_span!("vfio_device_open", pci_id))
-                    .await
-                    .map_err(InnerError::Vfio)?;
-
-                // TODO: For now, any isolation means use bounce buffering. This
-                // needs to change when we have nvme devices that support DMA to
-                // confidential memory.
-                let driver = nvme_driver::NvmeDriver::new(
+                let driver = create_nvme_device(
                     &self.driver_source,
+                    &pci_id,
                     self.vp_count,
-                    device,
+                    self.nvme_always_flr,
                     self.is_isolated,
+                    dma_client,
                 )
-                .instrument(tracing::info_span!(
-                    "nvme_driver_init",
-                    pci_id = entry.key()
-                ))
-                .await
-                .map_err(InnerError::DeviceInitFailed)?;
+                .instrument(
+                    tracing::info_span!("create_nvme_device", %pci_id, self.nvme_always_flr),
+                )
+                .await?;
 
                 entry.insert(driver)
             }
```
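The `vfio_set_device_reset_method` helper called above lives in `user_driver::vfio` and is not part of this diff. For orientation, here is a minimal sketch of the kind of thing such a helper can do, assuming the kernel's standard PCI sysfs `reset_method` attribute is the control point; the exact mechanism and attribute values OpenHCL uses (in particular whatever encodes `NoReset`) are assumptions here, not confirmed by this commit.

```rust
use std::io;
use std::path::Path;

// Hypothetical sketch only: steer the kernel's choice of PCI reset by
// writing to /sys/bus/pci/devices/<id>/reset_method. The real helper in
// user_driver::vfio may use a different mechanism or encoding.
fn set_reset_method(pci_id: &str, method: &str) -> io::Result<()> {
    let path = Path::new("/sys/bus/pci/devices")
        .join(pci_id)
        .join("reset_method");
    // e.g. method = "flr" restricts resets to a function level reset.
    std::fs::write(path, method)
}

fn main() -> io::Result<()> {
    // Example call with a hypothetical device address.
    set_reset_method("0000:00:00.0", "flr")
}
```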

openhcl/underhill_core/src/options.rs

Lines changed: 7 additions & 0 deletions
```diff
@@ -138,6 +138,11 @@ pub struct Options {
     /// (OPENHCL_NVME_KEEP_ALIVE=1) Enable nvme keep alive when servicing.
     pub nvme_keep_alive: bool,
 
+    /// (OPENHCL_NVME_ALWAYS_FLR=1)
+    /// Always use the FLR (Function Level Reset) path for NVMe devices,
+    /// even if we would otherwise attempt to use VFIO's NoReset support.
+    pub nvme_always_flr: bool,
+
     /// (OPENHCL_TEST_CONFIG=\<TestScenarioConfig\>)
     /// Test configurations are designed to replicate specific behaviors and
     /// conditions in order to simulate various test scenarios.
@@ -233,6 +238,7 @@ impl Options {
         let gdbstub = parse_legacy_env_bool("OPENHCL_GDBSTUB");
         let gdbstub_port = parse_legacy_env_number("OPENHCL_GDBSTUB_PORT")?.map(|x| x as u32);
         let nvme_keep_alive = parse_env_bool("OPENHCL_NVME_KEEP_ALIVE");
+        let nvme_always_flr = parse_env_bool("OPENHCL_NVME_ALWAYS_FLR");
         let test_configuration = parse_env_string("OPENHCL_TEST_CONFIG").and_then(|x| {
             x.to_string_lossy()
                 .parse::<TestScenarioConfig>()
@@ -299,6 +305,7 @@
             halt_on_guest_halt,
             no_sidecar_hotplug,
             nvme_keep_alive,
+            nvme_always_flr,
             test_configuration,
             disable_uefi_frontpage,
         })
```
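Opting back into the old behavior is then a matter of launching OpenHCL with `OPENHCL_NVME_ALWAYS_FLR=1`. A minimal sketch of boolean env-var parsing, for illustration only; the real `parse_env_bool` in options.rs is not shown in this diff and may accept other spellings.

```rust
use std::env;

// Illustrative stand-in for options.rs's parse_env_bool; assumption: "1"
// or "true" (case-insensitive) means enabled, anything else disabled.
fn parse_env_bool(name: &str) -> bool {
    env::var(name).map_or(false, |v| v == "1" || v.eq_ignore_ascii_case("true"))
}

fn main() {
    // e.g. launched with OPENHCL_NVME_ALWAYS_FLR=1
    let nvme_always_flr = parse_env_bool("OPENHCL_NVME_ALWAYS_FLR");
    println!("nvme_always_flr = {nvme_always_flr}");
}
```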

openhcl/underhill_core/src/worker.rs

Lines changed: 3 additions & 0 deletions
```diff
@@ -282,6 +282,8 @@ pub struct UnderhillEnvCfg {
     pub hide_isolation: bool,
     /// Enable nvme keep alive.
     pub nvme_keep_alive: bool,
+    /// Don't skip FLR for NVMe devices.
+    pub nvme_always_flr: bool,
     /// test configuration
     pub test_configuration: Option<TestScenarioConfig>,
     /// Disable the UEFI front page.
@@ -1881,6 +1883,7 @@ async fn new_underhill_vm(
         &driver_source,
         processor_topology.vp_count(),
         save_restore_supported,
+        env_cfg.nvme_always_flr,
         isolation.is_isolated(),
         servicing_state.nvme_state.unwrap_or(None),
         dma_manager.client_spawner(),
```
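Taken together, the new flag flows from `OPENHCL_NVME_ALWAYS_FLR` through `Options` and `UnderhillEnvCfg` into `NvmeManager::new`, and finally into `NvmeManagerWorker`, where it picks which reset methods `create_nvme_device` attempts. A compact, self-contained sketch of that selection policy; the enum here is an illustrative stand-in for `user_driver::vfio::PciDeviceResetMethod`.

```rust
/// Illustrative stand-in for user_driver::vfio::PciDeviceResetMethod.
#[derive(Clone, Copy, Debug, PartialEq)]
enum ResetMethod {
    NoReset,
    Flr,
}

/// Reset methods to attempt, in increasing order of reliability; the error
/// from the last (most reliable) attempt is the one worth reporting.
fn reset_order(always_flr: bool) -> &'static [ResetMethod] {
    if always_flr {
        &[ResetMethod::Flr]
    } else {
        &[ResetMethod::NoReset, ResetMethod::Flr]
    }
}

fn main() {
    assert_eq!(reset_order(false), &[ResetMethod::NoReset, ResetMethod::Flr]);
    assert_eq!(reset_order(true), &[ResetMethod::Flr]);
}
```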
