From d8e2347fef92f34ea8b8fdfb46f03fb9cac36c9c Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Thu, 10 Apr 2025 15:54:36 +0100 Subject: [PATCH 1/3] Increase delay before trying to enable sriov Starting this too early gets the cards into a bad state. You get the following error when trying to start an mdev device: ``` stderr: 'Error: Parent 0000:81:00.4 is not currently registered for mdev support' ``` --- roles/vgpu/templates/nvidia-sriov.service.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/vgpu/templates/nvidia-sriov.service.j2 b/roles/vgpu/templates/nvidia-sriov.service.j2 index 4cd4db0..a7fa3f1 100644 --- a/roles/vgpu/templates/nvidia-sriov.service.j2 +++ b/roles/vgpu/templates/nvidia-sriov.service.j2 @@ -13,7 +13,7 @@ User=root # NOTE(wszumski): There is a race in the driver initialization where if we run # this too early, then the mdev_support_devices entry doesn't show up in sysfs. # I was unable to get this to show up again without a reboot. -ExecStartPre=/bin/sleep 5 +ExecStartPre=/bin/sleep 30 # NOTE(wszumski): The sriov-manage script will unbind the nvidia driver to # initialize the virtual functions. If it fails part way through, the driver # can be left unbound, and subsequent executions of sriov-mange will fail. This From 84fe8ce53e0ed2c6c82613f2ce474c5d0a6312c4 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Thu, 10 Apr 2025 16:50:20 +0100 Subject: [PATCH 2/3] Address review comments --- roles/vgpu/defaults/main.yml | 3 +++ roles/vgpu/templates/nvidia-sriov.service.j2 | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/roles/vgpu/defaults/main.yml b/roles/vgpu/defaults/main.yml index 7a01b1f..2a44958 100644 --- a/roles/vgpu/defaults/main.yml +++ b/roles/vgpu/defaults/main.yml @@ -5,6 +5,9 @@ vgpu_driver_dkms: false vgpu_do_reboot: true vgpu_reboot_timeout: 3600 +# Time in seconds to sleep before enabling sriov on +vgpu_sriov_init_delay: 30 + # Deprecated: use vgpu_definitions instead. vgpu_mig_definitions: [] vgpu_definitions: "{{ vgpu_mig_definitions }}" diff --git a/roles/vgpu/templates/nvidia-sriov.service.j2 b/roles/vgpu/templates/nvidia-sriov.service.j2 index a7fa3f1..d57f69f 100644 --- a/roles/vgpu/templates/nvidia-sriov.service.j2 +++ b/roles/vgpu/templates/nvidia-sriov.service.j2 @@ -13,7 +13,7 @@ User=root # NOTE(wszumski): There is a race in the driver initialization where if we run # this too early, then the mdev_support_devices entry doesn't show up in sysfs. # I was unable to get this to show up again without a reboot. -ExecStartPre=/bin/sleep 30 +ExecStartPre=/bin/sleep {{ vgpu_sriov_init_delay }} # NOTE(wszumski): The sriov-manage script will unbind the nvidia driver to # initialize the virtual functions. If it fails part way through, the driver # can be left unbound, and subsequent executions of sriov-mange will fail. This From 58a8667ee1c876a7cfdfbff918eda6e682d89e3d Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Thu, 10 Apr 2025 16:52:36 +0100 Subject: [PATCH 3/3] Whitespace... --- roles/vgpu/defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/vgpu/defaults/main.yml b/roles/vgpu/defaults/main.yml index 2a44958..4e2290f 100644 --- a/roles/vgpu/defaults/main.yml +++ b/roles/vgpu/defaults/main.yml @@ -5,7 +5,7 @@ vgpu_driver_dkms: false vgpu_do_reboot: true vgpu_reboot_timeout: 3600 -# Time in seconds to sleep before enabling sriov on +# Time in seconds to sleep before enabling sriov vgpu_sriov_init_delay: 30 # Deprecated: use vgpu_definitions instead.