diff --git a/roles/vgpu/defaults/main.yml b/roles/vgpu/defaults/main.yml index 7a01b1f..4e2290f 100644 --- a/roles/vgpu/defaults/main.yml +++ b/roles/vgpu/defaults/main.yml @@ -5,6 +5,9 @@ vgpu_driver_dkms: false vgpu_do_reboot: true vgpu_reboot_timeout: 3600 +# Time in seconds to wait for driver initialization before enabling SR-IOV. +vgpu_sriov_init_delay: 30 + # Deprecated: use vgpu_definitions instead. vgpu_mig_definitions: [] vgpu_definitions: "{{ vgpu_mig_definitions }}" diff --git a/roles/vgpu/templates/nvidia-sriov.service.j2 b/roles/vgpu/templates/nvidia-sriov.service.j2 index 4cd4db0..d57f69f 100644 --- a/roles/vgpu/templates/nvidia-sriov.service.j2 +++ b/roles/vgpu/templates/nvidia-sriov.service.j2 @@ -13,7 +13,7 @@ User=root # NOTE(wszumski): There is a race in the driver initialization where if we run # this too early, then the mdev_support_devices entry doesn't show up in sysfs. # I was unable to get this to show up again without a reboot. -ExecStartPre=/bin/sleep 5 +ExecStartPre=/bin/sleep {{ vgpu_sriov_init_delay }} # NOTE(wszumski): The sriov-manage script will unbind the nvidia driver to # initialize the virtual functions. If it fails part way through, the driver # can be left unbound, and subsequent executions of sriov-mange will fail. This