From 2408b38b11bb6c138dd9e94f572f45355dc7acc4 Mon Sep 17 00:00:00 2001
From: Will Szumski
Date: Fri, 10 Jan 2025 13:16:21 +0000
Subject: [PATCH 1/6] Make enabling sriov more reliable

There is a race condition on boot:

```
[stack@gpu2 ~]$ sudo journalctl -u nvidia-sriov-0000:17:00.0.service
Jan 09 12:13:11 gpu2 systemd[1]: Starting Enable SR-IOV on Nvidia card (0000:17:00.0)...
Jan 09 12:13:16 gpu2 sriov-manage[3802]: Enabling VFs on 0000:17:00.0
Jan 09 12:13:17 gpu2 sriov-manage[3837]: awk: (FILENAME=- FNR=1) warning: error writing standard output: File exists
Jan 09 12:13:17 gpu2 sriov-manage[3899]: awk: (FILENAME=- FNR=1) warning: error writing standard output: No such device
Jan 09 12:13:17 gpu2 systemd[1]: nvidia-sriov-0000:17:00.0.service: Main process exited, code=exited, status=1/FAILURE
Jan 09 12:13:17 gpu2 systemd[1]: nvidia-sriov-0000:17:00.0.service: Failed with result 'exit-code'.
Jan 09 12:13:17 gpu2 systemd[1]: Failed to start Enable SR-IOV on Nvidia card (0000:17:00.0).
```

This has been observed on Rocky 9.4 with
NVIDIA-AI-Enterprise-Linux-KVM-550.127.06-550.127.05-553.24.

We can work around this by retrying. The sriov-manage script can leave the
driver unbound, so we first ensure that the NVIDIA driver is bound to the
card.
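For reference, the manual recovery on an affected host looks roughly like
this (a sketch only: the PCI address is the example from the log above, and
the exact failure output may differ):

```
# Show which driver, if any, the card is currently bound to.
readlink /sys/bus/pci/devices/0000:17:00.0/driver

# Rebind the nvidia driver if it was left unbound (the write fails
# harmlessly if the driver is already bound).
echo 0000:17:00.0 | sudo tee /sys/bus/pci/drivers/nvidia/bind

# Retry enabling the virtual functions.
sudo /usr/lib/nvidia/sriov-manage -e 0000:17:00.0
```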
---
 roles/vgpu/templates/nvidia-sriov.service.j2 | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/roles/vgpu/templates/nvidia-sriov.service.j2 b/roles/vgpu/templates/nvidia-sriov.service.j2
index c058f6c..50e7dfa 100644
--- a/roles/vgpu/templates/nvidia-sriov.service.j2
+++ b/roles/vgpu/templates/nvidia-sriov.service.j2
@@ -6,12 +6,15 @@ After=local-fs.target {{ vgpu_systemd_device[vgpu_definition.pci_address] }}
 Wants={{ vgpu_systemd_device[vgpu_definition.pci_address] }}
 
 [Service]
+Restart=on-failure
+RestartSec=30
 Type=oneshot
 User=root
 # NOTE(wszumski): There is a race in the driver initialization where if we run this too early, then
 # the mdev_support_devices entry doesn't show up in sysfs. I was unable to get this to show up again
 # without a reboot.
 ExecStartPre=/bin/sleep 5
+ExecStart=/bin/bash -c "echo '{{ vgpu_definition.pci_address }}' > /sys/bus/pci/drivers/nvidia/bind || true"
 ExecStart=/usr/lib/nvidia/sriov-manage -e {{ vgpu_definition.pci_address }}
 RemainAfterExit=yes
 

From aa10d8ae03e593022c883f2edc230924b3f26dc9 Mon Sep 17 00:00:00 2001
From: Will Szumski
Date: Fri, 10 Jan 2025 13:58:35 +0000
Subject: [PATCH 2/6] Add retries to mdev unit

Otherwise it can fail with a dependency failure.
---
 roles/vgpu/templates/nvidia-mdev.service.j2 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/roles/vgpu/templates/nvidia-mdev.service.j2 b/roles/vgpu/templates/nvidia-mdev.service.j2
index d5b287c..0867080 100644
--- a/roles/vgpu/templates/nvidia-mdev.service.j2
+++ b/roles/vgpu/templates/nvidia-mdev.service.j2
@@ -10,6 +10,8 @@ Requires=nvidia-sriov-{{ vgpu_definition.pci_address }}.service
 {% endif %}
 
 [Service]
+Restart=on-failure
+RestartSec=30
 Type=oneshot
 User=root
 ExecStartPre=/bin/sleep 5

From 486a47ef328334d338f90d08fd40db0b140f8d5b Mon Sep 17 00:00:00 2001
From: Will Szumski
Date: Fri, 10 Jan 2025 14:36:48 +0000
Subject: [PATCH 3/6] Take 2: Fix failure to start on dependency failure

With a hard After=/Requires= on the SR-IOV unit, a failure of that unit
cancels this unit's start job with a dependency failure, and
Restart=on-failure never applies because the service itself never ran.
Instead, check that the SR-IOV unit is active in ExecStartPre, so a failure
becomes an ordinary service failure that systemd retries.
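For a card without MIG devices, the rendered [Service] section then looks
roughly like this (sketch only, using the PCI address from the log in the
first patch as an example):

```
[Service]
Restart=on-failure
RestartSec=30
Type=oneshot
User=root
# Fails, and is retried every 30 seconds, until the SR-IOV unit is active.
ExecStartPre=/usr/bin/systemctl is-active nvidia-sriov-0000:17:00.0.service
ExecStart=-/usr/sbin/mdevctl start --uuid %i
RemainAfterExit=yes
```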
---
 roles/vgpu/templates/nvidia-mdev.service.j2 | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/roles/vgpu/templates/nvidia-mdev.service.j2 b/roles/vgpu/templates/nvidia-mdev.service.j2
index 0867080..bd23660 100644
--- a/roles/vgpu/templates/nvidia-mdev.service.j2
+++ b/roles/vgpu/templates/nvidia-mdev.service.j2
@@ -4,9 +4,6 @@ Before=docker.service
 {% if vgpu_definition.mig_devices is defined %}
 After=nvidia-mig-manager.service
 Requires=nvidia-mig-manager.service
-{% else %}
-After=nvidia-sriov-{{ vgpu_definition.pci_address }}.service
-Requires=nvidia-sriov-{{ vgpu_definition.pci_address }}.service
 {% endif %}
 
 [Service]
@@ -14,7 +11,10 @@ Restart=on-failure
 RestartSec=30
 Type=oneshot
 User=root
-ExecStartPre=/bin/sleep 5
+{% if vgpu_definition.mig_devices is not defined %}
+# Workaround lack of UpheldBy/RestartMode=direct in systemd<254
+ExecStartPre=/usr/bin/systemctl is-active nvidia-sriov-{{ vgpu_definition.pci_address }}.service
+{% endif %}
 ExecStart=-/usr/sbin/mdevctl start --uuid %i
 RemainAfterExit=yes
 

From b75712df66c3935a086e8167a9b427d9c66c2361 Mon Sep 17 00:00:00 2001
From: Will Szumski
Date: Fri, 10 Jan 2025 15:46:56 +0000
Subject: [PATCH 4/6] Improve comment

---
 roles/vgpu/templates/nvidia-mdev.service.j2 | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/roles/vgpu/templates/nvidia-mdev.service.j2 b/roles/vgpu/templates/nvidia-mdev.service.j2
index bd23660..e651756 100644
--- a/roles/vgpu/templates/nvidia-mdev.service.j2
+++ b/roles/vgpu/templates/nvidia-mdev.service.j2
@@ -12,7 +12,9 @@ RestartSec=30
 Type=oneshot
 User=root
 {% if vgpu_definition.mig_devices is not defined %}
-# Workaround lack of UpheldBy/RestartMode=direct in systemd<254
+# Workaround lack of UpheldBy/RestartMode=direct in systemd<254 to ensure unit is
+# started when the dependency fails, see:
+# https://unix.stackexchange.com/questions/213185/restarting-systemd-service-on-dependency-failure
 ExecStartPre=/usr/bin/systemctl is-active nvidia-sriov-{{ vgpu_definition.pci_address }}.service
 {% endif %}
 ExecStart=-/usr/sbin/mdevctl start --uuid %i

From 31cc1666e3a6c534d9ba60a28c2b0e685c370278 Mon Sep 17 00:00:00 2001
From: Will Szumski
Date: Wed, 12 Mar 2025 17:10:37 +0000
Subject: [PATCH 5/6] Add comment explaining why we rebind the nvidia driver

---
 roles/vgpu/templates/nvidia-sriov.service.j2 | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/roles/vgpu/templates/nvidia-sriov.service.j2 b/roles/vgpu/templates/nvidia-sriov.service.j2
index 50e7dfa..27510ba 100644
--- a/roles/vgpu/templates/nvidia-sriov.service.j2
+++ b/roles/vgpu/templates/nvidia-sriov.service.j2
@@ -14,6 +14,9 @@ User=root
 # the mdev_support_devices entry doesn't show up in sysfs. I was unable to get this to show up again
 # without a reboot.
 ExecStartPre=/bin/sleep 5
+# NOTE(wszumski): The sriov-manage script will unbind the nvidia driver to initialize the virtual functions.
+# If it fails part way through, the driver can be left unbound and subsequent executions of sriov-manage
+# will fail. This ensures that the nvidia driver is always bound before we run sriov-manage.
 ExecStart=/bin/bash -c "echo '{{ vgpu_definition.pci_address }}' > /sys/bus/pci/drivers/nvidia/bind || true"
 ExecStart=/usr/lib/nvidia/sriov-manage -e {{ vgpu_definition.pci_address }}
 RemainAfterExit=yes

From 4a2f6e815ba65fe843d0720f09bab02e12cfa6dd Mon Sep 17 00:00:00 2001
From: Will Szumski
Date: Wed, 12 Mar 2025 17:15:51 +0000
Subject: [PATCH 6/6] Fix line length on comments

---
 roles/vgpu/templates/nvidia-sriov.service.j2 | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/roles/vgpu/templates/nvidia-sriov.service.j2 b/roles/vgpu/templates/nvidia-sriov.service.j2
index 27510ba..4cd4db0 100644
--- a/roles/vgpu/templates/nvidia-sriov.service.j2
+++ b/roles/vgpu/templates/nvidia-sriov.service.j2
@@ -10,13 +10,14 @@ Restart=on-failure
 RestartSec=30
 Type=oneshot
 User=root
-# NOTE(wszumski): There is a race in the driver initialization where if we run this too early, then
-# the mdev_support_devices entry doesn't show up in sysfs. I was unable to get this to show up again
-# without a reboot.
+# NOTE(wszumski): There is a race in the driver initialization where if we run
+# this too early, then the mdev_support_devices entry doesn't show up in sysfs.
+# I was unable to get this to show up again without a reboot.
 ExecStartPre=/bin/sleep 5
-# NOTE(wszumski): The sriov-manage script will unbind the nvidia driver to initialize the virtual functions.
-# If it fails part way through, the driver can be left unbound and subsequent executions of sriov-manage
-# will fail. This ensures that the nvidia driver is always bound before we run sriov-manage.
+# NOTE(wszumski): The sriov-manage script will unbind the nvidia driver to
+# initialize the virtual functions. If it fails part way through, the driver
+# can be left unbound, and subsequent executions of sriov-manage will fail. This
+# ensures that the nvidia driver is always bound before we run sriov-manage.
 ExecStart=/bin/bash -c "echo '{{ vgpu_definition.pci_address }}' > /sys/bus/pci/drivers/nvidia/bind || true"
 ExecStart=/usr/lib/nvidia/sriov-manage -e {{ vgpu_definition.pci_address }}
 RemainAfterExit=yes