From 2408b38b11bb6c138dd9e94f572f45355dc7acc4 Mon Sep 17 00:00:00 2001
From: Will Szumski
Date: Fri, 10 Jan 2025 13:16:21 +0000
Subject: [PATCH 1/6] Make enabling sriov more reliable

There is a race condition on boot:

```
[stack@gpu2 ~]$ sudo journalctl -u nvidia-sriov-0000:17:00.0.service
Jan 09 12:13:11 gpu2 systemd[1]: Starting Enable SR-IOV on Nvidia card (0000:17:00.0)...
Jan 09 12:13:16 gpu2 sriov-manage[3802]: Enabling VFs on 0000:17:00.0
Jan 09 12:13:17 gpu2 sriov-manage[3837]: awk: (FILENAME=- FNR=1) warning: error writing standard output: File exists
Jan 09 12:13:17 gpu2 sriov-manage[3899]: awk: (FILENAME=- FNR=1) warning: error writing standard output: No such device
Jan 09 12:13:17 gpu2 systemd[1]: nvidia-sriov-0000:17:00.0.service: Main process exited, code=exited, status=1/FAILURE
Jan 09 12:13:17 gpu2 systemd[1]: nvidia-sriov-0000:17:00.0.service: Failed with result 'exit-code'.
Jan 09 12:13:17 gpu2 systemd[1]: Failed to start Enable SR-IOV on Nvidia card (0000:17:00.0).
```

This has been observed on Rocky 9.4 with
NVIDIA-AI-Enterprise-Linux-KVM-550.127.06-550.127.05-553.24.

We can work around this by retrying. The sriov-manage script can leave the
driver unbound, so we first ensure that the NVIDIA driver is bound to the
card.
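For reference, the manual recovery on an affected host looks roughly like
this (a sketch only: the PCI address is the example from the log above, and
the exact failure output may differ):

```
# Show which driver, if any, the card is currently bound to.
readlink /sys/bus/pci/devices/0000:17:00.0/driver

# Rebind the nvidia driver if it was left unbound (the write fails
# harmlessly if the driver is already bound).
echo 0000:17:00.0 | sudo tee /sys/bus/pci/drivers/nvidia/bind

# Retry enabling the virtual functions.
sudo /usr/lib/nvidia/sriov-manage -e 0000:17:00.0
```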
---
 roles/vgpu/templates/nvidia-sriov.service.j2 | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/roles/vgpu/templates/nvidia-sriov.service.j2 b/roles/vgpu/templates/nvidia-sriov.service.j2
index c058f6c..50e7dfa 100644
--- a/roles/vgpu/templates/nvidia-sriov.service.j2
+++ b/roles/vgpu/templates/nvidia-sriov.service.j2
@@ -6,12 +6,15 @@ After=local-fs.target {{ vgpu_systemd_device[vgpu_definition.pci_address] }}
 Wants={{ vgpu_systemd_device[vgpu_definition.pci_address] }}
 
 [Service]
+Restart=on-failure
+RestartSec=30
 Type=oneshot
 User=root
 # NOTE(wszumski): There is a race in the driver initialization where if we run this too early, then
 # the mdev_support_devices entry doesn't show up in sysfs. I was unable to get this to show up again
 # without a reboot.
 ExecStartPre=/bin/sleep 5
+ExecStart=/bin/bash -c "echo '{{ vgpu_definition.pci_address }}' > /sys/bus/pci/drivers/nvidia/bind || true"
 ExecStart=/usr/lib/nvidia/sriov-manage -e {{ vgpu_definition.pci_address }}
 RemainAfterExit=yes
 

From aa10d8ae03e593022c883f2edc230924b3f26dc9 Mon Sep 17 00:00:00 2001
From: Will Szumski
Date: Fri, 10 Jan 2025 13:58:35 +0000
Subject: [PATCH 2/6] Add retries to mdev unit

Otherwise it can fail with a dependency failure.
---
 roles/vgpu/templates/nvidia-mdev.service.j2 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/roles/vgpu/templates/nvidia-mdev.service.j2 b/roles/vgpu/templates/nvidia-mdev.service.j2
index d5b287c..0867080 100644
--- a/roles/vgpu/templates/nvidia-mdev.service.j2
+++ b/roles/vgpu/templates/nvidia-mdev.service.j2
@@ -10,6 +10,8 @@ Requires=nvidia-sriov-{{ vgpu_definition.pci_address }}.service
 {% endif %}
 
 [Service]
+Restart=on-failure
+RestartSec=30
 Type=oneshot
 User=root
 ExecStartPre=/bin/sleep 5

From 486a47ef328334d338f90d08fd40db0b140f8d5b Mon Sep 17 00:00:00 2001
From: Will Szumski
Date: Fri, 10 Jan 2025 14:36:48 +0000
Subject: [PATCH 3/6] Take 2: Fix failure to start on dependency failure

With a hard After=/Requires= on the SR-IOV unit, a failure of that unit
cancels this unit's start job with a dependency failure, and
Restart=on-failure never applies because the service itself never ran.
Instead, check that the SR-IOV unit is active in ExecStartPre, so a failure
becomes an ordinary service failure that systemd retries.
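For a card without MIG devices, the rendered [Service] section then looks
roughly like this (sketch only, using the PCI address from the log in the
first patch as an example):

```
[Service]
Restart=on-failure
RestartSec=30
Type=oneshot
User=root
# Fails, and is retried every 30 seconds, until the SR-IOV unit is active.
ExecStartPre=/usr/bin/systemctl is-active nvidia-sriov-0000:17:00.0.service
ExecStart=-/usr/sbin/mdevctl start --uuid %i
RemainAfterExit=yes
```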
---
 roles/vgpu/templates/nvidia-mdev.service.j2 | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/roles/vgpu/templates/nvidia-mdev.service.j2 b/roles/vgpu/templates/nvidia-mdev.service.j2
index 0867080..bd23660 100644
--- a/roles/vgpu/templates/nvidia-mdev.service.j2
+++ b/roles/vgpu/templates/nvidia-mdev.service.j2
@@ -4,9 +4,6 @@ Before=docker.service
 {% if vgpu_definition.mig_devices is defined %}
 After=nvidia-mig-manager.service
 Requires=nvidia-mig-manager.service
-{% else %}
-After=nvidia-sriov-{{ vgpu_definition.pci_address }}.service
-Requires=nvidia-sriov-{{ vgpu_definition.pci_address }}.service
 {% endif %}
 
 [Service]
@@ -14,7 +11,10 @@ Restart=on-failure
 RestartSec=30
 Type=oneshot
 User=root
-ExecStartPre=/bin/sleep 5
+{% if vgpu_definition.mig_devices is not defined %}
+# Workaround lack of UpheldBy/RestartMode=direct in systemd<254
+ExecStartPre=/usr/bin/systemctl is-active nvidia-sriov-{{ vgpu_definition.pci_address }}.service
+{% endif %}
 ExecStart=-/usr/sbin/mdevctl start --uuid %i
 RemainAfterExit=yes
 

From b75712df66c3935a086e8167a9b427d9c66c2361 Mon Sep 17 00:00:00 2001
From: Will Szumski
Date: Fri, 10 Jan 2025 15:46:56 +0000
Subject: [PATCH 4/6] Improve comment

---
 roles/vgpu/templates/nvidia-mdev.service.j2 | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/roles/vgpu/templates/nvidia-mdev.service.j2 b/roles/vgpu/templates/nvidia-mdev.service.j2
index bd23660..e651756 100644
--- a/roles/vgpu/templates/nvidia-mdev.service.j2
+++ b/roles/vgpu/templates/nvidia-mdev.service.j2
@@ -12,7 +12,9 @@ RestartSec=30
 Type=oneshot
 User=root
 {% if vgpu_definition.mig_devices is not defined %}
-# Workaround lack of UpheldBy/RestartMode=direct in systemd<254
+# Workaround lack of UpheldBy/RestartMode=direct in systemd<254 to ensure unit is
+# started when the dependency fails, see:
+# https://unix.stackexchange.com/questions/213185/restarting-systemd-service-on-dependency-failure
 ExecStartPre=/usr/bin/systemctl is-active nvidia-sriov-{{ vgpu_definition.pci_address }}.service
 {% endif %}
 ExecStart=-/usr/sbin/mdevctl start --uuid %i

From 31cc1666e3a6c534d9ba60a28c2b0e685c370278 Mon Sep 17 00:00:00 2001
From: Will Szumski
Date: Wed, 12 Mar 2025 17:10:37 +0000
Subject: [PATCH 5/6] Add comment explaining why we rebind the nvidia driver

---
 roles/vgpu/templates/nvidia-sriov.service.j2 | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/roles/vgpu/templates/nvidia-sriov.service.j2 b/roles/vgpu/templates/nvidia-sriov.service.j2
index 50e7dfa..27510ba 100644
--- a/roles/vgpu/templates/nvidia-sriov.service.j2
+++ b/roles/vgpu/templates/nvidia-sriov.service.j2
@@ -14,6 +14,9 @@ User=root
 # the mdev_support_devices entry doesn't show up in sysfs. I was unable to get this to show up again
 # without a reboot.
 ExecStartPre=/bin/sleep 5
+# NOTE(wszumski): The sriov-manage script will unbind the nvidia driver to initialize the virtual functions.
+# If it fails part way through, the driver can be left unbound and subsequent executions of sriov-manage
+# will fail. This ensures that the nvidia driver is always bound before we run sriov-manage.
 ExecStart=/bin/bash -c "echo '{{ vgpu_definition.pci_address }}' > /sys/bus/pci/drivers/nvidia/bind || true"
 ExecStart=/usr/lib/nvidia/sriov-manage -e {{ vgpu_definition.pci_address }}
 RemainAfterExit=yes

From 4a2f6e815ba65fe843d0720f09bab02e12cfa6dd Mon Sep 17 00:00:00 2001
From: Will Szumski
Date: Wed, 12 Mar 2025 17:15:51 +0000
Subject: [PATCH 6/6] Fix line length on comments

---
 roles/vgpu/templates/nvidia-sriov.service.j2 | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/roles/vgpu/templates/nvidia-sriov.service.j2 b/roles/vgpu/templates/nvidia-sriov.service.j2
index 27510ba..4cd4db0 100644
--- a/roles/vgpu/templates/nvidia-sriov.service.j2
+++ b/roles/vgpu/templates/nvidia-sriov.service.j2
@@ -10,13 +10,14 @@ Restart=on-failure
 RestartSec=30
 Type=oneshot
 User=root
-# NOTE(wszumski): There is a race in the driver initialization where if we run this too early, then
-# the mdev_support_devices entry doesn't show up in sysfs. I was unable to get this to show up again
-# without a reboot.
+# NOTE(wszumski): There is a race in the driver initialization where if we run
+# this too early, then the mdev_support_devices entry doesn't show up in sysfs.
+# I was unable to get this to show up again without a reboot.
 ExecStartPre=/bin/sleep 5
-# NOTE(wszumski): The sriov-manage script will unbind the nvidia driver to initialize the virtual functions.
-# If it fails part way through, the driver can be left unbound and subsequent executions of sriov-manage
-# will fail. This ensures that the nvidia driver is always bound before we run sriov-manage.
+# NOTE(wszumski): The sriov-manage script will unbind the nvidia driver to
+# initialize the virtual functions. If it fails part way through, the driver
+# can be left unbound, and subsequent executions of sriov-manage will fail. This
+# ensures that the nvidia driver is always bound before we run sriov-manage.
 ExecStart=/bin/bash -c "echo '{{ vgpu_definition.pci_address }}' > /sys/bus/pci/drivers/nvidia/bind || true"
 ExecStart=/usr/lib/nvidia/sriov-manage -e {{ vgpu_definition.pci_address }}
 RemainAfterExit=yes