Skip to content

Commit 5601558

Browse files
committed
Add option to specify rebuild image via a file
You can add a file on the node to trigger the rebuild. We look for a file containing an image UUID at the following location: `/var/spool/slurm/REBUILD_IMAGE_UUID`. This is required when you can't specify a reboot reason, such as when using a job to trigger the rebuild. Note that the file takes precedence over an sinfo reason, in part because it means we don't need to rely on sinfo being installed in a specific location in this case.
1 parent 76c9c93 commit 5601558

File tree

3 files changed

+34
-3
lines changed

3 files changed

+34
-3
lines changed

README.rst

+8
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,14 @@ If you don't specify the image, it will default to doing a rebuild with
4141
the existing image. If you don't have "rebuild" at the start of your
4242
reason, openstack nodes will do a regular reboot.
4343

44+
Should you need to use a job to trigger a rebuild,
45+
and therefore can't specify a custom reason for the reboot,
46+
you can add a file containing an image UUID at the following location
47+
on the node you want to rebuild: /var/spool/slurm/REBUILD_IMAGE_UUID
48+
In addition, when this file is present, we no longer call out
49+
to sinfo. This can be very useful if you have sinfo installed
50+
in a non-standard location.
51+
4452
slurm-stats
4553
^^^^^^^^^^^
4654

slurm_openstack_tools/reboot.py

+19-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# -*- coding: utf-8 -*-
1+
#!/usr/bin/env python3
22

33
# Licensed under the Apache License, Version 2.0 (the "License"); you may
44
# not use this file except in compliance with the License. You may obtain
@@ -33,6 +33,7 @@
3333
logger.addHandler(handler)
3434

3535
INSTANCE_UUID_FILE = "/var/lib/cloud/data/instance-id"
36+
REBUILD_IMAGE_UUID_FILE = "/var/spool/slurm/REBUILD_IMAGE_UUID"
3637

3738

3839
def get_openstack_server_id():
@@ -43,6 +44,15 @@ def get_openstack_server_id():
4344
return f.readline().strip()
4445

4546

47+
def get_rebuild_image_from_file():
48+
if not path.exists(REBUILD_IMAGE_UUID_FILE):
49+
return None
50+
with open(REBUILD_IMAGE_UUID_FILE) as f:
51+
image = f.readline().strip()
52+
logger.info(f"spool file requested image:{image}")
53+
return image
54+
55+
4656
def get_sinfo_path():
4757
# TODO(johngarbutt): get this from environment or config file?
4858
sinfo_alt_path = "/usr/local/software/slurm/current/bin/sinfo"
@@ -52,6 +62,12 @@ def get_sinfo_path():
5262

5363

5464
def get_reboot_reason():
65+
image_uuid = get_rebuild_image_from_file()
66+
if image_uuid:
67+
# don't need to check sinfo
68+
# TODO(johngarbutt) need a cleaner interface
69+
return f"rebuild image:{image_uuid}"
70+
5571
# find our short hostname (without fqdn):
5672
hostname = socket.gethostname().split(".")[0]
5773
sinfo_path = get_sinfo_path()
@@ -78,7 +94,7 @@ def get_image_from_reason(reason):
7894
if len(image_tokens) == 2 and image_tokens[0] == "image":
7995
if image_tokens[1]:
8096
image = image_tokens[1]
81-
logger.info(f"user requested image:%{image}")
97+
logger.info(f"requested image:%{image}")
8298
return image
8399

84100

@@ -90,7 +106,7 @@ def rebuild_openstack_server(server_id, reason):
90106
image_uuid = get_image_from_reason(reason)
91107
if not image_uuid:
92108
image_uuid = server.image.id
93-
logger.info(f"fallback to existing image:%{image_uuid}")
109+
logger.info(f"fallback to existing image:{image_uuid}")
94110

95111
# Note that OpenStack will power down the server as part of the rebuild
96112
logger.info(f"rebuilding server %{server_id} with image %{image_uuid}")

slurm_openstack_tools/tests/test_reboot.py

+7
Original file line numberDiff line numberDiff line change
@@ -87,3 +87,10 @@ def test_rebuild_or_reboot_non_openstack(
8787
reboot.rebuild_or_reboot()
8888
mock_exec.assert_called_once_with("reboot", ["reboot"])
8989
mock_id.assert_called_once_with()
90+
91+
@mock.patch.object(reboot, "get_rebuild_image_from_file",
92+
return_value="uuid")
93+
def test_get_reboot_reason(self, mock_get_image):
94+
reason = reboot.get_reboot_reason()
95+
self.assertEqual("rebuild image:uuid", reason)
96+
mock_get_image.assert_called_once_with()

0 commit comments

Comments
 (0)