diff --git a/README.rst b/README.rst index a9fdf62..b77142a 100644 --- a/README.rst +++ b/README.rst @@ -41,6 +41,14 @@ If you don't specifiy the image, it will default to doing a rebuild with the existing image. If you don't have "rebuild" at the start of your reason, openstack nodes will do a regular reboot. +If you need a job to trigger the rebuild, +and therefore cannot specify a custom reason for the reboot, +you can instead add a file containing an image UUID at the following location +on the node you want to rebuild: ``/var/spool/slurm/REBUILD_IMAGE_UUID``. +In addition, when this file is present, we no longer call out +to sinfo. This can be very useful if you have sinfo installed +in a non-standard location. + slurm-stats ^^^^^^^^^^^ diff --git a/slurm_openstack_tools/reboot.py b/slurm_openstack_tools/reboot.py index 7d8bc0a..3b9516c 100644 --- a/slurm_openstack_tools/reboot.py +++ b/slurm_openstack_tools/reboot.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +#!/usr/bin/env python3 # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -33,6 +33,7 @@ logger.addHandler(handler) INSTANCE_UUID_FILE = "/var/lib/cloud/data/instance-id" +REBUILD_IMAGE_UUID_FILE = "/var/spool/slurm/REBUILD_IMAGE_UUID" def get_openstack_server_id(): @@ -43,6 +44,15 @@ def get_openstack_server_id(): return f.readline().strip() +def get_rebuild_image_from_file(): + if not path.exists(REBUILD_IMAGE_UUID_FILE): + return None + with open(REBUILD_IMAGE_UUID_FILE) as f: + image = f.readline().strip() + logger.info(f"spool file requested image:{image}") + return image + + def get_sinfo_path(): # TODO(johngarbutt): get this from environment or config file? 
sinfo_alt_path = "/usr/local/software/slurm/current/bin/sinfo" @@ -52,6 +62,12 @@ def get_sinfo_path(): def get_reboot_reason(): + image_uuid = get_rebuild_image_from_file() + if image_uuid: + # don't need to check sinfo + # TODO(johngarbutt) need a cleaner interface + return f"rebuild image:{image_uuid}" + # find our short hostname (without fqdn): hostname = socket.gethostname().split(".")[0] sinfo_path = get_sinfo_path() @@ -78,7 +94,7 @@ def get_image_from_reason(reason): if len(image_tokens) == 2 and image_tokens[0] == "image": if image_tokens[1]: image = image_tokens[1] - logger.info(f"user requested image:%{image}") + logger.info(f"requested image:{image}") return image @@ -90,7 +106,7 @@ def rebuild_openstack_server(server_id, reason): image_uuid = get_image_from_reason(reason) if not image_uuid: image_uuid = server.image.id - logger.info(f"fallback to existing image:%{image_uuid}") + logger.info(f"fallback to existing image:{image_uuid}") # Note that OpenStack will power down the server as part of the rebuild logger.info(f"rebuilding server %{server_id} with image %{image_uuid}") diff --git a/slurm_openstack_tools/tests/test_reboot.py b/slurm_openstack_tools/tests/test_reboot.py index 4b9481d..8061001 100644 --- a/slurm_openstack_tools/tests/test_reboot.py +++ b/slurm_openstack_tools/tests/test_reboot.py @@ -87,3 +87,10 @@ def test_rebuild_or_reboot_non_openstack( reboot.rebuild_or_reboot() mock_exec.assert_called_once_with("reboot", ["reboot"]) mock_id.assert_called_once_with() + + @mock.patch.object(reboot, "get_rebuild_image_from_file", + return_value="uuid") + def test_get_reboot_reason(self, mock_get_image): + reason = reboot.get_reboot_reason() + self.assertEqual("rebuild image:uuid", reason) + mock_get_image.assert_called_once_with()