diff --git a/setup.cfg b/setup.cfg index 6ed4668..4855b95 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,3 +32,4 @@ console_scripts = slurm-openstack-resume = slurm_openstack_tools.resume:main slurm-openstack-resumefail = slurm_openstack_tools.resumefail:main slurm-openstack-suspend = slurm_openstack_tools.suspend:main + slurm-openstack-redeploy = slurm_openstack_tools.redeploy:main diff --git a/slurm_openstack_tools/redeploy.py b/slurm_openstack_tools/redeploy.py new file mode 100644 index 0000000..f91991e --- /dev/null +++ b/slurm_openstack_tools/redeploy.py @@ -0,0 +1,110 @@ +# -*- coding: utf-8 -*- + +# See ../LICENCE + +import logging.handlers +import os +import subprocess +import sys +import time + +import openstack +import pbr.version + +from slurm_openstack_tools.resume import resume +from slurm_openstack_tools.suspend import suspend +from slurm_openstack_tools import utils + +__version__ = pbr.version.VersionInfo("slurm-openstack-tools").version_string() + +# configure logging to syslog - by default only "info" +# and above categories appear +logger = logging.getLogger("syslogger") +logger.setLevel(logging.DEBUG) +handler = logging.handlers.SysLogHandler("/dev/log") +handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) +logger.addHandler(handler) + +def matching_nodes(hostlist_expr, state): + """ Returns a hostlist expression for only nodes matching `state`. """ + + sinfo = subprocess.run( + ['sinfo', '--noheader', f"--nodes={hostlist_expr}", f"--states={state}", "--format=%N"], + stdout=subprocess.PIPE, universal_newlines=True) + + return sinfo.stdout.strip() + +def openstack_instance_status(conn, uuids): + """ Return a dict keyed by UUID with status str (or None). """ + statuses = dict((id, None) for id in uuids) + for uuid in uuids: + response = conn.compute.get(f"/servers/detail").json() + for server in response['servers']: + id = server['id'] + if id in uuids: + statuses[id] = server['status'] + return statuses + +def redeploy(hostlist_expr): + """A PrologSlurmctld for slurm which deletes and recreates CLOUD nodes. + + Messages and errors are logged to syslog. + + Any non-CLOUD-state nodes in the job are not modified. + + Requires: + - Python 3 with openstacksdk module + - An application credential: + - with at least POST rights to /v3/servers/{server_id}/action + - available via a clouds.yaml file containing only one cloud + """ # noqa E501 + + logger.info(f"Slurmctld invoked PrologSlurmctld for {hostlist_expr}") + + conn = openstack.connection.from_config() + + # filter out non-CLOUD nodes: + cloud_hostlist = matching_nodes(hostlist_expr, 'CLOUD') + if not cloud_hostlist: + exit(0) + + # get slurm config: + slurm_conf = utils.get_slurm_conf() + resume_timeout = slurm_conf['ResumeTimeout'].split() # time from issuing resume to being available for use, e.g. '300 sec' + suspend_timeout = slurm_conf['SuspendTimeout'].split() # time from issuing suspend to being ready for new resume, e.g. '30 sec' + + # delete nodes: + suspend(cloud_hostlist) + assert suspend_timeout[1] == 'sec', 'SuspendTimeout not defined in seconds' + time.sleep(int(suspend_timeout[0])) + + # recreate them: + new_uuids = resume(cloud_hostlist) + + # wait until all instances are ACTIVE or timeout: + assert resume_timeout[1] == 'sec', 'ResumeTimeout not defined in seconds' + start = time.monotonic() + sleep_time = int(resume_timeout[0]) / 10 + all_active = False + while True: + status = openstack_instance_status(conn, new_uuids) + if all(s == 'ACTIVE' for s in status.values()): + all_active = True + break + if (time.monotonic() - start) > int(resume_timeout[0]): + break + time.sleep(sleep_time) + + # check all nodes up: + if not all_active: + logger.error(f"Not all nodes ACTIVE by ResumeTimeout: {status}") + exit(1) + + +def main(): + try: + hostlist_expr = os.getenv('SLURM_JOB_NODELIST') + redeploy(hostlist_expr) + except BaseException: + logger.exception('Exception in redeploy():') + raise diff --git a/slurm_openstack_tools/suspend.py b/slurm_openstack_tools/suspend.py index 549f1c4..fb6b402 100644 --- a/slurm_openstack_tools/suspend.py +++ b/slurm_openstack_tools/suspend.py @@ -43,7 +43,7 @@ def suspend(hostlist_expr): """ Deletes nodes defined by a hostlist expression. Returns a sequence of OpenStack instance UUIDs. """ - hostlist_expr = sys.argv[1] + logger.info(f"Slurmctld invoked suspend {hostlist_expr}") remove_nodes = expand_nodes(hostlist_expr)