diff --git a/setup.cfg b/setup.cfg index 439894e..6ed4668 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,4 +30,5 @@ console_scripts = slurm-openstack-rebuild = slurm_openstack_tools.reboot:main slurm-stats = slurm_openstack_tools.sacct:main slurm-openstack-resume = slurm_openstack_tools.resume:main + slurm-openstack-resumefail = slurm_openstack_tools.resumefail:main slurm-openstack-suspend = slurm_openstack_tools.suspend:main diff --git a/slurm_openstack_tools/resume.py b/slurm_openstack_tools/resume.py index 328e1c9..b1404d7 100644 --- a/slurm_openstack_tools/resume.py +++ b/slurm_openstack_tools/resume.py @@ -1,16 +1,6 @@ # -*- coding: utf-8 -*- -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. +# See ../LICENCE """A Slurm ResumeProgram to create OpenStack instances. 
@@ -48,6 +38,8 @@ import openstack +from slurm_openstack_tools.utils import get_slurm_conf, expand_nodes + REQUIRED_PARAMS = ('image', 'flavor', 'keypair', 'network') # configure logging to syslog - by default only "info" and above @@ -59,24 +51,6 @@ logger.addHandler(handler) -def get_statesavelocation(): - """Return the path for Slurm's StateSaveLocation """ - scontrol = subprocess.run( - ['scontrol', 'show', 'config'], - stdout=subprocess.PIPE, universal_newlines=True) - for line in scontrol.stdout.splitlines(): - if line.startswith( - 'StateSaveLocation'): # StateSaveLocation = /var/spool/slurm - return line.split()[-1] - - -def expand_nodes(hostlist_expr): - scontrol = subprocess.run( - ['scontrol', 'show', 'hostnames', hostlist_expr], - stdout=subprocess.PIPE, universal_newlines=True) - return scontrol.stdout.strip().split('\n') - - def get_features(nodenames): """Retrieve the features specified for given node(s). @@ -100,23 +74,19 @@ def get_features(nodenames): return features -def create_server(conn, name, image, flavor, network, keypair): +def create_server(conn, name, image, flavor, network, keypair, port=None): server = conn.compute.create_server( name=name, image_id=image.id, flavor_id=flavor.id, - networks=[{"uuid": network.id}], key_name=keypair.name, + networks=[{"port": port.id}] if port else [{"uuid": network.id}], ) - # server = conn.compute.wait_for_server(...) return server -def resume(): - debug = False - if len(sys.argv) > 2: - logger.info(f"Running in debug mode - won't actually create nodes") - debug = True - hostlist_expr = sys.argv[1] +def resume(hostlist_expr, debug=False): + """ Creates nodes defined by a hostlist expression. Returns a sequence of OpenStack instance UUIDs. 
""" + logger.info(f"Slurmctld invoked resume {hostlist_expr}") new_nodes = expand_nodes(hostlist_expr) @@ -126,8 +96,9 @@ def resume(): features = get_features(hostlist_expr) logger.info(f"Read feature information from slurm") - statedir = get_statesavelocation() + statedir = get_slurm_conf()['StateSaveLocation'] + created_instance_ids = [] for node in new_nodes: # extract the openstack parameters from node features: if node not in features: @@ -150,28 +121,38 @@ def resume(): 'network': conn.network.find_network(os_parameters['network']), 'keypair': conn.compute.find_keypair(os_parameters['keypair']), } - not_found = dict((k, v) for (k, v) in os_objects.items() if v is None) + not_found = dict([(k, os_parameters[k]) for (k, v) in os_objects.items() if v is None]) if not_found: raise ValueError( - 'Could not find openstack objects for: %s' % - ', '.join(not_found)) + 'Could not find openstack objects for: ' + ', '.join([f'{k}={v}' for (k, v) in not_found.items()]) + ) + + # get optional port - done outside os_objects so an error finding network doesn't cause unhelpful port traceback: + os_objects['port'] = conn.network.find_port(node, network_id=os_objects['network'].id) + if debug: logger.info(f"os_objects for {node} : {os_objects}") - if not debug: + else: logger.info(f"creating node {node}") - # TODO(stevebrasier): save id to disk so can use it instead of name - # on deletion (to cope with multiple instances with same name) server = create_server(conn, node, **os_objects) logger.info(f"server: {server}") with open(os.path.join(statedir, node), 'w') as f: f.write(server.id) # Don't need scontrol update nodename={node} nodeaddr={server_ip} # as using SlurmctldParameters=cloud_dns + created_instance_ids.append(server.id) + return created_instance_ids def main(): + try: - resume() + hostlist_expr = sys.argv[1] + debug = True if len(sys.argv) > 2 else False + if debug: + logger.info(f"Running in debug mode - won't actually create nodes") + resume(hostlist_expr, 
debug) except BaseException: logger.exception('Exception in main:') raise diff --git a/slurm_openstack_tools/resumefail.py b/slurm_openstack_tools/resumefail.py new file mode 100644 index 0000000..eae41e7 --- /dev/null +++ b/slurm_openstack_tools/resumefail.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# See ../LICENCE + +"""A Slurm ResumeFail for OpenStack instances. + +This simply resumes any DOWN nodes for which there is no corresponding cloud instance. + +Usage: + + resumefail HOSTLIST_EXPRESSION + +where: HOSTLIST_EXPRESSION: Name(s) of node(s) which have failed using Slurm's + hostlist expression, as per [1]. + +Output and exceptions are written to the syslog. + +OpenStack credentials must be available to this script (e.g. via an +application credential in /etc/openstack/clouds.yaml readable by the slurm +user). + +[1]: https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeFailProgram + +""" + +import logging.handlers +import os +import subprocess +import sys + +import openstack + +from slurm_openstack_tools.utils import expand_nodes + +SCONTROL_PATH = '/usr/bin/scontrol' + +# configure logging to syslog - by default only "info" and above +# categories appear +logger = logging.getLogger("syslogger") +logger.setLevel(logging.DEBUG) +handler = logging.handlers.SysLogHandler("/dev/log") +handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) +logger.addHandler(handler) + +def resumefail(): + hostlist_expr = sys.argv[1] + logger.info(f"Slurmctld invoked resumefail {hostlist_expr}") + failed_nodes = expand_nodes(hostlist_expr) + + conn = openstack.connection.from_config() + logger.info(f"Got openstack connection {conn}") + + for node in failed_nodes: + server = conn.compute.find_server(node) + if server is None: + logger.info(f"No instance found for {node}, requesting resume of node.") + scontrol = subprocess.run([SCONTROL_PATH, 'update', 'state=resume', 'nodename=%s' % node], + stdout=subprocess.PIPE, universal_newlines=True) + else: + # retrieve 
info ourselves for errors, not exposed through the SDK attributes: +        info = conn.compute.get(f"/servers/{server.id}").json() +        if info['server']['status'] == 'ERROR': # https://docs.openstack.org/api-ref/compute/?expanded=show-server-details-detail#id30 +            fault_message = info['server'].get('fault', {}).get('message', None) +            if fault_message: +                if "not enough hosts available" in fault_message: +                    logger.info(f"Instance for {node} has error message '{fault_message}': Requesting instance delete and resume of node.") +                    conn.compute.delete_server(server, ignore_missing=True, force=True) +                    scontrol = subprocess.run([SCONTROL_PATH, 'update', 'state=resume', 'nodename=%s' % node], +                        stdout=subprocess.PIPE, universal_newlines=True) +                else: +                    logger.error(f"Instance for {node} has error message '{fault_message}'. Cannot fix this.") +            else: +                logger.error(f"Instance for {node} has status {info['server']['status']}. Cannot fix this.") +def main(): +    try: +        resumefail() +    except BaseException: +        logger.exception('Exception in main:') +        raise +if __name__ == '__main__': +    # running for testing +    logger.addHandler(logging.StreamHandler()) # also log to console (must be attached, not just constructed) +    main() diff --git a/slurm_openstack_tools/suspend.py b/slurm_openstack_tools/suspend.py index 1290248..fb6b402 100644 --- a/slurm_openstack_tools/suspend.py +++ b/slurm_openstack_tools/suspend.py @@ -1,16 +1,6 @@ # -*- coding: utf-8 -*- -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License.
+# See ../LICENCE """A Slurm SuspendProgram to delete OpenStack instances. @@ -41,6 +31,8 @@ import openstack +from slurm_openstack_tools.utils import get_slurm_conf, expand_nodes + # configure logging to syslog - by default only "info" and above # categories appear logger = logging.getLogger("syslogger") @@ -49,56 +41,38 @@ handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) logger.addHandler(handler) - -def get_statesavelocation(): - """Return the path for Slurm's StateSaveLocation """ - scontrol = subprocess.run( - ['scontrol', 'show', 'config'], - stdout=subprocess.PIPE, universal_newlines=True) - for line in scontrol.stdout.splitlines(): - if line.startswith( - 'StateSaveLocation'): # StateSaveLocation = /var/spool/slurm - return line.split()[-1] - - -def expand_nodes(hostlist_expr): - scontrol = subprocess.run( - ['scontrol', 'show', 'hostnames', hostlist_expr], - stdout=subprocess.PIPE, universal_newlines=True) - return scontrol.stdout.strip().split('\n') - - -def delete_server(conn, name): - server = conn.compute.find_server(name) - conn.compute.delete_server(server) - - -def suspend(): - hostlist_expr = sys.argv[1] +def suspend(hostlist_expr): + """ Deletes nodes defined by a hostlist expression. Returns a sequence of OpenStack instance UUIDs. 
""" + logger.info(f"Slurmctld invoked suspend {hostlist_expr}") remove_nodes = expand_nodes(hostlist_expr) conn = openstack.connection.from_config() logger.info(f"Got openstack connection {conn}") + statedir = get_slurm_conf()['StateSaveLocation'] + + deleted_instance_ids = [] for node in remove_nodes: instance_id = False - statedir = get_statesavelocation() instance_file = os.path.join(statedir, node) try: with open(instance_file) as f: instance_id = f.readline().strip() except FileNotFoundError: - logger.info( - f"no instance file found in {statedir} for node {node}") + logger.error(f"no instance file found in {statedir} for node {node}") + exit(1) - logger.info(f"deleting node {instance_id or node}") - delete_server(conn, (instance_id or node)) + logger.info(f"deleting node {instance_id}") + conn.compute.delete_server(instance_id) + deleted_instance_ids.append(instance_id) + return deleted_instance_ids def main(): try: - suspend() + hostlist_expr = sys.argv[1] + suspend(hostlist_expr) except BaseException: logger.exception('Exception in main:') raise diff --git a/slurm_openstack_tools/utils.py b/slurm_openstack_tools/utils.py new file mode 100644 index 0000000..58978e7 --- /dev/null +++ b/slurm_openstack_tools/utils.py @@ -0,0 +1,19 @@ +import subprocess + +def get_slurm_conf(): + """ Return the path for Slurm's StateSaveLocation """ + scontrol = subprocess.run( + ['scontrol', 'show', 'config'], + stdout=subprocess.PIPE, universal_newlines=True, + ) + config = {} + for line in scontrol.stdout.splitlines()[1:]: # skips e.g. 'Configuration data as of 2022-03-22T09:38:28' in first item + k, _, v = line.strip().partition('=') + config[k.strip()] = v.strip() + return config + +def expand_nodes(hostlist_expr): + scontrol = subprocess.run( + ['scontrol', 'show', 'hostnames', hostlist_expr], + stdout=subprocess.PIPE, universal_newlines=True) + return scontrol.stdout.strip().split('\n')