Skip to content

Add ports to ResumeProgram and add ResumeFailProgram #16

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,5 @@ console_scripts =
slurm-openstack-rebuild = slurm_openstack_tools.reboot:main
slurm-stats = slurm_openstack_tools.sacct:main
slurm-openstack-resume = slurm_openstack_tools.resume:main
slurm-openstack-resumefail = slurm_openstack_tools.resumefail:main
slurm-openstack-suspend = slurm_openstack_tools.suspend:main
73 changes: 27 additions & 46 deletions slurm_openstack_tools/resume.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,6 @@
# -*- coding: utf-8 -*-

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# See ../LICENCE

"""A Slurm ResumeProgram to create OpenStack instances.

Expand Down Expand Up @@ -48,6 +38,8 @@

import openstack

from slurm_openstack_tools.utils import get_slurm_conf, expand_nodes

REQUIRED_PARAMS = ('image', 'flavor', 'keypair', 'network')

# configure logging to syslog - by default only "info" and above
Expand All @@ -59,24 +51,6 @@
logger.addHandler(handler)


def get_statesavelocation():
    """Look up StateSaveLocation in the output of ``scontrol show config``.

    Returns the last whitespace-separated token of the first output line
    that starts with ``StateSaveLocation`` (e.g. ``/var/spool/slurm``), or
    None when no such line is printed.
    """
    completed = subprocess.run(
        ['scontrol', 'show', 'config'],
        stdout=subprocess.PIPE, universal_newlines=True)
    # e.g. "StateSaveLocation = /var/spool/slurm" -> "/var/spool/slurm"
    candidates = (
        entry.split()[-1]
        for entry in completed.stdout.splitlines()
        if entry.startswith('StateSaveLocation')
    )
    return next(candidates, None)


def expand_nodes(hostlist_expr):
    """Expand a Slurm hostlist expression into individual node names.

    Runs ``scontrol show hostnames <hostlist_expr>`` and returns one entry
    per non-blank line of its output.

    :param hostlist_expr: Slurm hostlist expression naming one or more nodes.
    :returns: list of node names; empty when scontrol prints nothing.
    """
    scontrol = subprocess.run(
        ['scontrol', 'show', 'hostnames', hostlist_expr],
        stdout=subprocess.PIPE, universal_newlines=True)
    # ''.split('\n') is [''], which callers would treat as one node with an
    # empty name; dropping blank lines makes "no output" mean "no nodes".
    return [
        name.strip()
        for name in scontrol.stdout.splitlines()
        if name.strip()
    ]


def get_features(nodenames):
"""Retrieve the features specified for given node(s).

Expand All @@ -100,23 +74,19 @@ def get_features(nodenames):
return features


def create_server(conn, name, image, flavor, network, keypair, port=None):
    """Request creation of an OpenStack instance for a Slurm node.

    :param conn: OpenStack connection; ``conn.compute.create_server`` is used.
    :param name: name given to the new server.
    :param image: object whose ``id`` is passed as ``image_id``.
    :param flavor: object whose ``id`` is passed as ``flavor_id``.
    :param network: object whose ``id`` selects the network when no port
        is supplied.
    :param keypair: object whose ``name`` is passed as ``key_name``.
    :param port: optional existing port; when truthy the server is attached
        to it (by ``port.id``) instead of to ``network``.
    :returns: the value returned by ``conn.compute.create_server``. The call
        is not followed by a wait, so the server may still be building.
    """
    networks = [{"port": port.id}] if port else [{"uuid": network.id}]
    server = conn.compute.create_server(
        name=name, image_id=image.id, flavor_id=flavor.id,
        networks=networks,
        # the keypair must be passed regardless of how networking is chosen,
        # otherwise the instance is created without the requested key
        key_name=keypair.name,
    )
    return server


def resume():
debug = False
if len(sys.argv) > 2:
logger.info(f"Running in debug mode - won't actually create nodes")
debug = True
hostlist_expr = sys.argv[1]
def resume(hostlist_expr, debug=False):
""" Creates nodes defined by a hostlist expression. Returns a sequence of OpenStack instance UUIDs. """

logger.info(f"Slurmctld invoked resume {hostlist_expr}")
new_nodes = expand_nodes(hostlist_expr)

Expand All @@ -126,8 +96,9 @@ def resume():
features = get_features(hostlist_expr)
logger.info(f"Read feature information from slurm")

statedir = get_statesavelocation()
statedir = get_slurm_conf()['StateSaveLocation']

created_instance_ids = []
for node in new_nodes:
# extract the openstack parameters from node features:
if node not in features:
Expand All @@ -150,28 +121,38 @@ def resume():
'network': conn.network.find_network(os_parameters['network']),
'keypair': conn.compute.find_keypair(os_parameters['keypair']),
}
not_found = dict((k, v) for (k, v) in os_objects.items() if v is None)
not_found = dict([(k, os_parameters[k]) for (k, v) in os_objects.items() if v is None])
if not_found:
raise ValueError(
'Could not find openstack objects for: %s' %
', '.join(not_found))
'Could not find openstack objects for: '
', '.join([f'{k}={v}' for (k, v) in not_found.items()])
)

# get optional port - done outside os_objects so an error finding network doesn't cause unhelpful port traceback:
os_objects['port'] = conn.network.find_port(node, network_id=os_objects['network'].id)

if debug:
logger.info(f"os_objects for {node} : {os_objects}")
if not debug:
else:
logger.info(f"creating node {node}")
# TODO(stevebrasier): save id to disk so can use it instead of name
# on deletion (to cope with multiple instances with same name)
server = create_server(conn, node, **os_objects)
logger.info(f"server: {server}")
with open(os.path.join(statedir, node), 'w') as f:
f.write(server.id)
# Don't need scontrol update nodename={node} nodeaddr={server_ip}
# as using SlurmctldParameters=cloud_dns
created_instance_ids.append(server.id)

return created_instance_ids

def main():
    """Console entry point for the ResumeProgram.

    Reads the hostlist expression from ``sys.argv[1]``; any further argument
    enables debug mode, in which nodes are not actually created. Every
    exception (including a missing argument) is written to the log before
    being re-raised.
    """
    try:
        hostlist_expr = sys.argv[1]
        # presence of any extra argument is the debug switch
        debug = len(sys.argv) > 2
        if debug:
            logger.info("Running in debug mode - won't actually create nodes")
        # resume() is called exactly once, with the parsed arguments
        resume(hostlist_expr, debug)
    except BaseException:
        logger.exception('Exception in main:')
        raise
85 changes: 85 additions & 0 deletions slurm_openstack_tools/resumefail.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-

# See ../LICENCE

"""A Slurm ResumeFailProgram for OpenStack instances.

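This resumes any DOWN node for which there is no corresponding cloud instance. If the instance exists but is in ERROR with a 'not enough hosts available' fault, the instance is deleted and the node is resumed. Any other situation is only logged.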

Usage:

resumefail HOSTLIST_EXPRESSION

where: HOSTLIST_EXPRESSION: Name(s) of node(s) which have failed using Slurm's
hostlist expression, as per [1].

Output and exceptions are written to the syslog.

OpenStack credentials must be available to this script (e.g. via an
application credential in /etc/openstack/clouds.yaml readable by the slurm
user).

[1]: https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeFailProgram

"""

import logging.handlers
import os
import subprocess
import sys

import openstack

from slurm_openstack_tools.utils import expand_nodes

SCONTROL_PATH = '/usr/bin/scontrol'

# configure logging to syslog - by default only "info" and above
# categories appear
logger = logging.getLogger("syslogger")
logger.setLevel(logging.DEBUG)
handler = logging.handlers.SysLogHandler("/dev/log")
handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s'))
logger.addHandler(handler)

def _request_node_resume(node):
    """Ask slurmctld to resume ``node`` and log an error if scontrol fails."""
    scontrol = subprocess.run(
        [SCONTROL_PATH, 'update', 'state=resume', 'nodename=%s' % node],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
        universal_newlines=True)
    if scontrol.returncode != 0:
        logger.error(
            f"scontrol could not resume {node} "
            f"(exit status {scontrol.returncode}): {scontrol.stdout.strip()}")


def resumefail(hostlist_expr=None):
    """Try to recover nodes which failed to resume.

    For every node named by the hostlist expression:

    - no instance with the node's name exists: the node is set to
      ``state=resume`` so Slurm can try again;
    - the instance is in ERROR and its fault message contains
      "not enough hosts available": the instance is deleted and the node is
      set to ``state=resume``;
    - anything else: an error is logged and nothing is changed.

    :param hostlist_expr: Slurm hostlist expression naming the failed
        node(s). Defaults to ``sys.argv[1]`` when omitted.
    """
    if hostlist_expr is None:
        hostlist_expr = sys.argv[1]
    logger.info(f"Slurmctld invoked resumefail {hostlist_expr}")
    failed_nodes = expand_nodes(hostlist_expr)

    conn = openstack.connection.from_config()
    logger.info(f"Got openstack connection {conn}")

    for node in failed_nodes:
        server = conn.compute.find_server(node)
        if server is None:
            logger.info(f"No instance found for {node}, requesting resume of node.")
            _request_node_resume(node)
            continue

        # retrieve info ourselves for errors, not exposed through the SDK attributes:
        info = conn.compute.get(f"/servers/{server.id}").json()
        # https://docs.openstack.org/api-ref/compute/?expanded=show-server-details-detail#id30
        status = info['server']['status']
        fault_message = None
        if status == 'ERROR':
            fault_message = info['server'].get('fault', {}).get('message', None)

        if not fault_message:
            # Reached for non-ERROR instances and for ERROR instances without
            # a fault message, so no failed node is passed over silently.
            logger.error(f"Instance for {node} has status {status}. Cannot fix this.")
        elif "not enough hosts available" in fault_message:
            logger.info(f"Instance for {node} has error message '{fault_message}': Requesting instance delete and resume of node.")
            conn.compute.delete_server(server, ignore_missing=True, force=True)
            _request_node_resume(node)
        else:
            logger.error(f"Instance for {node} has error message '{fault_message}'. Cannot fix this.")

def main():
    """Console entry point: run resumefail() and log any exception.

    Whatever is raised (BaseException and subclasses) is recorded through
    the module logger and then propagated unchanged.
    """
    try:
        resumefail()
    except BaseException:
        # record the traceback in the log, then let the error propagate
        logger.exception('Exception in main:')
        raise

if __name__ == '__main__':
    # running for testing: also send log records to the console. Creating a
    # StreamHandler is not enough - it has to be attached to the logger.
    handler = logging.StreamHandler()
    logger.addHandler(handler)
    main()
60 changes: 17 additions & 43 deletions slurm_openstack_tools/suspend.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,6 @@
# -*- coding: utf-8 -*-

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# See ../LICENCE

"""A Slurm SuspendProgram to delete OpenStack instances.

Expand Down Expand Up @@ -41,6 +31,8 @@

import openstack

from slurm_openstack_tools.utils import get_slurm_conf, expand_nodes

# configure logging to syslog - by default only "info" and above
# categories appear
logger = logging.getLogger("syslogger")
Expand All @@ -49,56 +41,38 @@
handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s'))
logger.addHandler(handler)


def get_statesavelocation():
    """Look up StateSaveLocation in the output of ``scontrol show config``.

    Returns the last whitespace-separated token of the first output line
    that starts with ``StateSaveLocation`` (e.g. ``/var/spool/slurm``), or
    None when no such line is printed.
    """
    completed = subprocess.run(
        ['scontrol', 'show', 'config'],
        stdout=subprocess.PIPE, universal_newlines=True)
    # e.g. "StateSaveLocation = /var/spool/slurm" -> "/var/spool/slurm"
    candidates = (
        entry.split()[-1]
        for entry in completed.stdout.splitlines()
        if entry.startswith('StateSaveLocation')
    )
    return next(candidates, None)


def expand_nodes(hostlist_expr):
    """Expand a Slurm hostlist expression into individual node names.

    Runs ``scontrol show hostnames <hostlist_expr>`` and returns one entry
    per non-blank line of its output.

    :param hostlist_expr: Slurm hostlist expression naming one or more nodes.
    :returns: list of node names; empty when scontrol prints nothing.
    """
    scontrol = subprocess.run(
        ['scontrol', 'show', 'hostnames', hostlist_expr],
        stdout=subprocess.PIPE, universal_newlines=True)
    # ''.split('\n') is [''], which callers would treat as one node with an
    # empty name; dropping blank lines makes "no output" mean "no nodes".
    return [
        name.strip()
        for name in scontrol.stdout.splitlines()
        if name.strip()
    ]


def delete_server(conn, name):
    """Find the server called ``name`` and ask the compute service to delete it.

    The result of ``find_server`` is handed to ``delete_server`` as-is.
    """
    target = conn.compute.find_server(name)
    conn.compute.delete_server(target)


def suspend(hostlist_expr):
    """Delete the OpenStack instances backing the given Slurm nodes.

    For each node, the instance ID is read from the first line of the file
    named after the node in Slurm's StateSaveLocation, and that instance is
    deleted.

    :param hostlist_expr: Slurm hostlist expression naming the node(s).
    :returns: list of the instance IDs for which deletion was requested.
    :raises SystemExit: with status 1 as soon as a node has no instance
        file; nodes processed before it have already been deleted.
    """
    logger.info(f"Slurmctld invoked suspend {hostlist_expr}")
    remove_nodes = expand_nodes(hostlist_expr)

    conn = openstack.connection.from_config()
    logger.info(f"Got openstack connection {conn}")

    # looked up once: the location does not change between nodes
    statedir = get_slurm_conf()['StateSaveLocation']

    deleted_instance_ids = []
    for node in remove_nodes:
        instance_file = os.path.join(statedir, node)
        try:
            with open(instance_file) as f:
                instance_id = f.readline().strip()
        except FileNotFoundError:
            logger.error(f"no instance file found in {statedir} for node {node}")
            # sys.exit rather than the site-provided exit() builtin, which is
            # intended for interactive use and is not always defined
            sys.exit(1)

        logger.info(f"deleting node {instance_id}")
        conn.compute.delete_server(instance_id)
        deleted_instance_ids.append(instance_id)

    return deleted_instance_ids

def main():
    """Console entry point for the SuspendProgram.

    Reads the hostlist expression from ``sys.argv[1]`` and passes it to
    suspend(). Every exception (including a missing argument) is written to
    the log before being re-raised.
    """
    try:
        hostlist_expr = sys.argv[1]
        # suspend() is called exactly once, with the parsed argument
        suspend(hostlist_expr)
    except BaseException:
        logger.exception('Exception in main:')
        raise
19 changes: 19 additions & 0 deletions slurm_openstack_tools/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import subprocess

def get_slurm_conf():
    """Return Slurm's configuration as reported by ``scontrol show config``.

    Each ``Key = Value`` output line becomes one dict entry, with whitespace
    stripped from both key and value. A value may itself contain ``=``; only
    the first one separates key from value. Lines without ``=`` (the
    "Configuration data as of ..." banner, blank lines, section headings)
    are ignored.

    :returns: dict mapping parameter name to its value as a string,
        e.g. ``conf['StateSaveLocation']``.
    """
    scontrol = subprocess.run(
        ['scontrol', 'show', 'config'],
        stdout=subprocess.PIPE, universal_newlines=True,
    )
    config = {}
    for line in scontrol.stdout.splitlines():
        key, separator, value = line.partition('=')
        if not separator:
            # not a setting, e.g. 'Configuration data as of 2022-03-22T09:38:28'
            continue
        config[key.strip()] = value.strip()
    return config

def expand_nodes(hostlist_expr):
    """Expand a Slurm hostlist expression into individual node names.

    Runs ``scontrol show hostnames <hostlist_expr>`` and returns one entry
    per non-blank line of its output.

    :param hostlist_expr: Slurm hostlist expression naming one or more nodes.
    :returns: list of node names; empty when scontrol prints nothing.
    """
    scontrol = subprocess.run(
        ['scontrol', 'show', 'hostnames', hostlist_expr],
        stdout=subprocess.PIPE, universal_newlines=True)
    # ''.split('\n') is [''], which callers would treat as one node with an
    # empty name; dropping blank lines makes "no output" mean "no nodes".
    return [
        name.strip()
        for name in scontrol.stdout.splitlines()
        if name.strip()
    ]