From a12349276c99db03b91a5975366d37cad722666f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 30 Mar 2021 16:26:32 +0000 Subject: [PATCH 001/105] WIP autoscale PoC --- ansible/autoscale.yml | 24 +++ ansible/slurm.yml | 8 + ansible/templates/resume.j2 | 22 +++ ansible/templates/suspend.j2 | 8 + .../inventory/group_vars/all/openhpc.yml | 1 + environments/sausage-autoscale/README.md | 176 ++++++++++++++++++ environments/sausage-autoscale/activate | 23 +++ environments/sausage-autoscale/ansible.cfg | 14 ++ environments/sausage-autoscale/hooks/.gitkeep | 0 .../inventory/group_vars/all/.gitkeep | 0 .../inventory/group_vars/all/secrets.yml | 43 +++++ .../inventory/group_vars/autoscale.yml | 1 + .../inventory/group_vars/openhpc.yml | 19 ++ .../inventory/group_vars/rebuild.yml | 1 + .../sausage-autoscale/inventory/groups | 17 ++ .../sausage-autoscale/inventory/hosts | 31 +++ .../terraform/.terraform.lock.hcl | 39 ++++ .../sausage-autoscale/terraform/inventory.tpl | 32 ++++ .../sausage-autoscale/terraform/main.tf | 131 +++++++++++++ .../terraform/terraform.tfvars | 10 + 20 files changed, 600 insertions(+) create mode 100644 ansible/autoscale.yml create mode 100644 ansible/templates/resume.j2 create mode 100644 ansible/templates/suspend.j2 create mode 100644 environments/sausage-autoscale/README.md create mode 100644 environments/sausage-autoscale/activate create mode 100644 environments/sausage-autoscale/ansible.cfg create mode 100644 environments/sausage-autoscale/hooks/.gitkeep create mode 100644 environments/sausage-autoscale/inventory/group_vars/all/.gitkeep create mode 100644 environments/sausage-autoscale/inventory/group_vars/all/secrets.yml create mode 100644 environments/sausage-autoscale/inventory/group_vars/autoscale.yml create mode 100644 environments/sausage-autoscale/inventory/group_vars/openhpc.yml create mode 100644 environments/sausage-autoscale/inventory/group_vars/rebuild.yml create mode 100644 environments/sausage-autoscale/inventory/groups create mode 100755 environments/sausage-autoscale/inventory/hosts create mode 100644 environments/sausage-autoscale/terraform/.terraform.lock.hcl create mode 100644 environments/sausage-autoscale/terraform/inventory.tpl create mode 100644 environments/sausage-autoscale/terraform/main.tf create mode 100644 environments/sausage-autoscale/terraform/terraform.tfvars diff --git a/ansible/autoscale.yml b/ansible/autoscale.yml new file mode 100644 index 000000000..1e179deeb --- /dev/null +++ b/ansible/autoscale.yml @@ -0,0 +1,24 @@ +- name: Copy out clouds.yaml + copy: + src: "{{ openhpc_autoscale_clouds }}" + dest: /etc/openstack/clouds.yaml + owner: slurm + group: slurm + mode: '0400' +- name: Setup slurm tools + include_role: + name: stackhpc.slurm_openstack_tools.pytools +- name: Create SuspendProgram + template: + src: suspend.j2 + dest: /opt/slurm-tools/bin/suspend.sh + owner: slurm + group: slurm + mode: u=rwx,go= +- name: Create ResumeProgram + template: + src: resume.j2 + dest: /opt/slurm-tools/bin/resume + owner: slurm + group: slurm + mode: u=rwx,go= diff --git a/ansible/slurm.yml b/ansible/slurm.yml index e7a0cb4c9..f94145f81 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -35,6 +35,14 @@ - import_role: name: stackhpc.slurm_openstack_tools.rebuild +- name: Setup autoscaling suspend/resume programs + hosts: autoscale # this is the *controller* + become: yes + tags: + - autoscale + tasks: + - import_tasks: autoscale.yml + - name: Set locked memory limits on user-facing nodes hosts: - compute diff --git 
a/ansible/templates/resume.j2 b/ansible/templates/resume.j2 new file mode 100644 index 000000000..748b5f270 --- /dev/null +++ b/ansible/templates/resume.j2 @@ -0,0 +1,22 @@ +#!/opt/slurm-tools/bin/python3 +""" Create OpenStack instances """ + +import sys, subprocess + +# configure logging to syslog - by default only "info" +# and above categories appear +logger = logging.getLogger("syslogger") +logger.setLevel(logging.DEBUG) +handler = logging.handlers.SysLogHandler("/dev/log") +logger.addHandler(handler) + +def expand_nodes(hostlist_expr): + scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) # TODO: pass full path to binary + return scontrol.stdout.strip().split('\n') + +def launch(): + hostlist_expr = sys.argv[1:] + logger.info(f"Resume invoked for %{hostexphostlist_expr}") + nodes = expand_nodes(hostlist_expr) + for node in nodes: + logger.info(f"TODO: Resume node %{node}") diff --git a/ansible/templates/suspend.j2 b/ansible/templates/suspend.j2 new file mode 100644 index 000000000..1df641b6d --- /dev/null +++ b/ansible/templates/suspend.j2 @@ -0,0 +1,8 @@ +#!/bin/bash +# Example SuspendProgram +echo "`date` Suspend invoked $0 $*" >>/var/log/power_save.log +hosts=`scontrol show hostnames $1` +for host in $hosts +do + openstack server delete $host +done diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 028b932f4..b6d8abacf 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -15,6 +15,7 @@ openhpc_slurmdbd_mysql_username: slurm openhpc_slurm_control_host: "{{ groups['control'] | first }}" openhpc_slurm_partitions: - name: "compute" + # cloud_nodes: 2 openhpc_default_packages: - slurm-libpmi-ohpc # to allow intel mpi to work properly - ohpc-gnu9-openmpi4-perf-tools # for hpctests diff --git a/environments/sausage-autoscale/README.md b/environments/sausage-autoscale/README.md new file mode 100644 index 000000000..69f25a09e --- /dev/null +++ b/environments/sausage-autoscale/README.md @@ -0,0 +1,176 @@ +# Sausage-Autoscale cluster + +Dev env for autoscaling on sausagecloud + +# Directory structure + +## terraform + +Contains terraform configuration to deploy infrastructure. + +## inventory + +Ansible inventory for configuring the infrastructure. + +# Setup + +In the repo root, run: + + python3 -m venv venv # TODO: do we need system-site-packages? + . venv/bin/activate + pip install -U upgrade pip + pip install requirements.txt + ansible-galaxy install -r requirements.yml -p ansible/roles + ansible-galaxy collection install -r requirements.yml -p ansible/collections # don't worry about collections path warning + +# Activating the environment + +There is a small environment file that you must `source` which defines environment +variables that reference the configuration path. This is so that we can locate +resources relative the environment directory. + + . environments/sausage-autoscale/activate + +The pattern we use is that all resources referenced in the inventory +are located in the environment directory containing the inventory that +references them. + +# Common configuration + +Configuarion is shared by specifiying multiple inventories. We reference the `common` +inventory from `ansible.cfg`, including it before the environment specific +inventory, located at `./inventory`. 
+ +Inventories specified later in the list can override values set in the inventories +that appear earlier. This allows you to override values set by the `common` inventory. + +Any variables that would be identical for all environments should be defined in the `common` inventory. + +# Passwords + +Prior to running any other playbooks, you need to define a set of passwords. You can +use the `generate-passwords.yml` playbook to automate this process: + +``` +cd +ansible-playbook ansible/adhoc/generate-passwords.yml # can actually be run from anywhere once environment activated +``` + +This will output a set of passwords `inventory/group_vars/all/secrets.yml`. +Placing them in the inventory means that they will be defined for all playbooks. + +It is recommended to encrypt the contents of this file prior to commiting to git: + +``` +ansible-vault encrypt inventory/group_vars/all/secrets.yml +``` + +You will then need to provide a password when running the playbooks e.g: + +``` +ansible-playbook ../ansible/site.yml --tags grafana --ask-vault-password +``` + +See the [Ansible vault documentation](https://docs.ansible.com/ansible/latest/user_guide/vault.html) for more details. + + +# Deploy nodes with Terraform + +- Modify the keypair in `main.tf` and ensure the required Centos images are available on OpenStack. +- Activate the virtualenv and create the instances: + + . venv/bin/activate + cd environments/sausage-autoscale/ + terraform apply + +This creates an ansible inventory file `./inventory`. + +Note that this terraform deploys instances onto an existing network - for production use you probably want to create a network for the cluster. + +# Create and configure cluster with Ansible + +Now run one or more playbooks using: + + cd + ansible-playbook ansible/site.yml + +This provides: +- grafana at `http://:3000` - username `grafana`, password as set above +- prometheus at `http://:9090` + +NB: if grafana's yum repos are down you will see `Errors during downloading metadata for repository 'grafana' ...`. You can work around this using: + + ssh centos@ + sudo rm -rf /etc/yum.repos.d/grafana.repo + wget https://dl.grafana.com/oss/release/grafana-7.3.1-1.x86_64.rpm + sudo yum install grafana-7.3.1-1.x86_64.rpm + exit + ansible-playbook -i inventory monitoring.yml -e grafana_password= --skip-tags grafana_install + +# rebuild.yml + +# FIXME: outdated + +Enable the compute nodes of a Slurm-based OpenHPC cluster on Openstack to be reimaged from Slurm. + +For full details including the Slurm commmands to use see the [role's README](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools/blob/main/roles/rebuild/README.md) + +Ensure you have `~/.config/openstack/clouds.yaml` defining authentication for a a single Openstack cloud (see above README to change location). + +Then run: + + ansible-playbook -i inventory rebuild.yml + +Note this does not rebuild the nodes, only deploys the tools to do so. + +# test.yml + +This runs MPI-based tests on the cluster: +- `pingpong`: Runs Intel MPI Benchmark's IMB-MPI1 pingpong between a pair of (scheduler-selected) nodes. Reports zero-size message latency and maximum bandwidth. +- `pingmatrix`: Runs a similar pingpong test but between all pairs of nodes. Reports zero-size message latency & maximum bandwidth. +- `hpl-solo`: Runs HPL **separately** on all nodes, using 80% of memory, reporting Gflops on each node. + +These names can be used as tags to run only a subset of tests. 
For full details see the [role's README](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools/blob/main/roles/test/README.md). + +Note these are intended as post-deployment tests for a cluster to which you have root access - they are **not** intended for use on a system running production jobs: +- Test directories are created within `openhpc_tests_rootdir` (here `/mnt/nfs/ohcp-tests`) which must be on a shared filesystem (read/write from login/control and compute nodes) +- Generally, packages are only installed on the control/login node, and `/opt` is exported via NFS to the compute nodes. +- The exception is the `slurm-libpmi-ohpc` package (required for `srun` with Intel MPI) which is installed on all nodes. + +To achieve best performance for HPL set `openhpc_tests_hpl_NB` in [test.yml](test.yml) to the appropriate the HPL blocksize 'NB' for the compute node processor - for Intel CPUs see [here](https://software.intel.com/content/www/us/en/develop/documentation/mkl-linux-developer-guide/top/intel-math-kernel-library-benchmarks/intel-distribution-for-linpack-benchmark/configuring-parameters.html). + +Then run: + + ansible-playbook ../ansible/adhoc/test.yml + +Results will be reported in the ansible stdout - the pingmatrix test also writes an html results file onto the ansible host. + +Note that you can still use the `test.yml` playbook even if the terraform/ansible in this repo wasn't used to deploy the cluster - as long as it's running OpenHPC v2. Simply create an appropriate `inventory` file, e.g: + + [all:vars] + ansible_user=centos + + [cluster:children] + cluster_login + cluster_compute + + [cluster_login] + slurm-control + + [cluster_compute] + cpu-h21a5-u3-svn2 + cpu-h21a5-u3-svn4 + ... + +And run the `test.yml` playbook as described above. 
If you want to run tests only on a group from this inventory, rather than an entire partition, you can +use ``--limit`` + +Then running the tests passing this file as extra_vars: + + ansible-playbook ../ansible/test.yml --limit group-in-inventory + +# Destroying the cluster + +When finished, run: + + terraform destroy --auto-approve diff --git a/environments/sausage-autoscale/activate b/environments/sausage-autoscale/activate new file mode 100644 index 000000000..e74031095 --- /dev/null +++ b/environments/sausage-autoscale/activate @@ -0,0 +1,23 @@ +export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) +echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" + +APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) +export PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" + +export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") +echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" + +export TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") +echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root" + +export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") +echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root" + +export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT") +echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" + +if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then + export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg +fi + + diff --git a/environments/sausage-autoscale/ansible.cfg b/environments/sausage-autoscale/ansible.cfg new file mode 100644 index 000000000..c243e9958 --- /dev/null +++ b/environments/sausage-autoscale/ansible.cfg @@ -0,0 +1,14 @@ +[defaults] +any_errors_fatal = True +stdout_callback = debug +stderr_callback = debug +gathering = smart +forks = 30 +host_key_checking = False +inventory = ../common/inventory,inventory +collections_path = ../../ansible/collections +roles_path = ../../ansible/roles + +[ssh_connection] +ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +pipelining = True diff --git a/environments/sausage-autoscale/hooks/.gitkeep b/environments/sausage-autoscale/hooks/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/environments/sausage-autoscale/inventory/group_vars/all/.gitkeep b/environments/sausage-autoscale/inventory/group_vars/all/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/environments/sausage-autoscale/inventory/group_vars/all/secrets.yml b/environments/sausage-autoscale/inventory/group_vars/all/secrets.yml new file mode 100644 index 000000000..29f86e2dd --- /dev/null +++ b/environments/sausage-autoscale/inventory/group_vars/all/secrets.yml @@ -0,0 +1,43 @@ +--- +# Ansible managed +secrets_openhpc_elasticsearch_admin_password: pC2:.3c1QShckz1BxB5u +secrets_openhpc_elasticsearch_kibana_password: TVevBQ2JXTyoSW.bT_ba +secrets_openhpc_grafana_admin_password: QMiyymbFD.YC7M.39mCa +secrets_openhpc_mungekey: + content: 'YCbG2tA9UWIezb3xRRqK2vNzd6tOnGMs0kpPO+quHuqGXLsXagFjXq8Kqnd1/UIqoRW/TH3AfRNF + + yhuvemkRd3TlZLrvBZzMcQw0jAP2sI/+4hPpeIEk1kcQFVgE9A1HppLc0CxI7SskDPmY3vGwnuo7 + + 4K19jYxgPkIb9FUKCNetKgHR7L78LsbZxWUYkmvO6rCDUeLgMchFkjoi5Lwr+i1JJAxoGhT0yLmW + + D0hEenYePgsefzopwEnKEHByhnx0ROlJ86S58bh+rOnAqBWWJ8Im71NeJS58Moyrh9VLOkmRUCIj + + e0bhEKd7+/a5I4GN6KIo1oXRT74TxVHkwypSqFgAbVF5KMSsuY+5eG4JLcpTOGZYQpbAY9ICtnjM + + 
U6T6YhVXYvurVcb7N2ybub8veIwWeS98Yr2C9ZwsBzvpA2Fk3wxCFjo6vxe47U2nsezIUAUxVH7U + + V6jNMVoT4GZMQcKRsTp0zoAVAund6jMjsQ6h6Ly8EYyiKz6itTq4L5OqotZ0tUCX6xnVxtaD4LZb + + tQfZbcxPdd1C7NtTfImUsxHDp2CBIu/VDZWen/iafGaPeI83XVkC8Kk1QwhjBcRnlJEw2cK4TdBW + + 6Soy8CnNZMd92iqlqIZs7iZHu9FLyLiLCrkjaDnxM0UH0RP9CPTihiE47w874HVOQioacNX6U3Dz + + 3I0vxUAd/AF6ZrmBtOh3EekbxPtFNY7Gw3qPCbbJM6v5+XFjz//Lj1GFYzGK1DA7jhekzk5vtOWe + + k2vZcyqPYOIxFlqtm3OGt+f6V9G/xvYvRofE1EbO9qU1vqVRbW8Z7dqOR4AwomW2UlhH9G/ijLZZ + + EKqOWiCVONfMEe+Cndi/WH80R/nASx8hMJrTp0VOPtNjN+LWb/pPE/cSY9hbuA2EvqJB4gFQzmqz + + sFpQAqPVS8/+vesiKKVcnxUeMoRBx8g9CmdFTIvz5fU6M9lh7DjYoKcKx7eKtQhCAktyeI21o+Tn + + 2gyALzcxX29VCJy/8n/8qC26T9wLjN2URpO95yT2+k+Uv96R6Uj4zK4CD2c7yXm/0CmyrUWf3EPp + + VeaaWhy+KKR7T923TCEETiwSlwOynwb4lHLPmE17t8XBqYAqWGL2e8uDuLYhPAf+U5Bwt+LiXO5j + + hjg6szwpdSirDl1vpkqTDaOGdzVcTb+j8NfHDCdVOJbWu2I8sAHkjDRl+faagwxeMIGpTjoRi225 + + mj9rJdCbZxCSrwbE1r1koHrJZ+XN4AG4OrmTXdXMSLhpJuptyeNsRmvWxBe665tAxktRZ/kQUY3c + + W1zq03n3wtBkilL1wh/Fata4XrN5UZhpVSwT+7Z3gPacJVt5UjedkqpW8br+Pxw4efQExeDH2g==' +secrets_openhpc_mysql_root_password: XuY4ATpIzRje.PhkXI-t +secrets_openhpc_mysql_slurm_password: bS7GCWoTtsf4cjLo70S5 diff --git a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml new file mode 100644 index 000000000..6976f8117 --- /dev/null +++ b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml @@ -0,0 +1 @@ +openhpc_autoscale_clouds: ~/steveb-openrc.sh \ No newline at end of file diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml new file mode 100644 index 000000000..40ada5b98 --- /dev/null +++ b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml @@ -0,0 +1,19 @@ +openhpc_login_only_nodes: '' +openhpc_slurm_conf: + SlurmctldDebug: debug5 + SlurmctldLogFile: /var/log/slurmctld.log + SlurmdDebug: debug5 + SlurmdLogFile: /var/log/slurmd.log + SuspendExcNodes: "{{ groups['compute'] | join(',') }}" # i.e. all tf-defined nodes in the partition + # BELOW FOR TESTING ONLY, NOT PRODUCTION VALUES! 
+ SuspendTime: 120 + SuspendTimeout: 300 + SuspendProgram: /opt/slurm/suspend.sh + ResumeProgram: /opt/slurm/resume.sh + ResumeTimeout: 300 + SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns,power_save_interval=10,power_save_min_interval=0 + NOTES: + # enable_configless: required (as set in template) + # cloud_dns: requires working DNS + # power_save_*interval: options are defaults but should enable changes + # reboot_from_controller: should be really useful but actually we're already setup for rebuild on computes, so use that diff --git a/environments/sausage-autoscale/inventory/group_vars/rebuild.yml b/environments/sausage-autoscale/inventory/group_vars/rebuild.yml new file mode 100644 index 000000000..b2eba881a --- /dev/null +++ b/environments/sausage-autoscale/inventory/group_vars/rebuild.yml @@ -0,0 +1 @@ +openhpc_rebuild_clouds: ~/steveb-openrc.sh diff --git a/environments/sausage-autoscale/inventory/groups b/environments/sausage-autoscale/inventory/groups new file mode 100644 index 000000000..4cec8ab8f --- /dev/null +++ b/environments/sausage-autoscale/inventory/groups @@ -0,0 +1,17 @@ +[control:children] +login + +[nfs:children] +cluster + +[openhpc:children] +cluster + +[mysql:children] +control + +[rebuild:children] +compute + +[autoscale:children] +login # actually controller diff --git a/environments/sausage-autoscale/inventory/hosts b/environments/sausage-autoscale/inventory/hosts new file mode 100755 index 000000000..203a41376 --- /dev/null +++ b/environments/sausage-autoscale/inventory/hosts @@ -0,0 +1,31 @@ +[all:vars] +ansible_user=centos +ssh_proxy=10.0.3.100 +openhpc_cluster_name=sbscale + +[sbscale_login] +sbscale-login-0 ansible_host=10.0.3.100 server_networks='{"stackhpc":["10.0.3.100"]}' + +[sbscale_compute] +sbscale-compute-0 ansible_host=10.0.3.107 server_networks='{"stackhpc":["10.0.3.107"]}' +sbscale-compute-1 ansible_host=10.0.3.71 server_networks='{"stackhpc":["10.0.3.71"]}' + +[sbscale_compute:vars] +ansible_ssh_common_args='-o ProxyCommand="ssh centos@10.0.3.100 -W %h:%p"' + +[cluster_login:children] +sbscale_login + +# NOTE: This is hardcoded in the tests role +[cluster_compute:children] +sbscale_compute + +[login:children] +cluster_login + +[compute:children] +cluster_compute + +[cluster:children] +login +compute \ No newline at end of file diff --git a/environments/sausage-autoscale/terraform/.terraform.lock.hcl b/environments/sausage-autoscale/terraform/.terraform.lock.hcl new file mode 100644 index 000000000..8f9e2298d --- /dev/null +++ b/environments/sausage-autoscale/terraform/.terraform.lock.hcl @@ -0,0 +1,39 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/local" { + version = "2.1.0" + hashes = [ + "h1:EYZdckuGU3n6APs97nS2LxZm3dDtGqyM4qaIvsmac8o=", + "zh:0f1ec65101fa35050978d483d6e8916664b7556800348456ff3d09454ac1eae2", + "zh:36e42ac19f5d68467aacf07e6adcf83c7486f2e5b5f4339e9671f68525fc87ab", + "zh:6db9db2a1819e77b1642ec3b5e95042b202aee8151a0256d289f2e141bf3ceb3", + "zh:719dfd97bb9ddce99f7d741260b8ece2682b363735c764cac83303f02386075a", + "zh:7598bb86e0378fd97eaa04638c1a4c75f960f62f69d3662e6d80ffa5a89847fe", + "zh:ad0a188b52517fec9eca393f1e2c9daea362b33ae2eb38a857b6b09949a727c1", + "zh:c46846c8df66a13fee6eff7dc5d528a7f868ae0dcf92d79deaac73cc297ed20c", + "zh:dc1a20a2eec12095d04bf6da5321f535351a594a636912361db20eb2a707ccc4", + "zh:e57ab4771a9d999401f6badd8b018558357d3cbdf3d33cc0c4f83e818ca8e94b", + "zh:ebdcde208072b4b0f8d305ebf2bfdc62c926e0717599dcf8ec2fd8c5845031c3", + "zh:ef34c52b68933bedd0868a13ccfd59ff1c820f299760b3c02e008dc95e2ece91", + ] +} + +provider "registry.terraform.io/terraform-provider-openstack/openstack" { + version = "1.40.0" + hashes = [ + "h1:gBrsytNqUG1ZQPKys8KAvZkjesjimXb7vcrTmyFUTM0=", + "zh:278a878a256ec5447e1e64b5d9a691e3a1f7d5c247e536500c97c5b996bc2531", + "zh:5c7ae8cfe0831557c8c1988581f3fd0bdf182d15bcefbe645bb91564027e67d4", + "zh:944d75fc1e3d54df4c47e5d34007927abf4fa79e2107b05d14f11b52970a6164", + "zh:a50922d05185598a9264a25eff6f01ce7671c70a562a3ef93e9bb7a449e358b0", + "zh:adb87ad3782f1f7a5eaeedbcffa0e5559d2372502f9af91781aa13c11cf4b47b", + "zh:c0e4218259a37f16c10b4779009f0b0b5d467e4d347fc2aa3a212f1ee3a71d63", + "zh:c2eb4f40cbd78238500a3a84ba995060bfc50f770bd13732ae50b73687f3dce6", + "zh:ca8a38fe932972d0d7fdc51f84ae775648b7aff3c96b8ead085007e880ee987f", + "zh:ce4f703719d646507d6006085dc1114954c75710226df43078169b2b01993537", + "zh:e29542a492bbf55613d20b5f68ed4357cbc8bb09d61a1752d2976e5e1608879d", + "zh:e68d47b85b9da089f8f7102c23545331c15a9e6ea99875926d2ebf6e38bf2073", + "zh:fdb10cb345250d7c47e342def106bd10ef75493ef6edf15809e10e6367a0d9f6", + ] +} diff --git a/environments/sausage-autoscale/terraform/inventory.tpl b/environments/sausage-autoscale/terraform/inventory.tpl new file mode 100644 index 000000000..361a359ec --- /dev/null +++ b/environments/sausage-autoscale/terraform/inventory.tpl @@ -0,0 +1,32 @@ +[all:vars] +ansible_user=centos +ssh_proxy=${login.network[0].fixed_ip_v4} +openhpc_cluster_name=${cluster_name} + +[${cluster_name}_login] +${login.name} ansible_host=${login.network[0].fixed_ip_v4} server_networks='${jsonencode({for net in login.network: net.name => [ net.fixed_ip_v4 ] })}' + +[${cluster_name}_compute] +%{ for compute in computes ~} +${compute.name} ansible_host=${compute.network[0].fixed_ip_v4} server_networks='${jsonencode({for net in compute.network: net.name => [ net.fixed_ip_v4 ] })}' +%{ endfor ~} + +[${cluster_name}_compute:vars] +ansible_ssh_common_args='-o ProxyCommand="ssh centos@${login.network[0].fixed_ip_v4} -W %h:%p"' + +[cluster_login:children] +${cluster_name}_login + +# NOTE: This is hardcoded in the tests role +[cluster_compute:children] +${cluster_name}_compute + +[login:children] +cluster_login + +[compute:children] +cluster_compute + +[cluster:children] +login +compute \ No newline at end of file diff --git a/environments/sausage-autoscale/terraform/main.tf b/environments/sausage-autoscale/terraform/main.tf new file mode 100644 index 000000000..1523eeae5 --- /dev/null +++ b/environments/sausage-autoscale/terraform/main.tf @@ -0,0 +1,131 @@ +terraform { + required_version = ">= 0.14" + required_providers { + openstack = { + source = 
"terraform-provider-openstack/openstack" + } + } +} + +variable "environment_root" { + type = string +} + +variable "compute_names" { + default = ["compute-0", "compute-1"] +} + +variable "cluster_name" { + default = "testohpc" +} + +variable "key_pair" { + type = string +} + +variable "network" { + type = string +} + +variable "login_flavor" { + type = string +} + +variable "login_image" { + type = string +} + +variable "compute_flavor" { + type = string +} + +variable "compute_image" { + type = string +} + +resource "openstack_networking_secgroup_v2" "secgroup_slurm_login" { + name = "secgroup_slurm_login" + description = "Rules for the slurm login node" + # Fully manage with terraform + delete_default_rules = true +} + +resource "openstack_networking_secgroup_v2" "secgroup_slurm_compute" { + name = "secgroup_slurm_compute" + description = "Rules for the slurm compute node" + # Fully manage with terraform + delete_default_rules = true +} + +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_egress_v4" { + direction = "egress" + ethertype = "IPv4" + security_group_id = openstack_networking_secgroup_v2.secgroup_slurm_login.id +} + +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_ingress_tcp_v4" { + direction = "ingress" + ethertype = "IPv4" + # NOTE: You will want to lock down the ports in a production environment. This will require + # setting of static ports for the NFS server see: + # https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/6/html/storage_administration_guide/s2-nfs-nfs-firewall-config + port_range_min = 1 + protocol = "tcp" + port_range_max = 65535 + security_group_id = openstack_networking_secgroup_v2.secgroup_slurm_login.id +} + +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_compute_rule_egress_v4" { + direction = "egress" + ethertype = "IPv4" + security_group_id = openstack_networking_secgroup_v2.secgroup_slurm_compute.id +} + +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_compute_rule_ingress_tcp_v4" { + direction = "ingress" + ethertype = "IPv4" + port_range_min = 1 + protocol = "tcp" + port_range_max = 65535 + security_group_id = openstack_networking_secgroup_v2.secgroup_slurm_compute.id +} + +resource "openstack_compute_instance_v2" "login" { + + name = "${var.cluster_name}-login-0" + image_name = var.login_image + flavor_name = var.login_flavor + key_pair = var.key_pair + network { + name = var.network + } + security_groups = [openstack_networking_secgroup_v2.secgroup_slurm_login.name] +} + + +resource "openstack_compute_instance_v2" "compute" { + + for_each = toset(var.compute_names) + + name = "${var.cluster_name}-${each.value}" + image_name = var.compute_image + flavor_name = var.compute_flavor + #flavor_name = "compute-A" + key_pair = var.key_pair + network { + name = var.network + } + security_groups = [openstack_networking_secgroup_v2.secgroup_slurm_compute.name] +} + +# TODO: needs fixing for case where creation partially fails resulting in "compute.network is empty list of object" +resource "local_file" "hosts" { + content = templatefile("${path.module}/inventory.tpl", + { + "cluster_name": var.cluster_name + "login": openstack_compute_instance_v2.login, + "computes": openstack_compute_instance_v2.compute, + }, + ) + filename = "${var.environment_root}/inventory/hosts" +} \ No newline at end of file diff --git a/environments/sausage-autoscale/terraform/terraform.tfvars b/environments/sausage-autoscale/terraform/terraform.tfvars new file mode 100644 
index 000000000..04bfb7ade --- /dev/null +++ b/environments/sausage-autoscale/terraform/terraform.tfvars @@ -0,0 +1,10 @@ +compute_names = ["compute-0", "compute-1"] +cluster_name = "sbscale" +key_pair = "steveb-local" +network = "stackhpc" + +login_image = "CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64" +login_flavor = "chipolata" + +compute_image = "CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64" +compute_flavor = "chipolata" From c9c9bfc6795131a9cef6434e12e7d73e2dd944a6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 31 Mar 2021 06:57:50 +0000 Subject: [PATCH 002/105] add IMB package to allow testing --- .../sausage-autoscale/inventory/group_vars/openhpc.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml index 40ada5b98..f15cc50d4 100644 --- a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml +++ b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml @@ -1,4 +1,9 @@ openhpc_login_only_nodes: '' +openhpc_packages: + - slurm-libpmi-ohpc + - wget + - lmod-defaults-gnu9-openmpi4-ohpc + - imb-gnu9-openmpi4-ohpc openhpc_slurm_conf: SlurmctldDebug: debug5 SlurmctldLogFile: /var/log/slurmctld.log From ab14526837d954cbf992bbf9e3dc679eaec2f2c9 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 31 Mar 2021 07:36:40 +0000 Subject: [PATCH 003/105] move cloud_nodes config to right environment --- environments/common/inventory/group_vars/all/openhpc.yml | 6 ++++++ .../sausage-autoscale/inventory/group_vars/openhpc.yml | 7 +++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index b6d8abacf..f1bae139b 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -15,6 +15,7 @@ openhpc_slurmdbd_mysql_username: slurm openhpc_slurm_control_host: "{{ groups['control'] | first }}" openhpc_slurm_partitions: - name: "compute" +<<<<<<< HEAD # cloud_nodes: 2 openhpc_default_packages: - slurm-libpmi-ohpc # to allow intel mpi to work properly @@ -22,5 +23,10 @@ openhpc_default_packages: openhpc_extra_packages: [] openhpc_packages: "{{ openhpc_default_packages + openhpc_extra_packages }}" openhpc_munge_key: "{{ secrets_openhpc_mungekey | b64decode }}" +======= +openhpc_packages: + - slurm-libpmi-ohpc +slurm_munge_key: "{{ secrets_openhpc_mungekey }}" +>>>>>>> b8d9eba... move cloud_nodes config to right environment openhpc_slurm_configless: true openhpc_login_only_nodes: login \ No newline at end of file diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml index f15cc50d4..9f69b0f9f 100644 --- a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml +++ b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml @@ -1,4 +1,7 @@ openhpc_login_only_nodes: '' +openhpc_slurm_partitions: + - name: "compute" + cloud_nodes: 2 openhpc_packages: - slurm-libpmi-ohpc - wget @@ -13,8 +16,8 @@ openhpc_slurm_conf: # BELOW FOR TESTING ONLY, NOT PRODUCTION VALUES! 
SuspendTime: 120 SuspendTimeout: 300 - SuspendProgram: /opt/slurm/suspend.sh - ResumeProgram: /opt/slurm/resume.sh + SuspendProgram: /opt/slurm-tools/bin/suspend.sh + ResumeProgram: /opt/slurm-tools/bin/resume ResumeTimeout: 300 SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns,power_save_interval=10,power_save_min_interval=0 NOTES: From 67b16a4d98feabfa5b8d90afc7cb26fc2d3344b5 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 31 Mar 2021 10:03:30 +0000 Subject: [PATCH 004/105] fix /etc/openstack permissions for resume --- ansible/autoscale.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ansible/autoscale.yml b/ansible/autoscale.yml index 1e179deeb..9a4bb4879 100644 --- a/ansible/autoscale.yml +++ b/ansible/autoscale.yml @@ -5,6 +5,13 @@ owner: slurm group: slurm mode: '0400' +- name: Ensure /etc/openstack/ is readable by slurm # TODO: think this clashes with rebuild? + file: + path: /etc/openstack/ + state: directory + owner: slurm + group: slurm + mode: u=rx - name: Setup slurm tools include_role: name: stackhpc.slurm_openstack_tools.pytools @@ -22,3 +29,4 @@ owner: slurm group: slurm mode: u=rwx,go= + tags: resume From a618acac172048f7be5c295ed9542d34671f18cf Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 31 Mar 2021 10:03:54 +0000 Subject: [PATCH 005/105] fix clouds.yaml --- .../sausage-autoscale/inventory/group_vars/autoscale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml index 6976f8117..5216ebbc8 100644 --- a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml +++ b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml @@ -1 +1 @@ -openhpc_autoscale_clouds: ~/steveb-openrc.sh \ No newline at end of file +openhpc_autoscale_clouds: /home/centos/steveb-clouds.yaml \ No newline at end of file From 341a5c99668c33522a77d35ce70322ef90edde11 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 31 Mar 2021 12:50:12 +0000 Subject: [PATCH 006/105] get resume/suspend scripts working manually --- ansible/autoscale.yml | 3 +- ansible/templates/resume.j2 | 50 +++++++++++++++---- ansible/templates/suspend.j2 | 43 +++++++++++++--- .../inventory/group_vars/autoscale.yml | 7 ++- 4 files changed, 83 insertions(+), 20 deletions(-) diff --git a/ansible/autoscale.yml b/ansible/autoscale.yml index 9a4bb4879..d9dc9b057 100644 --- a/ansible/autoscale.yml +++ b/ansible/autoscale.yml @@ -18,10 +18,11 @@ - name: Create SuspendProgram template: src: suspend.j2 - dest: /opt/slurm-tools/bin/suspend.sh + dest: /opt/slurm-tools/bin/suspend owner: slurm group: slurm mode: u=rwx,go= + tags: suspend - name: Create ResumeProgram template: src: resume.j2 diff --git a/ansible/templates/resume.j2 b/ansible/templates/resume.j2 index 748b5f270..bf79c6c04 100644 --- a/ansible/templates/resume.j2 +++ b/ansible/templates/resume.j2 @@ -1,22 +1,52 @@ #!/opt/slurm-tools/bin/python3 """ Create OpenStack instances """ -import sys, subprocess +import sys, subprocess, logging.handlers +import openstack +import pprint -# configure logging to syslog - by default only "info" -# and above categories appear +# all take a name or ID: +IMAGE = "{{ openhpc_autoscale_image }}" +NETWORK = "{{ openhpc_autoscale_network }}" +FLAVOR = "{{ openhpc_autoscale_flavor }}" +KEYPAIR = "{{ openhpc_autoscale_keypair }}" + +# configure logging to syslog - by default only "info" and above categories appear logger = 
logging.getLogger("syslogger") logger.setLevel(logging.DEBUG) handler = logging.handlers.SysLogHandler("/dev/log") logger.addHandler(handler) def expand_nodes(hostlist_expr): - scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) # TODO: pass full path to binary + scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) return scontrol.stdout.strip().split('\n') -def launch(): - hostlist_expr = sys.argv[1:] - logger.info(f"Resume invoked for %{hostexphostlist_expr}") - nodes = expand_nodes(hostlist_expr) - for node in nodes: - logger.info(f"TODO: Resume node %{node}") +def create_server(conn, name): + + image = conn.compute.find_image(IMAGE) + flavor = conn.compute.find_flavor(FLAVOR) + network = conn.network.find_network(NETWORK) + keypair = conn.compute.find_keypair(KEYPAIR) + + server = conn.compute.create_server( + name=name, image_id=image.id, flavor_id=flavor.id, + networks=[{"uuid": network.id}], key_name=keypair.name) + + #server = conn.compute.wait_for_server(server) + return server + +def resume(): + hostlist_expr = sys.argv[1] + logger.info(f"Slurmctld invoked resume {hostlist_expr}") + new_nodes = expand_nodes(hostlist_expr) + + conn = openstack.connection.from_config() + logger.info(f"Got openstack connection {conn}") + + for node in new_nodes: + logger.info(f"creating node {node}") + server = create_server(conn, node) + logger.info(f"server: {server}") + +if __name__ == "__main__": + sys.exit(resume()) diff --git a/ansible/templates/suspend.j2 b/ansible/templates/suspend.j2 index 1df641b6d..02d09bc0d 100644 --- a/ansible/templates/suspend.j2 +++ b/ansible/templates/suspend.j2 @@ -1,8 +1,35 @@ -#!/bin/bash -# Example SuspendProgram -echo "`date` Suspend invoked $0 $*" >>/var/log/power_save.log -hosts=`scontrol show hostnames $1` -for host in $hosts -do - openstack server delete $host -done +#!/opt/slurm-tools/bin/python3 +""" Delete openstack instances """ + +import sys, subprocess, logging, logging.handlers +import openstack +import pprint + +# configure logging to syslog - by default only "info" and above categories appear +logger = logging.getLogger("syslogger") +logger.setLevel(logging.DEBUG) +handler = logging.handlers.SysLogHandler("/dev/log") +logger.addHandler(handler) + +def expand_nodes(hostlist_expr): + scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) + return scontrol.stdout.strip().split('\n') + +def delete_server(conn, name): + server = conn.compute.find_server(name) + conn.compute.delete_server(server) + +def suspend(): + hostlist_expr = sys.argv[1] + logger.info(f"Slurmctld invoked suspend {hostlist_expr}") + remove_nodes = expand_nodes(hostlist_expr) + + conn = openstack.connection.from_config() + logger.info(f"Got openstack connection {conn}") + + for node in remove_nodes: + logger.info(f"deleting node {node}") + delete_server(conn, node) + +if __name__ == "__main__": + sys.exit(suspend()) diff --git a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml index 5216ebbc8..a87c5afe9 100644 --- a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml +++ b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml @@ -1 +1,6 @@ -openhpc_autoscale_clouds: /home/centos/steveb-clouds.yaml \ No newline at end of file +openhpc_autoscale_clouds: 
/home/centos/steveb-clouds.yaml +# TODO: change below to be defined somewhere else +openhpc_autoscale_image: CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64 # TODO change to built image +openhpc_autoscale_network: stackhpc +openhpc_autoscale_flavor: chipolata +openhpc_autoscale_keypair: steveb-local From 99fe7adfa24a5a8b5282147770bf030a49e9e3f7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 08:59:48 +0000 Subject: [PATCH 007/105] note issue with adhoc slurm restart for combined headnode --- ansible/adhoc/restart-slurm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/adhoc/restart-slurm.yml b/ansible/adhoc/restart-slurm.yml index 41b9dcb50..cf523ddee 100644 --- a/ansible/adhoc/restart-slurm.yml +++ b/ansible/adhoc/restart-slurm.yml @@ -20,7 +20,7 @@ name: slurmctld state: restarted -- hosts: compute,login +- hosts: compute,login # FIXME: doesn't work if using `login` as combined slurmctld become: yes gather_facts: no tasks: From a956a549bb61505986e6fd2a1fc810bdca1c5a00 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 09:06:31 +0000 Subject: [PATCH 008/105] fix openhpc variables for autoscale --- .../sausage-autoscale/inventory/group_vars/openhpc.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml index 9f69b0f9f..7a9795419 100644 --- a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml +++ b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml @@ -16,11 +16,12 @@ openhpc_slurm_conf: # BELOW FOR TESTING ONLY, NOT PRODUCTION VALUES! SuspendTime: 120 SuspendTimeout: 300 - SuspendProgram: /opt/slurm-tools/bin/suspend.sh + SuspendProgram: /opt/slurm-tools/bin/suspend ResumeProgram: /opt/slurm-tools/bin/resume ResumeTimeout: 300 SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns,power_save_interval=10,power_save_min_interval=0 - NOTES: + # FIXME: need to set TreeWidth to >= number of nodes (default: 50) + # NOTES: # enable_configless: required (as set in template) # cloud_dns: requires working DNS # power_save_*interval: options are defaults but should enable changes From 4ea81c5fc015a899b9c0cf91df85c57a9b50656c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 09:06:47 +0000 Subject: [PATCH 009/105] set new image ID --- .../sausage-autoscale/inventory/group_vars/autoscale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml index a87c5afe9..275b938e7 100644 --- a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml +++ b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml @@ -1,6 +1,6 @@ openhpc_autoscale_clouds: /home/centos/steveb-clouds.yaml # TODO: change below to be defined somewhere else -openhpc_autoscale_image: CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64 # TODO change to built image +openhpc_autoscale_image: 1c3025f4-8384-4f3f-946e-8ce3b8e32292 openhpc_autoscale_network: stackhpc openhpc_autoscale_flavor: chipolata openhpc_autoscale_keypair: steveb-local From 354c67a36886ef605dd70aa3a18e57ee6dca085a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 09:07:19 +0000 Subject: [PATCH 010/105] set autoscale branch for openhpc role requirements --- requirements.yml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/requirements.yml b/requirements.yml index c945cb931..90f280107 100644 --- a/requirements.yml +++ b/requirements.yml @@ -2,6 +2,7 @@ roles: - src: stackhpc.nfs - src: stackhpc.openhpc + version: feature/autoscale - src: cloudalchemy.node_exporter - src: cloudalchemy.blackbox-exporter - src: cloudalchemy.prometheus From c74a271c11b2587bda26a8f3baabb45060517021 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 16:17:59 +0000 Subject: [PATCH 011/105] fix /etc/openstack for autoscale --- ansible/autoscale.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ansible/autoscale.yml b/ansible/autoscale.yml index d9dc9b057..2d5fba68a 100644 --- a/ansible/autoscale.yml +++ b/ansible/autoscale.yml @@ -1,3 +1,10 @@ +- name: Ensure /etc/openstack/ exists and is readable by slurm # TODO: think this clashes with rebuild? + file: + path: /etc/openstack/ + state: directory + owner: slurm + group: slurm + mode: u=rx - name: Copy out clouds.yaml copy: src: "{{ openhpc_autoscale_clouds }}" @@ -5,13 +12,6 @@ owner: slurm group: slurm mode: '0400' -- name: Ensure /etc/openstack/ is readable by slurm # TODO: think this clashes with rebuild? - file: - path: /etc/openstack/ - state: directory - owner: slurm - group: slurm - mode: u=rx - name: Setup slurm tools include_role: name: stackhpc.slurm_openstack_tools.pytools From 73eed3962b5d3ecf590aee357a7f6a1b210ad4a4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 16:18:40 +0000 Subject: [PATCH 012/105] remove SlurmctldParameters unsupported in slurm 20.02.5 --- environments/sausage-autoscale/inventory/group_vars/openhpc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml index 7a9795419..5dc8bedfe 100644 --- a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml +++ b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml @@ -19,7 +19,7 @@ openhpc_slurm_conf: SuspendProgram: /opt/slurm-tools/bin/suspend ResumeProgram: /opt/slurm-tools/bin/resume ResumeTimeout: 300 - SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns,power_save_interval=10,power_save_min_interval=0 + SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns # FIXME: need to set TreeWidth to >= number of nodes (default: 50) # NOTES: # enable_configless: required (as set in template) From 967d1077e9ffebc6a281716c117854a7390cec32 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 16:19:48 +0000 Subject: [PATCH 013/105] use openhpc_munge_key parameter --- environments/common/inventory/group_vars/all/openhpc.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index f1bae139b..b6d8abacf 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -15,7 +15,6 @@ openhpc_slurmdbd_mysql_username: slurm openhpc_slurm_control_host: "{{ groups['control'] | first }}" openhpc_slurm_partitions: - name: "compute" -<<<<<<< HEAD # cloud_nodes: 2 openhpc_default_packages: - slurm-libpmi-ohpc # to allow intel mpi to work properly @@ -23,10 +22,5 @@ openhpc_default_packages: openhpc_extra_packages: [] openhpc_packages: "{{ openhpc_default_packages + openhpc_extra_packages }}" openhpc_munge_key: "{{ secrets_openhpc_mungekey | b64decode }}" -======= 
-openhpc_packages: - - slurm-libpmi-ohpc -slurm_munge_key: "{{ secrets_openhpc_mungekey }}" ->>>>>>> b8d9eba... move cloud_nodes config to right environment openhpc_slurm_configless: true openhpc_login_only_nodes: login \ No newline at end of file From 94de0995710bb82d469f2bf619f6f409ccb4832c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 18:36:30 +0000 Subject: [PATCH 014/105] don't cache node ips in slurm --- environments/sausage-autoscale/inventory/group_vars/openhpc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml index 5dc8bedfe..266f1fe65 100644 --- a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml +++ b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml @@ -20,6 +20,7 @@ openhpc_slurm_conf: ResumeProgram: /opt/slurm-tools/bin/resume ResumeTimeout: 300 SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns + CommunicationParameters: NoAddrCache # FIXME: need to set TreeWidth to >= number of nodes (default: 50) # NOTES: # enable_configless: required (as set in template) From 99793ad8214e799bb8029c217c42f7c24fd907b1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 18:38:02 +0000 Subject: [PATCH 015/105] tune slurm debug info for powersave only --- .../sausage-autoscale/inventory/group_vars/openhpc.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml index 266f1fe65..c7415bb45 100644 --- a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml +++ b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml @@ -10,7 +10,8 @@ openhpc_packages: openhpc_slurm_conf: SlurmctldDebug: debug5 SlurmctldLogFile: /var/log/slurmctld.log - SlurmdDebug: debug5 + # SlurmdDebug: debug5 + DebugFlags: PowerSave SlurmdLogFile: /var/log/slurmd.log SuspendExcNodes: "{{ groups['compute'] | join(',') }}" # i.e. all tf-defined nodes in the partition # BELOW FOR TESTING ONLY, NOT PRODUCTION VALUES! 
From 1a3fd48c012057b2c3f7b1bca9d6c586245cff16 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 6 Apr 2021 12:40:19 +0000 Subject: [PATCH 016/105] use default security groups --- .../sausage-autoscale/terraform/main.tf | 49 ------------------- 1 file changed, 49 deletions(-) diff --git a/environments/sausage-autoscale/terraform/main.tf b/environments/sausage-autoscale/terraform/main.tf index 1523eeae5..fb9931dec 100644 --- a/environments/sausage-autoscale/terraform/main.tf +++ b/environments/sausage-autoscale/terraform/main.tf @@ -43,53 +43,6 @@ variable "compute_image" { type = string } -resource "openstack_networking_secgroup_v2" "secgroup_slurm_login" { - name = "secgroup_slurm_login" - description = "Rules for the slurm login node" - # Fully manage with terraform - delete_default_rules = true -} - -resource "openstack_networking_secgroup_v2" "secgroup_slurm_compute" { - name = "secgroup_slurm_compute" - description = "Rules for the slurm compute node" - # Fully manage with terraform - delete_default_rules = true -} - -resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_egress_v4" { - direction = "egress" - ethertype = "IPv4" - security_group_id = openstack_networking_secgroup_v2.secgroup_slurm_login.id -} - -resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_ingress_tcp_v4" { - direction = "ingress" - ethertype = "IPv4" - # NOTE: You will want to lock down the ports in a production environment. This will require - # setting of static ports for the NFS server see: - # https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/6/html/storage_administration_guide/s2-nfs-nfs-firewall-config - port_range_min = 1 - protocol = "tcp" - port_range_max = 65535 - security_group_id = openstack_networking_secgroup_v2.secgroup_slurm_login.id -} - -resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_compute_rule_egress_v4" { - direction = "egress" - ethertype = "IPv4" - security_group_id = openstack_networking_secgroup_v2.secgroup_slurm_compute.id -} - -resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_compute_rule_ingress_tcp_v4" { - direction = "ingress" - ethertype = "IPv4" - port_range_min = 1 - protocol = "tcp" - port_range_max = 65535 - security_group_id = openstack_networking_secgroup_v2.secgroup_slurm_compute.id -} - resource "openstack_compute_instance_v2" "login" { name = "${var.cluster_name}-login-0" @@ -99,7 +52,6 @@ resource "openstack_compute_instance_v2" "login" { network { name = var.network } - security_groups = [openstack_networking_secgroup_v2.secgroup_slurm_login.name] } @@ -115,7 +67,6 @@ resource "openstack_compute_instance_v2" "compute" { network { name = var.network } - security_groups = [openstack_networking_secgroup_v2.secgroup_slurm_compute.name] } # TODO: needs fixing for case where creation partially fails resulting in "compute.network is empty list of object" From b9921610d71adea7911263d06895f291ba89231b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 6 Apr 2021 12:40:54 +0000 Subject: [PATCH 017/105] remove ssh proxying from inventory --- environments/sausage-autoscale/terraform/inventory.tpl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/environments/sausage-autoscale/terraform/inventory.tpl b/environments/sausage-autoscale/terraform/inventory.tpl index 361a359ec..965f1f330 100644 --- a/environments/sausage-autoscale/terraform/inventory.tpl +++ b/environments/sausage-autoscale/terraform/inventory.tpl @@ -1,6 +1,5 @@ [all:vars] ansible_user=centos 
-ssh_proxy=${login.network[0].fixed_ip_v4} openhpc_cluster_name=${cluster_name} [${cluster_name}_login] @@ -11,13 +10,9 @@ ${login.name} ansible_host=${login.network[0].fixed_ip_v4} server_networks='${js ${compute.name} ansible_host=${compute.network[0].fixed_ip_v4} server_networks='${jsonencode({for net in compute.network: net.name => [ net.fixed_ip_v4 ] })}' %{ endfor ~} -[${cluster_name}_compute:vars] -ansible_ssh_common_args='-o ProxyCommand="ssh centos@${login.network[0].fixed_ip_v4} -W %h:%p"' - [cluster_login:children] ${cluster_name}_login -# NOTE: This is hardcoded in the tests role [cluster_compute:children] ${cluster_name}_compute From 0ebba2033ea18d5426562ddc225b2811d5127596 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 6 Apr 2021 12:41:46 +0000 Subject: [PATCH 018/105] add helloworld MPI program setup --- environments/sausage-autoscale/hooks/post.yml | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 environments/sausage-autoscale/hooks/post.yml diff --git a/environments/sausage-autoscale/hooks/post.yml b/environments/sausage-autoscale/hooks/post.yml new file mode 100644 index 000000000..fa23fb4a6 --- /dev/null +++ b/environments/sausage-autoscale/hooks/post.yml @@ -0,0 +1,54 @@ +- hosts: login + gather_facts: false + tasks: + - name: make helloworld directory + file: + path: /mnt/nfs/helloworld + state: directory + owner: centos + group: centos + become: yes + + - name: make helloworld source + copy: + dest: /mnt/nfs/helloworld/helloworld.c + content: | + #include + #include + + int main(int argc, char** argv) { + // Initialize the MPI environment + MPI_Init(NULL, NULL); + + // Get the number of processes + int world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + // Get the rank of the process + int world_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + // Get the name of the processor + char processor_name[MPI_MAX_PROCESSOR_NAME]; + int name_len; + MPI_Get_processor_name(processor_name, &name_len); + // Print off a hello world message + printf("Hello world from processor %s, rank %d out of %d processors\n", + processor_name, world_rank, world_size); + // Finalize the MPI environment. 
+ MPI_Finalize(); + } + + - name: compile helloworld + shell: + cmd: mpicc -o helloworld helloworld.c + chdir: /mnt/nfs/helloworld/ + + - name: make helloworld sbatch script + copy: + dest: /mnt/nfs/helloworld/helloworld.sh + content: | + #!/bin/bash + #SBATCH --ntasks-per-node=1 + #SBATCH --time=0:10:0 + #SBATCH --exclusive + export SLURM_MPI_TYPE=pmix_v3 + srun helloworld From 79b0516d770ccb134a8386227c48414a323a4da3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 6 Apr 2021 12:42:31 +0000 Subject: [PATCH 019/105] specify NFS server by hostname not IP --- environments/sausage-autoscale/inventory/group_vars/nfs.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 environments/sausage-autoscale/inventory/group_vars/nfs.yml diff --git a/environments/sausage-autoscale/inventory/group_vars/nfs.yml b/environments/sausage-autoscale/inventory/group_vars/nfs.yml new file mode 100644 index 000000000..68b31e8b6 --- /dev/null +++ b/environments/sausage-autoscale/inventory/group_vars/nfs.yml @@ -0,0 +1 @@ +nfs_server_default: "{{ groups['control'] | first }}" \ No newline at end of file From 9f9430a86a61945eaa89b4b43368275aef359096 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 6 Apr 2021 12:44:51 +0000 Subject: [PATCH 020/105] update to latest built image --- .../sausage-autoscale/inventory/group_vars/autoscale.yml | 2 +- environments/sausage-autoscale/terraform/terraform.tfvars | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml index 275b938e7..88f8f57d6 100644 --- a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml +++ b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml @@ -1,6 +1,6 @@ openhpc_autoscale_clouds: /home/centos/steveb-clouds.yaml # TODO: change below to be defined somewhere else -openhpc_autoscale_image: 1c3025f4-8384-4f3f-946e-8ce3b8e32292 +openhpc_autoscale_image: ohpc-compute-210406-1108.qcow2 openhpc_autoscale_network: stackhpc openhpc_autoscale_flavor: chipolata openhpc_autoscale_keypair: steveb-local diff --git a/environments/sausage-autoscale/terraform/terraform.tfvars b/environments/sausage-autoscale/terraform/terraform.tfvars index 04bfb7ade..3e4a4d92e 100644 --- a/environments/sausage-autoscale/terraform/terraform.tfvars +++ b/environments/sausage-autoscale/terraform/terraform.tfvars @@ -6,5 +6,5 @@ network = "stackhpc" login_image = "CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64" login_flavor = "chipolata" -compute_image = "CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64" +compute_image = "ohpc-compute-210406-1108.qcow2" compute_flavor = "chipolata" From 95a8ed27a076c4b589ea67f4d8a4c41325b45f51 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 6 Apr 2021 13:03:26 +0000 Subject: [PATCH 021/105] remove inventory hosts file from git --- .../sausage-autoscale/inventory/.gitignore | 1 + .../sausage-autoscale/inventory/hosts | 31 ------------------- 2 files changed, 1 insertion(+), 31 deletions(-) create mode 100644 environments/sausage-autoscale/inventory/.gitignore delete mode 100755 environments/sausage-autoscale/inventory/hosts diff --git a/environments/sausage-autoscale/inventory/.gitignore b/environments/sausage-autoscale/inventory/.gitignore new file mode 100644 index 000000000..9b0b900ab --- /dev/null +++ b/environments/sausage-autoscale/inventory/.gitignore @@ -0,0 +1 @@ +hosts \ No newline at end of file diff --git 
a/environments/sausage-autoscale/inventory/hosts b/environments/sausage-autoscale/inventory/hosts deleted file mode 100755 index 203a41376..000000000 --- a/environments/sausage-autoscale/inventory/hosts +++ /dev/null @@ -1,31 +0,0 @@ -[all:vars] -ansible_user=centos -ssh_proxy=10.0.3.100 -openhpc_cluster_name=sbscale - -[sbscale_login] -sbscale-login-0 ansible_host=10.0.3.100 server_networks='{"stackhpc":["10.0.3.100"]}' - -[sbscale_compute] -sbscale-compute-0 ansible_host=10.0.3.107 server_networks='{"stackhpc":["10.0.3.107"]}' -sbscale-compute-1 ansible_host=10.0.3.71 server_networks='{"stackhpc":["10.0.3.71"]}' - -[sbscale_compute:vars] -ansible_ssh_common_args='-o ProxyCommand="ssh centos@10.0.3.100 -W %h:%p"' - -[cluster_login:children] -sbscale_login - -# NOTE: This is hardcoded in the tests role -[cluster_compute:children] -sbscale_compute - -[login:children] -cluster_login - -[compute:children] -cluster_compute - -[cluster:children] -login -compute \ No newline at end of file From 510a1bf85a6c26967a49ae5f4b6c5a1a6237d6bc Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 8 Apr 2021 12:36:02 +0000 Subject: [PATCH 022/105] show cloud nodes even when powered off --- environments/sausage-autoscale/inventory/group_vars/openhpc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml index c7415bb45..3ac9948f8 100644 --- a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml +++ b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml @@ -22,6 +22,7 @@ openhpc_slurm_conf: ResumeTimeout: 300 SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns CommunicationParameters: NoAddrCache + PrivateData: cloud # FIXME: need to set TreeWidth to >= number of nodes (default: 50) # NOTES: # enable_configless: required (as set in template) From 9392c39f2512f5f6157de32e1a47ec886a934fda Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 8 Apr 2021 13:58:05 +0000 Subject: [PATCH 023/105] revert compute image to vanilla cento8.2 --- environments/sausage-autoscale/terraform/terraform.tfvars | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/sausage-autoscale/terraform/terraform.tfvars b/environments/sausage-autoscale/terraform/terraform.tfvars index 3e4a4d92e..04bfb7ade 100644 --- a/environments/sausage-autoscale/terraform/terraform.tfvars +++ b/environments/sausage-autoscale/terraform/terraform.tfvars @@ -6,5 +6,5 @@ network = "stackhpc" login_image = "CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64" login_flavor = "chipolata" -compute_image = "ohpc-compute-210406-1108.qcow2" +compute_image = "CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64" compute_flavor = "chipolata" From 9467973554645dfc39bb4251a7c3db88dde97dce Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 8 Sep 2021 13:55:11 +0000 Subject: [PATCH 024/105] remove sausagecloud environment --- environments/sausage-autoscale/README.md | 176 ------------------ environments/sausage-autoscale/activate | 23 --- environments/sausage-autoscale/ansible.cfg | 14 -- environments/sausage-autoscale/hooks/.gitkeep | 0 environments/sausage-autoscale/hooks/post.yml | 54 ------ .../sausage-autoscale/inventory/.gitignore | 1 - .../inventory/group_vars/all/.gitkeep | 0 .../inventory/group_vars/all/secrets.yml | 43 ----- .../inventory/group_vars/autoscale.yml | 6 - .../inventory/group_vars/nfs.yml | 1 - .../inventory/group_vars/openhpc.yml | 31 --- 
.../inventory/group_vars/rebuild.yml | 1 - .../sausage-autoscale/inventory/groups | 17 -- .../terraform/.terraform.lock.hcl | 39 ---- .../sausage-autoscale/terraform/inventory.tpl | 27 --- .../sausage-autoscale/terraform/main.tf | 82 -------- .../terraform/terraform.tfvars | 10 - 17 files changed, 525 deletions(-) delete mode 100644 environments/sausage-autoscale/README.md delete mode 100644 environments/sausage-autoscale/activate delete mode 100644 environments/sausage-autoscale/ansible.cfg delete mode 100644 environments/sausage-autoscale/hooks/.gitkeep delete mode 100644 environments/sausage-autoscale/hooks/post.yml delete mode 100644 environments/sausage-autoscale/inventory/.gitignore delete mode 100644 environments/sausage-autoscale/inventory/group_vars/all/.gitkeep delete mode 100644 environments/sausage-autoscale/inventory/group_vars/all/secrets.yml delete mode 100644 environments/sausage-autoscale/inventory/group_vars/autoscale.yml delete mode 100644 environments/sausage-autoscale/inventory/group_vars/nfs.yml delete mode 100644 environments/sausage-autoscale/inventory/group_vars/openhpc.yml delete mode 100644 environments/sausage-autoscale/inventory/group_vars/rebuild.yml delete mode 100644 environments/sausage-autoscale/inventory/groups delete mode 100644 environments/sausage-autoscale/terraform/.terraform.lock.hcl delete mode 100644 environments/sausage-autoscale/terraform/inventory.tpl delete mode 100644 environments/sausage-autoscale/terraform/main.tf delete mode 100644 environments/sausage-autoscale/terraform/terraform.tfvars diff --git a/environments/sausage-autoscale/README.md b/environments/sausage-autoscale/README.md deleted file mode 100644 index 69f25a09e..000000000 --- a/environments/sausage-autoscale/README.md +++ /dev/null @@ -1,176 +0,0 @@ -# Sausage-Autoscale cluster - -Dev env for autoscaling on sausagecloud - -# Directory structure - -## terraform - -Contains terraform configuration to deploy infrastructure. - -## inventory - -Ansible inventory for configuring the infrastructure. - -# Setup - -In the repo root, run: - - python3 -m venv venv # TODO: do we need system-site-packages? - . venv/bin/activate - pip install -U upgrade pip - pip install requirements.txt - ansible-galaxy install -r requirements.yml -p ansible/roles - ansible-galaxy collection install -r requirements.yml -p ansible/collections # don't worry about collections path warning - -# Activating the environment - -There is a small environment file that you must `source` which defines environment -variables that reference the configuration path. This is so that we can locate -resources relative the environment directory. - - . environments/sausage-autoscale/activate - -The pattern we use is that all resources referenced in the inventory -are located in the environment directory containing the inventory that -references them. - -# Common configuration - -Configuarion is shared by specifiying multiple inventories. We reference the `common` -inventory from `ansible.cfg`, including it before the environment specific -inventory, located at `./inventory`. - -Inventories specified later in the list can override values set in the inventories -that appear earlier. This allows you to override values set by the `common` inventory. - -Any variables that would be identical for all environments should be defined in the `common` inventory. - -# Passwords - -Prior to running any other playbooks, you need to define a set of passwords. 
You can -use the `generate-passwords.yml` playbook to automate this process: - -``` -cd -ansible-playbook ansible/adhoc/generate-passwords.yml # can actually be run from anywhere once environment activated -``` - -This will output a set of passwords `inventory/group_vars/all/secrets.yml`. -Placing them in the inventory means that they will be defined for all playbooks. - -It is recommended to encrypt the contents of this file prior to commiting to git: - -``` -ansible-vault encrypt inventory/group_vars/all/secrets.yml -``` - -You will then need to provide a password when running the playbooks e.g: - -``` -ansible-playbook ../ansible/site.yml --tags grafana --ask-vault-password -``` - -See the [Ansible vault documentation](https://docs.ansible.com/ansible/latest/user_guide/vault.html) for more details. - - -# Deploy nodes with Terraform - -- Modify the keypair in `main.tf` and ensure the required Centos images are available on OpenStack. -- Activate the virtualenv and create the instances: - - . venv/bin/activate - cd environments/sausage-autoscale/ - terraform apply - -This creates an ansible inventory file `./inventory`. - -Note that this terraform deploys instances onto an existing network - for production use you probably want to create a network for the cluster. - -# Create and configure cluster with Ansible - -Now run one or more playbooks using: - - cd - ansible-playbook ansible/site.yml - -This provides: -- grafana at `http://:3000` - username `grafana`, password as set above -- prometheus at `http://:9090` - -NB: if grafana's yum repos are down you will see `Errors during downloading metadata for repository 'grafana' ...`. You can work around this using: - - ssh centos@ - sudo rm -rf /etc/yum.repos.d/grafana.repo - wget https://dl.grafana.com/oss/release/grafana-7.3.1-1.x86_64.rpm - sudo yum install grafana-7.3.1-1.x86_64.rpm - exit - ansible-playbook -i inventory monitoring.yml -e grafana_password= --skip-tags grafana_install - -# rebuild.yml - -# FIXME: outdated - -Enable the compute nodes of a Slurm-based OpenHPC cluster on Openstack to be reimaged from Slurm. - -For full details including the Slurm commmands to use see the [role's README](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools/blob/main/roles/rebuild/README.md) - -Ensure you have `~/.config/openstack/clouds.yaml` defining authentication for a a single Openstack cloud (see above README to change location). - -Then run: - - ansible-playbook -i inventory rebuild.yml - -Note this does not rebuild the nodes, only deploys the tools to do so. - -# test.yml - -This runs MPI-based tests on the cluster: -- `pingpong`: Runs Intel MPI Benchmark's IMB-MPI1 pingpong between a pair of (scheduler-selected) nodes. Reports zero-size message latency and maximum bandwidth. -- `pingmatrix`: Runs a similar pingpong test but between all pairs of nodes. Reports zero-size message latency & maximum bandwidth. -- `hpl-solo`: Runs HPL **separately** on all nodes, using 80% of memory, reporting Gflops on each node. - -These names can be used as tags to run only a subset of tests. For full details see the [role's README](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools/blob/main/roles/test/README.md). 
- -Note these are intended as post-deployment tests for a cluster to which you have root access - they are **not** intended for use on a system running production jobs: -- Test directories are created within `openhpc_tests_rootdir` (here `/mnt/nfs/ohcp-tests`) which must be on a shared filesystem (read/write from login/control and compute nodes) -- Generally, packages are only installed on the control/login node, and `/opt` is exported via NFS to the compute nodes. -- The exception is the `slurm-libpmi-ohpc` package (required for `srun` with Intel MPI) which is installed on all nodes. - -To achieve best performance for HPL set `openhpc_tests_hpl_NB` in [test.yml](test.yml) to the appropriate the HPL blocksize 'NB' for the compute node processor - for Intel CPUs see [here](https://software.intel.com/content/www/us/en/develop/documentation/mkl-linux-developer-guide/top/intel-math-kernel-library-benchmarks/intel-distribution-for-linpack-benchmark/configuring-parameters.html). - -Then run: - - ansible-playbook ../ansible/adhoc/test.yml - -Results will be reported in the ansible stdout - the pingmatrix test also writes an html results file onto the ansible host. - -Note that you can still use the `test.yml` playbook even if the terraform/ansible in this repo wasn't used to deploy the cluster - as long as it's running OpenHPC v2. Simply create an appropriate `inventory` file, e.g: - - [all:vars] - ansible_user=centos - - [cluster:children] - cluster_login - cluster_compute - - [cluster_login] - slurm-control - - [cluster_compute] - cpu-h21a5-u3-svn2 - cpu-h21a5-u3-svn4 - ... - -And run the `test.yml` playbook as described above. If you want to run tests only on a group from this inventory, rather than an entire partition, you can -use ``--limit`` - -Then running the tests passing this file as extra_vars: - - ansible-playbook ../ansible/test.yml --limit group-in-inventory - -# Destroying the cluster - -When finished, run: - - terraform destroy --auto-approve diff --git a/environments/sausage-autoscale/activate b/environments/sausage-autoscale/activate deleted file mode 100644 index e74031095..000000000 --- a/environments/sausage-autoscale/activate +++ /dev/null @@ -1,23 +0,0 @@ -export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) -echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" - -APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) -export PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" - -export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") -echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" - -export TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") -echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root" - -export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") -echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root" - -export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT") -echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" - -if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then - export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg -fi - - diff --git a/environments/sausage-autoscale/ansible.cfg b/environments/sausage-autoscale/ansible.cfg deleted file mode 100644 index c243e9958..000000000 --- a/environments/sausage-autoscale/ansible.cfg +++ /dev/null @@ -1,14 +0,0 @@ -[defaults] -any_errors_fatal = True -stdout_callback = debug -stderr_callback = debug -gathering = smart -forks = 30 
-host_key_checking = False -inventory = ../common/inventory,inventory -collections_path = ../../ansible/collections -roles_path = ../../ansible/roles - -[ssh_connection] -ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null -pipelining = True diff --git a/environments/sausage-autoscale/hooks/.gitkeep b/environments/sausage-autoscale/hooks/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/environments/sausage-autoscale/hooks/post.yml b/environments/sausage-autoscale/hooks/post.yml deleted file mode 100644 index fa23fb4a6..000000000 --- a/environments/sausage-autoscale/hooks/post.yml +++ /dev/null @@ -1,54 +0,0 @@ -- hosts: login - gather_facts: false - tasks: - - name: make helloworld directory - file: - path: /mnt/nfs/helloworld - state: directory - owner: centos - group: centos - become: yes - - - name: make helloworld source - copy: - dest: /mnt/nfs/helloworld/helloworld.c - content: | - #include - #include - - int main(int argc, char** argv) { - // Initialize the MPI environment - MPI_Init(NULL, NULL); - - // Get the number of processes - int world_size; - MPI_Comm_size(MPI_COMM_WORLD, &world_size); - // Get the rank of the process - int world_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); - // Get the name of the processor - char processor_name[MPI_MAX_PROCESSOR_NAME]; - int name_len; - MPI_Get_processor_name(processor_name, &name_len); - // Print off a hello world message - printf("Hello world from processor %s, rank %d out of %d processors\n", - processor_name, world_rank, world_size); - // Finalize the MPI environment. - MPI_Finalize(); - } - - - name: compile helloworld - shell: - cmd: mpicc -o helloworld helloworld.c - chdir: /mnt/nfs/helloworld/ - - - name: make helloworld sbatch script - copy: - dest: /mnt/nfs/helloworld/helloworld.sh - content: | - #!/bin/bash - #SBATCH --ntasks-per-node=1 - #SBATCH --time=0:10:0 - #SBATCH --exclusive - export SLURM_MPI_TYPE=pmix_v3 - srun helloworld diff --git a/environments/sausage-autoscale/inventory/.gitignore b/environments/sausage-autoscale/inventory/.gitignore deleted file mode 100644 index 9b0b900ab..000000000 --- a/environments/sausage-autoscale/inventory/.gitignore +++ /dev/null @@ -1 +0,0 @@ -hosts \ No newline at end of file diff --git a/environments/sausage-autoscale/inventory/group_vars/all/.gitkeep b/environments/sausage-autoscale/inventory/group_vars/all/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/environments/sausage-autoscale/inventory/group_vars/all/secrets.yml b/environments/sausage-autoscale/inventory/group_vars/all/secrets.yml deleted file mode 100644 index 29f86e2dd..000000000 --- a/environments/sausage-autoscale/inventory/group_vars/all/secrets.yml +++ /dev/null @@ -1,43 +0,0 @@ ---- -# Ansible managed -secrets_openhpc_elasticsearch_admin_password: pC2:.3c1QShckz1BxB5u -secrets_openhpc_elasticsearch_kibana_password: TVevBQ2JXTyoSW.bT_ba -secrets_openhpc_grafana_admin_password: QMiyymbFD.YC7M.39mCa -secrets_openhpc_mungekey: - content: 'YCbG2tA9UWIezb3xRRqK2vNzd6tOnGMs0kpPO+quHuqGXLsXagFjXq8Kqnd1/UIqoRW/TH3AfRNF - - yhuvemkRd3TlZLrvBZzMcQw0jAP2sI/+4hPpeIEk1kcQFVgE9A1HppLc0CxI7SskDPmY3vGwnuo7 - - 4K19jYxgPkIb9FUKCNetKgHR7L78LsbZxWUYkmvO6rCDUeLgMchFkjoi5Lwr+i1JJAxoGhT0yLmW - - D0hEenYePgsefzopwEnKEHByhnx0ROlJ86S58bh+rOnAqBWWJ8Im71NeJS58Moyrh9VLOkmRUCIj - - e0bhEKd7+/a5I4GN6KIo1oXRT74TxVHkwypSqFgAbVF5KMSsuY+5eG4JLcpTOGZYQpbAY9ICtnjM - - 
U6T6YhVXYvurVcb7N2ybub8veIwWeS98Yr2C9ZwsBzvpA2Fk3wxCFjo6vxe47U2nsezIUAUxVH7U - - V6jNMVoT4GZMQcKRsTp0zoAVAund6jMjsQ6h6Ly8EYyiKz6itTq4L5OqotZ0tUCX6xnVxtaD4LZb - - tQfZbcxPdd1C7NtTfImUsxHDp2CBIu/VDZWen/iafGaPeI83XVkC8Kk1QwhjBcRnlJEw2cK4TdBW - - 6Soy8CnNZMd92iqlqIZs7iZHu9FLyLiLCrkjaDnxM0UH0RP9CPTihiE47w874HVOQioacNX6U3Dz - - 3I0vxUAd/AF6ZrmBtOh3EekbxPtFNY7Gw3qPCbbJM6v5+XFjz//Lj1GFYzGK1DA7jhekzk5vtOWe - - k2vZcyqPYOIxFlqtm3OGt+f6V9G/xvYvRofE1EbO9qU1vqVRbW8Z7dqOR4AwomW2UlhH9G/ijLZZ - - EKqOWiCVONfMEe+Cndi/WH80R/nASx8hMJrTp0VOPtNjN+LWb/pPE/cSY9hbuA2EvqJB4gFQzmqz - - sFpQAqPVS8/+vesiKKVcnxUeMoRBx8g9CmdFTIvz5fU6M9lh7DjYoKcKx7eKtQhCAktyeI21o+Tn - - 2gyALzcxX29VCJy/8n/8qC26T9wLjN2URpO95yT2+k+Uv96R6Uj4zK4CD2c7yXm/0CmyrUWf3EPp - - VeaaWhy+KKR7T923TCEETiwSlwOynwb4lHLPmE17t8XBqYAqWGL2e8uDuLYhPAf+U5Bwt+LiXO5j - - hjg6szwpdSirDl1vpkqTDaOGdzVcTb+j8NfHDCdVOJbWu2I8sAHkjDRl+faagwxeMIGpTjoRi225 - - mj9rJdCbZxCSrwbE1r1koHrJZ+XN4AG4OrmTXdXMSLhpJuptyeNsRmvWxBe665tAxktRZ/kQUY3c - - W1zq03n3wtBkilL1wh/Fata4XrN5UZhpVSwT+7Z3gPacJVt5UjedkqpW8br+Pxw4efQExeDH2g==' -secrets_openhpc_mysql_root_password: XuY4ATpIzRje.PhkXI-t -secrets_openhpc_mysql_slurm_password: bS7GCWoTtsf4cjLo70S5 diff --git a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml deleted file mode 100644 index 88f8f57d6..000000000 --- a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml +++ /dev/null @@ -1,6 +0,0 @@ -openhpc_autoscale_clouds: /home/centos/steveb-clouds.yaml -# TODO: change below to be defined somewhere else -openhpc_autoscale_image: ohpc-compute-210406-1108.qcow2 -openhpc_autoscale_network: stackhpc -openhpc_autoscale_flavor: chipolata -openhpc_autoscale_keypair: steveb-local diff --git a/environments/sausage-autoscale/inventory/group_vars/nfs.yml b/environments/sausage-autoscale/inventory/group_vars/nfs.yml deleted file mode 100644 index 68b31e8b6..000000000 --- a/environments/sausage-autoscale/inventory/group_vars/nfs.yml +++ /dev/null @@ -1 +0,0 @@ -nfs_server_default: "{{ groups['control'] | first }}" \ No newline at end of file diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml deleted file mode 100644 index 3ac9948f8..000000000 --- a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml +++ /dev/null @@ -1,31 +0,0 @@ -openhpc_login_only_nodes: '' -openhpc_slurm_partitions: - - name: "compute" - cloud_nodes: 2 -openhpc_packages: - - slurm-libpmi-ohpc - - wget - - lmod-defaults-gnu9-openmpi4-ohpc - - imb-gnu9-openmpi4-ohpc -openhpc_slurm_conf: - SlurmctldDebug: debug5 - SlurmctldLogFile: /var/log/slurmctld.log - # SlurmdDebug: debug5 - DebugFlags: PowerSave - SlurmdLogFile: /var/log/slurmd.log - SuspendExcNodes: "{{ groups['compute'] | join(',') }}" # i.e. all tf-defined nodes in the partition - # BELOW FOR TESTING ONLY, NOT PRODUCTION VALUES! 
- SuspendTime: 120 - SuspendTimeout: 300 - SuspendProgram: /opt/slurm-tools/bin/suspend - ResumeProgram: /opt/slurm-tools/bin/resume - ResumeTimeout: 300 - SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns - CommunicationParameters: NoAddrCache - PrivateData: cloud - # FIXME: need to set TreeWidth to >= number of nodes (default: 50) - # NOTES: - # enable_configless: required (as set in template) - # cloud_dns: requires working DNS - # power_save_*interval: options are defaults but should enable changes - # reboot_from_controller: should be really useful but actually we're already setup for rebuild on computes, so use that diff --git a/environments/sausage-autoscale/inventory/group_vars/rebuild.yml b/environments/sausage-autoscale/inventory/group_vars/rebuild.yml deleted file mode 100644 index b2eba881a..000000000 --- a/environments/sausage-autoscale/inventory/group_vars/rebuild.yml +++ /dev/null @@ -1 +0,0 @@ -openhpc_rebuild_clouds: ~/steveb-openrc.sh diff --git a/environments/sausage-autoscale/inventory/groups b/environments/sausage-autoscale/inventory/groups deleted file mode 100644 index 4cec8ab8f..000000000 --- a/environments/sausage-autoscale/inventory/groups +++ /dev/null @@ -1,17 +0,0 @@ -[control:children] -login - -[nfs:children] -cluster - -[openhpc:children] -cluster - -[mysql:children] -control - -[rebuild:children] -compute - -[autoscale:children] -login # actually controller diff --git a/environments/sausage-autoscale/terraform/.terraform.lock.hcl b/environments/sausage-autoscale/terraform/.terraform.lock.hcl deleted file mode 100644 index 8f9e2298d..000000000 --- a/environments/sausage-autoscale/terraform/.terraform.lock.hcl +++ /dev/null @@ -1,39 +0,0 @@ -# This file is maintained automatically by "terraform init". -# Manual edits may be lost in future updates. 
- -provider "registry.terraform.io/hashicorp/local" { - version = "2.1.0" - hashes = [ - "h1:EYZdckuGU3n6APs97nS2LxZm3dDtGqyM4qaIvsmac8o=", - "zh:0f1ec65101fa35050978d483d6e8916664b7556800348456ff3d09454ac1eae2", - "zh:36e42ac19f5d68467aacf07e6adcf83c7486f2e5b5f4339e9671f68525fc87ab", - "zh:6db9db2a1819e77b1642ec3b5e95042b202aee8151a0256d289f2e141bf3ceb3", - "zh:719dfd97bb9ddce99f7d741260b8ece2682b363735c764cac83303f02386075a", - "zh:7598bb86e0378fd97eaa04638c1a4c75f960f62f69d3662e6d80ffa5a89847fe", - "zh:ad0a188b52517fec9eca393f1e2c9daea362b33ae2eb38a857b6b09949a727c1", - "zh:c46846c8df66a13fee6eff7dc5d528a7f868ae0dcf92d79deaac73cc297ed20c", - "zh:dc1a20a2eec12095d04bf6da5321f535351a594a636912361db20eb2a707ccc4", - "zh:e57ab4771a9d999401f6badd8b018558357d3cbdf3d33cc0c4f83e818ca8e94b", - "zh:ebdcde208072b4b0f8d305ebf2bfdc62c926e0717599dcf8ec2fd8c5845031c3", - "zh:ef34c52b68933bedd0868a13ccfd59ff1c820f299760b3c02e008dc95e2ece91", - ] -} - -provider "registry.terraform.io/terraform-provider-openstack/openstack" { - version = "1.40.0" - hashes = [ - "h1:gBrsytNqUG1ZQPKys8KAvZkjesjimXb7vcrTmyFUTM0=", - "zh:278a878a256ec5447e1e64b5d9a691e3a1f7d5c247e536500c97c5b996bc2531", - "zh:5c7ae8cfe0831557c8c1988581f3fd0bdf182d15bcefbe645bb91564027e67d4", - "zh:944d75fc1e3d54df4c47e5d34007927abf4fa79e2107b05d14f11b52970a6164", - "zh:a50922d05185598a9264a25eff6f01ce7671c70a562a3ef93e9bb7a449e358b0", - "zh:adb87ad3782f1f7a5eaeedbcffa0e5559d2372502f9af91781aa13c11cf4b47b", - "zh:c0e4218259a37f16c10b4779009f0b0b5d467e4d347fc2aa3a212f1ee3a71d63", - "zh:c2eb4f40cbd78238500a3a84ba995060bfc50f770bd13732ae50b73687f3dce6", - "zh:ca8a38fe932972d0d7fdc51f84ae775648b7aff3c96b8ead085007e880ee987f", - "zh:ce4f703719d646507d6006085dc1114954c75710226df43078169b2b01993537", - "zh:e29542a492bbf55613d20b5f68ed4357cbc8bb09d61a1752d2976e5e1608879d", - "zh:e68d47b85b9da089f8f7102c23545331c15a9e6ea99875926d2ebf6e38bf2073", - "zh:fdb10cb345250d7c47e342def106bd10ef75493ef6edf15809e10e6367a0d9f6", - ] -} diff --git a/environments/sausage-autoscale/terraform/inventory.tpl b/environments/sausage-autoscale/terraform/inventory.tpl deleted file mode 100644 index 965f1f330..000000000 --- a/environments/sausage-autoscale/terraform/inventory.tpl +++ /dev/null @@ -1,27 +0,0 @@ -[all:vars] -ansible_user=centos -openhpc_cluster_name=${cluster_name} - -[${cluster_name}_login] -${login.name} ansible_host=${login.network[0].fixed_ip_v4} server_networks='${jsonencode({for net in login.network: net.name => [ net.fixed_ip_v4 ] })}' - -[${cluster_name}_compute] -%{ for compute in computes ~} -${compute.name} ansible_host=${compute.network[0].fixed_ip_v4} server_networks='${jsonencode({for net in compute.network: net.name => [ net.fixed_ip_v4 ] })}' -%{ endfor ~} - -[cluster_login:children] -${cluster_name}_login - -[cluster_compute:children] -${cluster_name}_compute - -[login:children] -cluster_login - -[compute:children] -cluster_compute - -[cluster:children] -login -compute \ No newline at end of file diff --git a/environments/sausage-autoscale/terraform/main.tf b/environments/sausage-autoscale/terraform/main.tf deleted file mode 100644 index fb9931dec..000000000 --- a/environments/sausage-autoscale/terraform/main.tf +++ /dev/null @@ -1,82 +0,0 @@ -terraform { - required_version = ">= 0.14" - required_providers { - openstack = { - source = "terraform-provider-openstack/openstack" - } - } -} - -variable "environment_root" { - type = string -} - -variable "compute_names" { - default = ["compute-0", "compute-1"] -} - -variable "cluster_name" { - 
default = "testohpc" -} - -variable "key_pair" { - type = string -} - -variable "network" { - type = string -} - -variable "login_flavor" { - type = string -} - -variable "login_image" { - type = string -} - -variable "compute_flavor" { - type = string -} - -variable "compute_image" { - type = string -} - -resource "openstack_compute_instance_v2" "login" { - - name = "${var.cluster_name}-login-0" - image_name = var.login_image - flavor_name = var.login_flavor - key_pair = var.key_pair - network { - name = var.network - } -} - - -resource "openstack_compute_instance_v2" "compute" { - - for_each = toset(var.compute_names) - - name = "${var.cluster_name}-${each.value}" - image_name = var.compute_image - flavor_name = var.compute_flavor - #flavor_name = "compute-A" - key_pair = var.key_pair - network { - name = var.network - } -} - -# TODO: needs fixing for case where creation partially fails resulting in "compute.network is empty list of object" -resource "local_file" "hosts" { - content = templatefile("${path.module}/inventory.tpl", - { - "cluster_name": var.cluster_name - "login": openstack_compute_instance_v2.login, - "computes": openstack_compute_instance_v2.compute, - }, - ) - filename = "${var.environment_root}/inventory/hosts" -} \ No newline at end of file diff --git a/environments/sausage-autoscale/terraform/terraform.tfvars b/environments/sausage-autoscale/terraform/terraform.tfvars deleted file mode 100644 index 04bfb7ade..000000000 --- a/environments/sausage-autoscale/terraform/terraform.tfvars +++ /dev/null @@ -1,10 +0,0 @@ -compute_names = ["compute-0", "compute-1"] -cluster_name = "sbscale" -key_pair = "steveb-local" -network = "stackhpc" - -login_image = "CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64" -login_flavor = "chipolata" - -compute_image = "CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64" -compute_flavor = "chipolata" From 000a4e773a0ffbdc0bf9bc6d1405c2b0bb4d6e2d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 8 Sep 2021 13:55:49 +0000 Subject: [PATCH 025/105] move autoscale into slurm --- ansible/autoscale.yml | 33 ------------------------ ansible/slurm.yml | 59 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 50 insertions(+), 42 deletions(-) delete mode 100644 ansible/autoscale.yml diff --git a/ansible/autoscale.yml b/ansible/autoscale.yml deleted file mode 100644 index 2d5fba68a..000000000 --- a/ansible/autoscale.yml +++ /dev/null @@ -1,33 +0,0 @@ -- name: Ensure /etc/openstack/ exists and is readable by slurm # TODO: think this clashes with rebuild? 
- file: - path: /etc/openstack/ - state: directory - owner: slurm - group: slurm - mode: u=rx -- name: Copy out clouds.yaml - copy: - src: "{{ openhpc_autoscale_clouds }}" - dest: /etc/openstack/clouds.yaml - owner: slurm - group: slurm - mode: '0400' -- name: Setup slurm tools - include_role: - name: stackhpc.slurm_openstack_tools.pytools -- name: Create SuspendProgram - template: - src: suspend.j2 - dest: /opt/slurm-tools/bin/suspend - owner: slurm - group: slurm - mode: u=rwx,go= - tags: suspend -- name: Create ResumeProgram - template: - src: resume.j2 - dest: /opt/slurm-tools/bin/resume - owner: slurm - group: slurm - mode: u=rwx,go= - tags: resume diff --git a/ansible/slurm.yml b/ansible/slurm.yml index f94145f81..6fa5c8a50 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -15,14 +15,14 @@ tags: - openhpc tasks: - - name: Add CentOS 8.3 Vault repo for OpenHPC hwloc dependency - # NB: REMOVE THIS once OpenHPC works on CentOS 8.4 - yum_repository: - name: vault - file: CentOS-Linux-Vault8.3 - description: CentOS 8.3 packages from Vault - baseurl: https://vault.centos.org/8.3.2011/BaseOS/$basearch/os/ - gpgkey: file:///etc/pki/rpm-gpg/RPM-GPG-KEY-centosofficial + # - name: Add CentOS 8.3 Vault repo for OpenHPC hwloc dependency + # # NB: REMOVE THIS once OpenHPC works on CentOS 8.4 + # yum_repository: + # name: vault + # file: CentOS-Linux-Vault8.3 + # description: CentOS 8.3 packages from Vault + # baseurl: https://vault.centos.org/8.3.2011/BaseOS/$basearch/os/ + # gpgkey: file:///etc/pki/rpm-gpg/RPM-GPG-KEY-centosofficial - import_role: name: stackhpc.openhpc @@ -41,7 +41,48 @@ tags: - autoscale tasks: - - import_tasks: autoscale.yml +- name: Configure autoscale + hosts: autoscale + become: yes + tags: + - autoscale + tasks: + - name: Create /etc/openstack + file: + path: /etc/openstack + state: directory + owner: root + group: root + mode: '0400' + - name: Copy out clouds.yaml + copy: + src: "{{ autoscale_clouds }}" + dest: /etc/openstack/clouds.yaml + owner: root + group: root + mode: '0400' + - name: Setup slurm tools (to get venv) + include_role: + name: stackhpc.slurm_openstack_tools.pytools + - name: Create SuspendProgram + template: + src: suspend.j2 + dest: /opt/slurm-tools/bin/suspend + owner: slurm + group: slurm + mode: u=rwx,go= + tags: suspend + - name: Create ResumeProgram + template: + src: resume.j2 + dest: /opt/slurm-tools/bin/resume + owner: slurm + group: slurm + mode: u=rwx,go= + tags: resume + - name: Reconfigure slurm + command: + cmd: scontrol reconfigure - name: Set locked memory limits on user-facing nodes hosts: From f6514e6f8940707486297e4146ea025c8ffa2f0f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 8 Sep 2021 13:57:08 +0000 Subject: [PATCH 026/105] allow for overriding slurm config in appliance --- environments/common/inventory/group_vars/all/openhpc.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index b6d8abacf..9c6ad2925 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -23,4 +23,8 @@ openhpc_extra_packages: [] openhpc_packages: "{{ openhpc_default_packages + openhpc_extra_packages }}" openhpc_munge_key: "{{ secrets_openhpc_mungekey | b64decode }}" openhpc_slurm_configless: true -openhpc_login_only_nodes: login \ No newline at end of file +openhpc_login_only_nodes: login + +openhpc_extra_config_overrides: {} 
+appliance_openhpc_extra_config: "{{ autoscale_openhpc_extra_config if groups['autoscale'] else {} }}" +openhpc_extra_config: "{{ appliance_openhpc_extra_config | combine(openhpc_extra_config_overrides) }}" From b285620e3d123aca983dc34bf6980470addd2732 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 8 Sep 2021 13:57:41 +0000 Subject: [PATCH 027/105] add autoscale group/group_vars --- ansible/templates/resume.j2 | 8 +++---- .../inventory/group_vars/all/autoscale.yml | 23 +++++++++++++++++++ environments/common/inventory/groups | 3 +++ 3 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 environments/common/inventory/group_vars/all/autoscale.yml diff --git a/ansible/templates/resume.j2 b/ansible/templates/resume.j2 index bf79c6c04..55a03d1ca 100644 --- a/ansible/templates/resume.j2 +++ b/ansible/templates/resume.j2 @@ -6,10 +6,10 @@ import openstack import pprint # all take a name or ID: -IMAGE = "{{ openhpc_autoscale_image }}" -NETWORK = "{{ openhpc_autoscale_network }}" -FLAVOR = "{{ openhpc_autoscale_flavor }}" -KEYPAIR = "{{ openhpc_autoscale_keypair }}" +IMAGE = "{{ autoscale_image }}" +NETWORK = "{{ autoscale_network }}" +FLAVOR = "{{ autoscale_flavor }}" +KEYPAIR = "{{ autoscale_keypair }}" # configure logging to syslog - by default only "info" and above categories appear logger = logging.getLogger("syslogger") diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml new file mode 100644 index 000000000..c5580d173 --- /dev/null +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -0,0 +1,23 @@ +autoscale_clouds: ~/.config/openstack/clouds.yaml +# TODO: change below to be defined somewhere else, poss as part of slurm config for partition?? +autoscale_image: ohpc-compute-210406-1108.qcow2 +autoscale_network: stackhpc +autoscale_flavor: chipolata +autoscale_keypair: steveb-local + +autoscale_openhpc_extra_config: + # required parameters: + SuspendExcNodes: "{{ groups['compute'] | join(',') }}" # i.e. all tf-defined nodes in the partition # TODO: fixme: hardcoded "compute" + SuspendProgram: /opt/slurm-tools/bin/suspend # TODO: fixme: hijacking slurm-tools + ResumeProgram: /opt/slurm-tools/bin/resume # TODO: fixme: hijacking slurm-tools + SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns + CommunicationParameters: NoAddrCache + # recommended: + PrivateData: cloud # shows cloud node state + # TODO: for testing only, not production: + DebugFlags: PowerSave + SuspendTime: 120 + SuspendTimeout: 300 + ResumeTimeout: 300 + # FIXME: maybe need to set TreeWidth to >= number of nodes (default: 50) + # power_save_*interval: options are defaults but should enable changes diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 4d86f8e7d..140fd509f 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -74,3 +74,6 @@ cluster [update] # All hosts to (optionally) run yum update on. 
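(Editorial aside, not part of these patches.) The `autoscale_openhpc_extra_config` added above keeps the Terraform-managed nodes out of power saving by joining the `compute` inventory group into `SuspendExcNodes`. As a hedged illustration of what that Jinja2 expression produces, the sketch below builds the same value outside Ansible and lets `scontrol` compress it; `static_nodes` is a hypothetical stand-in for the inventory group and `scontrol` is assumed to be on the PATH.

```python
#!/usr/bin/env python3
"""Illustrative only: build a SuspendExcNodes value equivalent to the
Jinja2 expression "{{ groups['compute'] | join(',') }}" and let scontrol
compress it. static_nodes is a hypothetical stand-in for the inventory group."""
import subprocess

static_nodes = ["sbscale-compute-0", "sbscale-compute-1"]  # hypothetical example

# Equivalent of the join filter used in the group_vars
suspend_exc_nodes = ",".join(static_nodes)
print("SuspendExcNodes=" + suspend_exc_nodes)

# 'scontrol show hostlist' is the inverse of 'scontrol show hostnames',
# giving the compact form e.g. sbscale-compute-[0-1]
compact = subprocess.run(
    ["scontrol", "show", "hostlist", suspend_exc_nodes],
    stdout=subprocess.PIPE, universal_newlines=True, check=True).stdout.strip()
print("SuspendExcNodes=" + compact)
```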
+ +[autoscale] +# Add control to enable autoscaling From 7ae504218720abd9ebfa9ff44493633db5a61c93 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 8 Sep 2021 13:57:58 +0000 Subject: [PATCH 028/105] use autoscale branch of openhpc role --- requirements.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index 90f280107..afa1ab90e 100644 --- a/requirements.yml +++ b/requirements.yml @@ -1,8 +1,9 @@ --- roles: - src: stackhpc.nfs - - src: stackhpc.openhpc + - src: https://github.com/stackhpc/ansible-role-openhpc version: feature/autoscale + name: stackhpc.openhpc - src: cloudalchemy.node_exporter - src: cloudalchemy.blackbox-exporter - src: cloudalchemy.prometheus From ea5c3bcd4c87619232bd630fb18dc9627770cd35 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Sep 2021 11:27:39 +0000 Subject: [PATCH 029/105] Add podman_cidr to allow changing podman network range --- ansible/roles/filebeat/templates/filebeat.service.j2 | 2 +- ansible/roles/kibana/templates/kibana.service.j2 | 2 +- ansible/roles/opendistro/templates/opendistro.service.j2 | 2 +- ansible/roles/podman/tasks/validate.yml | 5 +++++ environments/common/inventory/group_vars/all/podman.yml | 1 + requirements.txt | 1 + 6 files changed, 10 insertions(+), 3 deletions(-) diff --git a/ansible/roles/filebeat/templates/filebeat.service.j2 b/ansible/roles/filebeat/templates/filebeat.service.j2 index 9553784a2..454ed2339 100644 --- a/ansible/roles/filebeat/templates/filebeat.service.j2 +++ b/ansible/roles/filebeat/templates/filebeat.service.j2 @@ -12,7 +12,7 @@ After=network-online.target [Service] Environment=PODMAN_SYSTEMD_UNIT=%n Restart=always -ExecStart=/usr/bin/podman run --sdnotify=conmon --cgroups=no-conmon --replace --name filebeat --user root --restart=always --security-opt label=disable --volume /var/log/:/logs:ro --volume /etc/filebeat/filebeat.yml:/usr/share/filebeat/filebeat.yml:ro --detach=True docker.elastic.co/beats/filebeat-oss:7.9.3 -e -strict.perms=false -d "*" +ExecStart=/usr/bin/podman run --network slirp4netns:cidr={{ podman_cidr }} --sdnotify=conmon --cgroups=no-conmon --replace --name filebeat --user root --restart=always --security-opt label=disable --volume /var/log/:/logs:ro --volume /etc/filebeat/filebeat.yml:/usr/share/filebeat/filebeat.yml:ro --detach=True docker.elastic.co/beats/filebeat-oss:7.9.3 -e -strict.perms=false -d "*" ExecStop=/usr/bin/podman stop --ignore filebeat -t 10 ExecStopPost=/usr/bin/podman rm --ignore -f filebeat KillMode=none diff --git a/ansible/roles/kibana/templates/kibana.service.j2 b/ansible/roles/kibana/templates/kibana.service.j2 index 91011344a..4658e4cb3 100644 --- a/ansible/roles/kibana/templates/kibana.service.j2 +++ b/ansible/roles/kibana/templates/kibana.service.j2 @@ -9,7 +9,7 @@ After=network-online.target [Service] Environment=PODMAN_SYSTEMD_UNIT=%n Restart=always -ExecStart=/usr/bin/podman run --sdnotify=conmon --cgroups=no-conmon -d --replace --name kibana --restart=no --env ELASTICSEARCH_URL=https://{{ elasticsearch_address }}:9200 --env ELASTICSEARCH_HOSTS=https://{{ elasticsearch_address}}:9200 --env ELASTICSEARCH_USERNAME=admin --env ELASTICSEARCH_PASSWORD="{{ vault_elasticsearch_admin_password }}" --publish 5601:5601 --detach=True amazon/opendistro-for-elasticsearch-kibana:1.12.0 +ExecStart=/usr/bin/podman run --network slirp4netns:cidr={{ podman_cidr }} --sdnotify=conmon --cgroups=no-conmon -d --replace --name kibana --restart=no --env ELASTICSEARCH_URL=https://{{ elasticsearch_address }}:9200 --env 
ELASTICSEARCH_HOSTS=https://{{ elasticsearch_address}}:9200 --env ELASTICSEARCH_USERNAME=admin --env ELASTICSEARCH_PASSWORD="{{ vault_elasticsearch_admin_password }}" --publish 5601:5601 --detach=True amazon/opendistro-for-elasticsearch-kibana:1.12.0 ExecStop=/usr/bin/podman stop --ignore kibana -t 10 ExecStopPost=/usr/bin/podman rm --ignore -f kibana KillMode=none diff --git a/ansible/roles/opendistro/templates/opendistro.service.j2 b/ansible/roles/opendistro/templates/opendistro.service.j2 index 1b2095795..ddf99aea6 100644 --- a/ansible/roles/opendistro/templates/opendistro.service.j2 +++ b/ansible/roles/opendistro/templates/opendistro.service.j2 @@ -9,7 +9,7 @@ After=network-online.target [Service] Environment=PODMAN_SYSTEMD_UNIT=%n Restart=always -ExecStart=/usr/bin/podman run --sdnotify=conmon --cgroups=no-conmon -d --replace --name opendistro --restart=no --user elasticsearch --ulimit memlock=-1:-1 --ulimit nofile=65536:65536 --volume opendistro:/usr/share/elasticsearch/data --volume /etc/elastic/internal_users.yml:/usr/share/elasticsearch/plugins/opendistro_security/securityconfig/internal_users.yml:ro --env node.name=opendistro --env discovery.type=single-node --env bootstrap.memory_lock=true --env "ES_JAVA_OPTS=-Xms512m -Xmx512m" --publish 9200:9200 amazon/opendistro-for-elasticsearch:1.12.0 +ExecStart=/usr/bin/podman run --network slirp4netns:cidr={{ podman_cidr }} --sdnotify=conmon --cgroups=no-conmon -d --replace --name opendistro --restart=no --user elasticsearch --ulimit memlock=-1:-1 --ulimit nofile=65536:65536 --volume opendistro:/usr/share/elasticsearch/data --volume /etc/elastic/internal_users.yml:/usr/share/elasticsearch/plugins/opendistro_security/securityconfig/internal_users.yml:ro --env node.name=opendistro --env discovery.type=single-node --env bootstrap.memory_lock=true --env "ES_JAVA_OPTS=-Xms512m -Xmx512m" --publish 9200:9200 amazon/opendistro-for-elasticsearch:1.12.0 ExecStop=/usr/bin/podman stop --ignore opendistro -t 10 # note for some reason this returns status=143 which makes systemd show the unit as failed, not stopped ExecStopPost=/usr/bin/podman rm --ignore -f opendistro diff --git a/ansible/roles/podman/tasks/validate.yml b/ansible/roles/podman/tasks/validate.yml index 14b13d11f..7edd84ee9 100644 --- a/ansible/roles/podman/tasks/validate.yml +++ b/ansible/roles/podman/tasks/validate.yml @@ -7,3 +7,8 @@ assert: that: podman_tmp_fstype.stdout == 'tmpfs' fail_msg: "{{ podman_tmp_fstype }} (variable podman_tmp_fstype) must be on tmpfs" + +- name: Check host IP is not within podman network CIDR + assert: + that: ( podman_cidr | ansible.netcommon.network_in_network(ansible_default_ipv4.address)) == false + fail_msg: "Default ipv4 address {{ ansible_default_ipv4.address }} for {{ inventory_hostname }} is in podman network range {{ podman_cidr }} - set `podman_cidr` to avoid host network address range" \ No newline at end of file diff --git a/environments/common/inventory/group_vars/all/podman.yml b/environments/common/inventory/group_vars/all/podman.yml index 10ece8cff..866b81090 100644 --- a/environments/common/inventory/group_vars/all/podman.yml +++ b/environments/common/inventory/group_vars/all/podman.yml @@ -1 +1,2 @@ podman_users: "{{ appliances_local_users_podman }}" +podman_cidr: 10.0.2.0/24 # see slirp4netns:cidr= at https://docs.podman.io/en/latest/markdown/podman-run.1.html diff --git a/requirements.txt b/requirements.txt index 6895fa3b8..57cca7e83 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ passlib[bcrypt] cookiecutter 
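(Editorial aside, not part of these patches.) The new `podman/tasks/validate.yml` above asserts that the host's default IPv4 address does not fall inside `podman_cidr` (the slirp4netns range) using the `ansible.netcommon.network_in_network` filter, which is why `netaddr` joins `requirements.txt`. Purely as a sketch of the same overlap test with the Python standard library - the addresses are example values, with `10.0.2.0/24` taken from the group_vars default:

```python
#!/usr/bin/env python3
"""Sketch of the podman_cidr sanity check using only the stdlib.
The role itself uses the ansible.netcommon.network_in_network filter;
the host address below is an example value, not taken from the patches."""
import ipaddress

podman_cidr = "10.0.2.0/24"   # default slirp4netns range from group_vars
host_address = "10.0.3.100"   # stands in for ansible_default_ipv4.address

if ipaddress.ip_address(host_address) in ipaddress.ip_network(podman_cidr):
    raise SystemExit(
        f"host address {host_address} is inside podman network {podman_cidr}; "
        "override podman_cidr to avoid the host's network range")
print("podman_cidr does not overlap the host address - OK")
```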
vagranttoansible selinux # this is a shim to avoid having to use --system-site-packages, you still need sudo yum install libselinux-python3 +netaddr From 237b0698c13e0ec9ebe6fb9fa0f25896d2fd9c39 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Sep 2021 13:32:25 +0000 Subject: [PATCH 030/105] fix order of slurm.conf changes and {Resume,Suspend}Program creation (workaround) --- ansible/slurm.yml | 10 ++++++++++ .../common/inventory/group_vars/all/autoscale.yml | 2 -- .../common/inventory/group_vars/all/openhpc.yml | 1 - 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 6fa5c8a50..444de81eb 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -36,6 +36,7 @@ name: stackhpc.slurm_openstack_tools.rebuild - name: Setup autoscaling suspend/resume programs + # has to happen *after* slurm user has been created hosts: autoscale # this is the *controller* become: yes tags: @@ -80,6 +81,15 @@ group: slurm mode: u=rwx,go= tags: resume + - name: Add Resume/SuspendProgram parameters + community.general.ini_file: + path: /etc/slurm/slurm.conf + option: "{{ item.key }}" + section: null + value: "{{ item.value }}" + no_extra_spaces: true + create: no + loop: "{{ {'SuspendProgram':'/opt/slurm-tools/bin/suspend', 'ResumeProgram':'/opt/slurm-tools/bin/resume'} | dict2items }}" # TODO: fixme: hijacking slurm-tools - name: Reconfigure slurm command: cmd: scontrol reconfigure diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index c5580d173..a16d93659 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -8,8 +8,6 @@ autoscale_keypair: steveb-local autoscale_openhpc_extra_config: # required parameters: SuspendExcNodes: "{{ groups['compute'] | join(',') }}" # i.e. 
all tf-defined nodes in the partition # TODO: fixme: hardcoded "compute" - SuspendProgram: /opt/slurm-tools/bin/suspend # TODO: fixme: hijacking slurm-tools - ResumeProgram: /opt/slurm-tools/bin/resume # TODO: fixme: hijacking slurm-tools SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns CommunicationParameters: NoAddrCache # recommended: diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 165a86ee0..70d9289c9 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -15,7 +15,6 @@ openhpc_slurmdbd_mysql_username: slurm openhpc_slurm_control_host: "{{ groups['control'] | first }}" openhpc_slurm_partitions: - name: "compute" - # cloud_nodes: 2 openhpc_default_packages: - slurm-libpmi-ohpc # to allow intel mpi to work properly - ohpc-gnu9-openmpi4-perf-tools # for hpctests From 82e4fac29c28f99c43fb7916453f494f6abeee77 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Sep 2021 08:25:59 +0000 Subject: [PATCH 031/105] turn up slurmctld logging --- environments/common/inventory/group_vars/all/autoscale.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index a16d93659..649007e15 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -14,6 +14,7 @@ autoscale_openhpc_extra_config: PrivateData: cloud # shows cloud node state # TODO: for testing only, not production: DebugFlags: PowerSave + SlurmctldSyslogDebug: info SuspendTime: 120 SuspendTimeout: 300 ResumeTimeout: 300 From 1353f86a9a24ae8c3d6230f8d68070caec279654 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Sep 2021 08:40:34 +0000 Subject: [PATCH 032/105] add extension to templates --- ansible/templates/{resume.j2 => resume.py.j2} | 0 ansible/templates/{suspend.j2 => suspend.py.j2} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename ansible/templates/{resume.j2 => resume.py.j2} (100%) rename ansible/templates/{suspend.j2 => suspend.py.j2} (100%) diff --git a/ansible/templates/resume.j2 b/ansible/templates/resume.py.j2 similarity index 100% rename from ansible/templates/resume.j2 rename to ansible/templates/resume.py.j2 diff --git a/ansible/templates/suspend.j2 b/ansible/templates/suspend.py.j2 similarity index 100% rename from ansible/templates/suspend.j2 rename to ansible/templates/suspend.py.j2 From 60473139d22659ec57a1314d2b069e3913d94738 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Sep 2021 08:45:24 +0000 Subject: [PATCH 033/105] log exception tracebacks from resume/suspend programs --- ansible/slurm.yml | 4 ++-- ansible/templates/resume.py.j2 | 6 +++++- ansible/templates/suspend.py.j2 | 6 +++++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 444de81eb..a8a324624 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -67,7 +67,7 @@ name: stackhpc.slurm_openstack_tools.pytools - name: Create SuspendProgram template: - src: suspend.j2 + src: suspend.py.j2 dest: /opt/slurm-tools/bin/suspend owner: slurm group: slurm @@ -75,7 +75,7 @@ tags: suspend - name: Create ResumeProgram template: - src: resume.j2 + src: resume.py.j2 dest: /opt/slurm-tools/bin/resume owner: slurm group: slurm diff --git a/ansible/templates/resume.py.j2 b/ansible/templates/resume.py.j2 
index 55a03d1ca..4f116d8e2 100644 --- a/ansible/templates/resume.py.j2 +++ b/ansible/templates/resume.py.j2 @@ -49,4 +49,8 @@ def resume(): logger.info(f"server: {server}") if __name__ == "__main__": - sys.exit(resume()) + try: + sys.exit(resume()) + except: + logger.exception('Exception in main:') + raise diff --git a/ansible/templates/suspend.py.j2 b/ansible/templates/suspend.py.j2 index 02d09bc0d..c003ad3ae 100644 --- a/ansible/templates/suspend.py.j2 +++ b/ansible/templates/suspend.py.j2 @@ -32,4 +32,8 @@ def suspend(): delete_server(conn, node) if __name__ == "__main__": - sys.exit(suspend()) + try: + sys.exit(suspend()) + except: + logger.exception('Exception in main:') + raise From 919ff5031c996e148abe9ffca5f0fcb3e42b64f0 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Sep 2021 13:25:54 +0000 Subject: [PATCH 034/105] chhange appcred owner --- ansible/slurm.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index a8a324624..aa5515b52 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -52,15 +52,15 @@ file: path: /etc/openstack state: directory - owner: root - group: root - mode: '0400' + owner: slurm + group: slurm + mode: '0500' - name: Copy out clouds.yaml copy: src: "{{ autoscale_clouds }}" dest: /etc/openstack/clouds.yaml - owner: root - group: root + owner: slurm + group: slurm mode: '0400' - name: Setup slurm tools (to get venv) include_role: From 02377b110569b6dfe7c9b98d684ac2837e368605 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Sep 2021 13:26:14 +0000 Subject: [PATCH 035/105] fix try/except in resume/suspend --- ansible/templates/resume.py.j2 | 2 +- ansible/templates/suspend.py.j2 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/templates/resume.py.j2 b/ansible/templates/resume.py.j2 index 4f116d8e2..2722780d6 100644 --- a/ansible/templates/resume.py.j2 +++ b/ansible/templates/resume.py.j2 @@ -53,4 +53,4 @@ if __name__ == "__main__": sys.exit(resume()) except: logger.exception('Exception in main:') - raise + raise diff --git a/ansible/templates/suspend.py.j2 b/ansible/templates/suspend.py.j2 index c003ad3ae..7a4c70f9c 100644 --- a/ansible/templates/suspend.py.j2 +++ b/ansible/templates/suspend.py.j2 @@ -36,4 +36,4 @@ if __name__ == "__main__": sys.exit(suspend()) except: logger.exception('Exception in main:') - raise + raise From b0622d96a6ea7b0b3666d867a1b9a85b5b0c34ef Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Sep 2021 13:45:11 +0000 Subject: [PATCH 036/105] handle incorrect resume config --- ansible/templates/resume.py.j2 | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/ansible/templates/resume.py.j2 b/ansible/templates/resume.py.j2 index 2722780d6..b0f869bd7 100644 --- a/ansible/templates/resume.py.j2 +++ b/ansible/templates/resume.py.j2 @@ -6,10 +6,12 @@ import openstack import pprint # all take a name or ID: -IMAGE = "{{ autoscale_image }}" -NETWORK = "{{ autoscale_network }}" -FLAVOR = "{{ autoscale_flavor }}" -KEYPAIR = "{{ autoscale_keypair }}" +config = { + 'image': "{{ autoscale_image }}", + 'network': "{{ autoscale_network }}", + 'flavor': "{{ autoscale_flavor }}", + 'keypair': "{{ autoscale_keypair }}", +} # configure logging to syslog - by default only "info" and above categories appear logger = logging.getLogger("syslogger") @@ -23,10 +25,14 @@ def expand_nodes(hostlist_expr): def create_server(conn, name): - image = conn.compute.find_image(IMAGE) - flavor = 
conn.compute.find_flavor(FLAVOR) - network = conn.network.find_network(NETWORK) - keypair = conn.compute.find_keypair(KEYPAIR) + image = conn.compute.find_image(config['image']) + flavor = conn.compute.find_flavor(config['flavor']) + network = conn.network.find_network(config['network']) + keypair = conn.compute.find_keypair(config['keypair']) + + for ix, item in enumerate((image, flavor, network, keypair)): + if item is None: + raise ValueError(f'Specified {list(config)[ix]} {config[list(config)[ix]]} was not found') server = conn.compute.create_server( name=name, image_id=image.id, flavor_id=flavor.id, @@ -45,7 +51,7 @@ def resume(): for node in new_nodes: logger.info(f"creating node {node}") - server = create_server(conn, node) + server = create_server(conn, node) # TODO: save id to disk so can use it instead of name on deletion (to cope with multiple instances with same name) logger.info(f"server: {server}") if __name__ == "__main__": From d1ba38e378951f50b3f25caacd1c634c3d07e632 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Sep 2021 13:47:11 +0000 Subject: [PATCH 037/105] fix autoscale config for smslabs --- .../common/inventory/group_vars/all/autoscale.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index 649007e15..cdf4a5eb6 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -1,9 +1,9 @@ autoscale_clouds: ~/.config/openstack/clouds.yaml # TODO: change below to be defined somewhere else, poss as part of slurm config for partition?? -autoscale_image: ohpc-compute-210406-1108.qcow2 -autoscale_network: stackhpc -autoscale_flavor: chipolata -autoscale_keypair: steveb-local +autoscale_image: ohpc-compute-210909-1316.qcow2 +autoscale_network: stackhpc-ipv4-geneve +autoscale_flavor: general.v1.small +autoscale_keypair: centos-at-steveb-ansible autoscale_openhpc_extra_config: # required parameters: From 8e2a8270c091e54054374f4572b8995a2c43df34 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Sep 2021 14:26:05 +0000 Subject: [PATCH 038/105] avoid suspend/resume exceptions on successful run --- ansible/templates/resume.py.j2 | 3 ++- ansible/templates/suspend.py.j2 | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ansible/templates/resume.py.j2 b/ansible/templates/resume.py.j2 index b0f869bd7..d1fb85bcd 100644 --- a/ansible/templates/resume.py.j2 +++ b/ansible/templates/resume.py.j2 @@ -53,10 +53,11 @@ def resume(): logger.info(f"creating node {node}") server = create_server(conn, node) # TODO: save id to disk so can use it instead of name on deletion (to cope with multiple instances with same name) logger.info(f"server: {server}") + # Don't need scontrol update nodename={node} nodeaddr={server_ip} as using SlurmctldParameters=cloud_dns if __name__ == "__main__": try: - sys.exit(resume()) + resume() except: logger.exception('Exception in main:') raise diff --git a/ansible/templates/suspend.py.j2 b/ansible/templates/suspend.py.j2 index 7a4c70f9c..dadfc2e4a 100644 --- a/ansible/templates/suspend.py.j2 +++ b/ansible/templates/suspend.py.j2 @@ -33,7 +33,7 @@ def suspend(): if __name__ == "__main__": try: - sys.exit(suspend()) + suspend() except: logger.exception('Exception in main:') raise From 37055b50abe7cae6ac4fee8afdd0fb765b86e465 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 23 Sep 2021 11:33:31 +0000 
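(Editorial aside, not part of these patches.) By this point the resume and suspend templates have converged on a common shape: expand the hostlist expression Slurm passes as the first argument via `scontrol show hostnames`, talk to OpenStack through openstacksdk using the slurm-owned `clouds.yaml`, and log full tracebacks to syslog. The sketch below is a condensed, illustrative reconstruction of that shape, not the literal templated files: the `config` values stand in for the Jinja2-substituted `autoscale_*` variables, the single-file dispatch on `argv[0]` is a simplification (the patches install two separate scripts), and `openstack.connect()` is assumed to pick up `/etc/openstack/clouds.yaml`.

```python
#!/opt/slurm-tools/bin/python3
"""Condensed, illustrative sketch of the ResumeProgram/SuspendProgram pair.
Not the literal templates: config values stand in for the Jinja2-substituted
autoscale_* variables, and both programs are folded into one file here."""
import logging.handlers
import subprocess
import sys

import openstack

config = {
    "image": "ohpc-compute-210909-1316.qcow2",  # autoscale_image
    "network": "stackhpc-ipv4-geneve",          # autoscale_network
    "flavor": "general.v1.small",               # autoscale_flavor
    "keypair": "centos-at-steveb-ansible",      # autoscale_keypair
}

# slurmctld runs these non-interactively, so log to syslog
logger = logging.getLogger("syslogger")
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.handlers.SysLogHandler("/dev/log"))


def expand_nodes(hostlist_expr):
    """Expand e.g. 'compute-[0-1]' into ['compute-0', 'compute-1']."""
    scontrol = subprocess.run(
        ["scontrol", "show", "hostnames", hostlist_expr],
        stdout=subprocess.PIPE, universal_newlines=True, check=True)
    return scontrol.stdout.strip().split("\n")


def resume(hostlist_expr):
    conn = openstack.connect()  # assumed to read /etc/openstack/clouds.yaml
    found = {
        "image": conn.compute.find_image(config["image"]),
        "flavor": conn.compute.find_flavor(config["flavor"]),
        "network": conn.network.find_network(config["network"]),
        "keypair": conn.compute.find_keypair(config["keypair"]),
    }
    for key, value in found.items():
        if value is None:
            raise ValueError(f"specified {key} {config[key]!r} was not found")
    for node in expand_nodes(hostlist_expr):
        logger.info(f"creating node {node}")
        server = conn.compute.create_server(
            name=node, image_id=found["image"].id, flavor_id=found["flavor"].id,
            networks=[{"uuid": found["network"].id}], key_name=found["keypair"].name)
        logger.info(f"server: {server}")
        # cloud_dns in SlurmctldParameters avoids 'scontrol update nodeaddr=...'


def suspend(hostlist_expr):
    conn = openstack.connect()
    for node in expand_nodes(hostlist_expr):
        logger.info(f"deleting node {node}")
        server = conn.compute.find_server(node)  # TODO in patches: track IDs instead
        if server is not None:
            conn.compute.delete_server(server)


if __name__ == "__main__":
    try:
        # Slurm invokes SuspendProgram/ResumeProgram with a hostlist expression
        action = resume if sys.argv[0].endswith("resume") else suspend
        action(sys.argv[1])
    except Exception:
        logger.exception("Exception in main:")
        raise
```

Slurm only hands these programs a hostlist expression and expects them to return promptly; whether a power-up or power-down is judged successful is governed by `ResumeTimeout`/`SuspendTimeout` and the state reported by slurmd, which is why the group_vars above shorten those timeouts for testing.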
Subject: [PATCH 039/105] basic (messy) working autoscale --- ansible/slurm.yml | 99 +++++++------------ .../inventory/group_vars/all/autoscale.yml | 8 +- .../inventory/group_vars/all/openhpc.yml | 3 +- .../inventory/group_vars/all/rebuild.yml | 3 + 4 files changed, 46 insertions(+), 67 deletions(-) create mode 100644 environments/common/inventory/group_vars/all/rebuild.yml diff --git a/ansible/slurm.yml b/ansible/slurm.yml index a724c4854..3574c97d2 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -9,91 +9,62 @@ - include_role: name: geerlingguy.mysql -- name: Setup slurm - hosts: openhpc - become: yes - tags: - - openhpc - tasks: - # - name: Add CentOS 8.3 Vault repo for OpenHPC hwloc dependency - # # NB: REMOVE THIS once OpenHPC works on CentOS 8.4 - # yum_repository: - # name: vault - # file: CentOS-Linux-Vault8.3 - # description: CentOS 8.3 packages from Vault - # baseurl: https://vault.centos.org/8.3.2011/BaseOS/$basearch/os/ - # gpgkey: file:///etc/pki/rpm-gpg/RPM-GPG-KEY-centosofficial - - import_role: - name: stackhpc.openhpc - -- name: Setup slurm-driven reimage - hosts: rebuild - become: yes - tags: +- name: Enable Slurm-controlled instance changes + hosts: - rebuild - - openhpc - tasks: - - import_role: - name: stackhpc.slurm_openstack_tools.rebuild - -- name: Setup autoscaling suspend/resume programs - # has to happen *after* slurm user has been created - hosts: autoscale # this is the *controller* - become: yes - tags: - autoscale - tasks: -- name: Configure autoscale - hosts: autoscale - become: yes + become: true tags: + - rebuild - autoscale + - openhpc tasks: - name: Create /etc/openstack file: path: /etc/openstack state: directory - owner: slurm - group: slurm - mode: '0500' + owner: root # This will be changed later + group: root + mode: u=r - name: Copy out clouds.yaml copy: - src: "{{ autoscale_clouds }}" + src: "{{ openhpc_rebuild_clouds }}" # TODO: name is wrong really! dest: /etc/openstack/clouds.yaml - owner: slurm - group: slurm - mode: '0400' - - name: Setup slurm tools (to get venv) + mode: u=rx + - name: Setup slurm tools # this adds reboot script only at present include_role: - name: stackhpc.slurm_openstack_tools.pytools - - name: Create SuspendProgram + name: stackhpc.slurm_openstack_tools.pytools # TODO: could just move this into the repo? 
+ - name: Create SuspendProgram # TODO: FIXME: add to slurm-tools template: src: suspend.py.j2 dest: /opt/slurm-tools/bin/suspend - owner: slurm - group: slurm - mode: u=rwx,go= + mode: u=rwx,g=rx,o=rx # is OK as clouds.yaml is protected tags: suspend - - name: Create ResumeProgram + when: "'autoscale' in group_names" + - name: Create ResumeProgram # TODO: FIXME: add to slurm-tools template: src: resume.py.j2 dest: /opt/slurm-tools/bin/resume - owner: slurm - group: slurm - mode: u=rwx,go= + mode: u=rwx,g=rx,o=rx # is OK as clouds.yaml is protected tags: resume - - name: Add Resume/SuspendProgram parameters - community.general.ini_file: - path: /etc/slurm/slurm.conf - option: "{{ item.key }}" - section: null - value: "{{ item.value }}" - no_extra_spaces: true - create: no - loop: "{{ {'SuspendProgram':'/opt/slurm-tools/bin/suspend', 'ResumeProgram':'/opt/slurm-tools/bin/resume'} | dict2items }}" # TODO: fixme: hijacking slurm-tools - - name: Reconfigure slurm - command: - cmd: scontrol reconfigure + when: "'autoscale' in group_names" + +- name: Setup slurm + hosts: openhpc + become: yes + tags: + - openhpc + tasks: + # - name: Add CentOS 8.3 Vault repo for OpenHPC hwloc dependency + # # NB: REMOVE THIS once OpenHPC works on CentOS 8.4 + # yum_repository: + # name: vault + # file: CentOS-Linux-Vault8.3 + # description: CentOS 8.3 packages from Vault + # baseurl: https://vault.centos.org/8.3.2011/BaseOS/$basearch/os/ + # gpgkey: file:///etc/pki/rpm-gpg/RPM-GPG-KEY-centosofficial + - import_role: + name: stackhpc.openhpc - name: Set locked memory limits on user-facing nodes hosts: diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index cdf4a5eb6..ab8516ca7 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -10,13 +10,19 @@ autoscale_openhpc_extra_config: SuspendExcNodes: "{{ groups['compute'] | join(',') }}" # i.e. 
all tf-defined nodes in the partition # TODO: fixme: hardcoded "compute" SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns CommunicationParameters: NoAddrCache + SuspendProgram: /opt/slurm-tools/bin/suspend + ResumeProgram: /opt/slurm-tools/bin/resume # recommended: PrivateData: cloud # shows cloud node state # TODO: for testing only, not production: DebugFlags: PowerSave SlurmctldSyslogDebug: info SuspendTime: 120 - SuspendTimeout: 300 + SuspendTimeout: 30 ResumeTimeout: 300 # FIXME: maybe need to set TreeWidth to >= number of nodes (default: 50) # power_save_*interval: options are defaults but should enable changes +openhpc_slurm_dirs: + - /etc/openstack + - /opt/slurm-tools +openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml \ No newline at end of file diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 70d9289c9..8bf38293f 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -26,5 +26,4 @@ openhpc_slurm_configless: true openhpc_login_only_nodes: login openhpc_extra_config_overrides: {} -appliance_openhpc_extra_config: "{{ autoscale_openhpc_extra_config if groups['autoscale'] else {} }}" -openhpc_extra_config: "{{ appliance_openhpc_extra_config | combine(openhpc_extra_config_overrides) }}" +openhpc_extra_config: "{{ {} | combine(rebuild_openhpc_extra_config, autoscale_openhpc_extra_config, openhpc_extra_config_overrides) }}" # TODO: handle case where groups aren't defined! diff --git a/environments/common/inventory/group_vars/all/rebuild.yml b/environments/common/inventory/group_vars/all/rebuild.yml new file mode 100644 index 000000000..4026a0e21 --- /dev/null +++ b/environments/common/inventory/group_vars/all/rebuild.yml @@ -0,0 +1,3 @@ +rebuild_openhpc_extra_config: + RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild +openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml \ No newline at end of file From 6a37f50975a836df352e7d39b67f3a425d5f45c5 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 24 Sep 2021 13:20:17 +0000 Subject: [PATCH 040/105] make clouds.yaml idemponent (TODO: fix for rebuild nodes) --- ansible/slurm.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 3574c97d2..cb2cfa974 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -19,21 +19,31 @@ - autoscale - openhpc tasks: + - name: Check if slurm user exists + command: + cmd: "id slurm" + register: id_slurm + failed_when: false + changed_when: false - name: Create /etc/openstack file: path: /etc/openstack state: directory - owner: root # This will be changed later - group: root + owner: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task + group: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task mode: u=r - name: Copy out clouds.yaml copy: src: "{{ openhpc_rebuild_clouds }}" # TODO: name is wrong really! dest: /etc/openstack/clouds.yaml mode: u=rx + owner: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task + group: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task - name: Setup slurm tools # this adds reboot script only at present include_role: name: stackhpc.slurm_openstack_tools.pytools # TODO: could just move this into the repo? 
+ vars: # TODO: debug + pytools_editable: true # git repo in /opt/slurm-tools/src/slurm-openstack-tools - name: Create SuspendProgram # TODO: FIXME: add to slurm-tools template: src: suspend.py.j2 From 49a76cc67592b9982795ae67f435c6e362368413 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 24 Sep 2021 15:14:29 +0000 Subject: [PATCH 041/105] fix /etc/openstack permissions for autoscale --- ansible/slurm.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index cb2cfa974..69cf11d96 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -31,12 +31,12 @@ state: directory owner: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task group: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task - mode: u=r + mode: u=rX,go= - name: Copy out clouds.yaml copy: src: "{{ openhpc_rebuild_clouds }}" # TODO: name is wrong really! dest: /etc/openstack/clouds.yaml - mode: u=rx + mode: u=r,go= owner: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task group: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task - name: Setup slurm tools # this adds reboot script only at present From 9c9a69e116face2742d0c24dc1ba0fc23badb0b8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 24 Sep 2021 15:15:06 +0000 Subject: [PATCH 042/105] use openhpc_suspend_exc_nodes to prevent login nodes autoscaling --- environments/common/inventory/group_vars/all/autoscale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index ab8516ca7..fb5ac3cbd 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -5,9 +5,9 @@ autoscale_network: stackhpc-ipv4-geneve autoscale_flavor: general.v1.small autoscale_keypair: centos-at-steveb-ansible +openhpc_suspend_exc_nodes: "{{ (groups['compute'] + groups.get('login', [])) }}" # i.e. all tf-defined nodes in the partition # TODO: fixme: hardcoded "compute" autoscale_openhpc_extra_config: # required parameters: - SuspendExcNodes: "{{ groups['compute'] | join(',') }}" # i.e. 
all tf-defined nodes in the partition # TODO: fixme: hardcoded "compute" SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns CommunicationParameters: NoAddrCache SuspendProgram: /opt/slurm-tools/bin/suspend From 10a20363636c23d54c7d616e9cf8ea499ae64f87 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 28 Sep 2021 08:11:17 +0000 Subject: [PATCH 043/105] install slurm user before adding slurm tools --- ansible/slurm.yml | 22 +++++++++---------- .../inventory/group_vars/all/autoscale.yml | 3 --- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 69cf11d96..73fe05216 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -9,7 +9,7 @@ - include_role: name: geerlingguy.mysql -- name: Enable Slurm-controlled instance changes +- name: Enable Slurm/OpenStack integrations hosts: - rebuild - autoscale @@ -19,31 +19,31 @@ - autoscale - openhpc tasks: - - name: Check if slurm user exists - command: - cmd: "id slurm" - register: id_slurm - failed_when: false - changed_when: false + - name: Install slurm packages to create slurm user + import_role: + name: stackhpc.openhpc + tasks_from: install.yml - name: Create /etc/openstack file: path: /etc/openstack state: directory - owner: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task - group: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task + owner: slurm # TODO: check if this works for rebuild too? + group: slurm mode: u=rX,go= - name: Copy out clouds.yaml copy: src: "{{ openhpc_rebuild_clouds }}" # TODO: name is wrong really! dest: /etc/openstack/clouds.yaml mode: u=r,go= - owner: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task - group: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task + owner: slurm # TODO: check if this works for rebuild too? + group: slurm - name: Setup slurm tools # this adds reboot script only at present include_role: name: stackhpc.slurm_openstack_tools.pytools # TODO: could just move this into the repo? vars: # TODO: debug pytools_editable: true # git repo in /opt/slurm-tools/src/slurm-openstack-tools + become_user: slurm # TODO: check if this works for rebuild too? 
+ become_flags: '-s /bin/bash' - name: Create SuspendProgram # TODO: FIXME: add to slurm-tools template: src: suspend.py.j2 diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index fb5ac3cbd..9c0eefc9d 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -22,7 +22,4 @@ autoscale_openhpc_extra_config: ResumeTimeout: 300 # FIXME: maybe need to set TreeWidth to >= number of nodes (default: 50) # power_save_*interval: options are defaults but should enable changes -openhpc_slurm_dirs: - - /etc/openstack - - /opt/slurm-tools openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml \ No newline at end of file From 7de823fbb22c1499de1ee758aab2a290e00396c9 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 28 Sep 2021 13:16:56 +0000 Subject: [PATCH 044/105] read node Features to get openstack instance information --- ansible/templates/resume.py.j2 | 101 +++++++++++++++++++++++++-------- 1 file changed, 78 insertions(+), 23 deletions(-) diff --git a/ansible/templates/resume.py.j2 b/ansible/templates/resume.py.j2 index d1fb85bcd..e36d044f6 100644 --- a/ansible/templates/resume.py.j2 +++ b/ansible/templates/resume.py.j2 @@ -1,47 +1,75 @@ #!/opt/slurm-tools/bin/python3 -""" Create OpenStack instances """ +""" A Slurm ResumeProgram to create OpenStack instances. + + Usage: + + resume HOSTLIST_EXPRESSION [debug] + + where: + HOSTLIST_EXPRESSION: Name(s) of node(s) to create, using Slurm's hostlist expression, as per [1]. + debug: Any 2nd argument puts this in debug mode which is more verbose but does not actually create nodes. + + Output and exceptions are written to the syslog. + + The flavor, image, network and keypair to be used must be defined as node Features [2] in the format "parameter=value". + + OpenStack credentials must be available to this script (e.g. via an application credential in /etc/openstack/clouds.yaml readable by the slurm user) + + [1]: https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeProgram + [2]: https://slurm.schedmd.com/slurm.conf.html#OPT_Features +""" import sys, subprocess, logging.handlers import openstack import pprint -# all take a name or ID: -config = { - 'image': "{{ autoscale_image }}", - 'network': "{{ autoscale_network }}", - 'flavor': "{{ autoscale_flavor }}", - 'keypair': "{{ autoscale_keypair }}", -} +REQUIRED_PARAMS = ('image', 'flavor', 'keypair', 'network') # configure logging to syslog - by default only "info" and above categories appear logger = logging.getLogger("syslogger") logger.setLevel(logging.DEBUG) handler = logging.handlers.SysLogHandler("/dev/log") +handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) logger.addHandler(handler) def expand_nodes(hostlist_expr): scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) return scontrol.stdout.strip().split('\n') -def create_server(conn, name): - - image = conn.compute.find_image(config['image']) - flavor = conn.compute.find_flavor(config['flavor']) - network = conn.network.find_network(config['network']) - keypair = conn.compute.find_keypair(config['keypair']) +def get_features(nodenames): + """ Retrieve the features specified for given node(s). + + Returns a dict with a key/value pair for each node. Keys are node names, values are lists of strings, one string per feature. 
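+        e.g. {'dev-small-cloud-1': ['image=ohpc-compute-210909-1316.qcow2', 'flavor=general.v1.small', 'keypair=centos-at-steveb-ansible', 'network=stackhpc-ipv4-geneve']} (node name and values purely illustrative)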
+ """ + + scontrol = subprocess.run(['scontrol', 'show', 'node', nodenames], stdout=subprocess.PIPE, universal_newlines=True) + features = {} + for line in scontrol.stdout.splitlines(): + line = line.strip() + if line.startswith('NodeName'): # NodeName=dev-small-cloud-1 CoresPerSocket=1 + node = line.split()[0].split('=')[1] + if line.startswith('AvailableFeatures'): + feature_args = line.split('=', 1)[1] + features[node] = feature_args.split(',') + break + + return features - for ix, item in enumerate((image, flavor, network, keypair)): - if item is None: - raise ValueError(f'Specified {list(config)[ix]} {config[list(config)[ix]]} was not found') +def create_server(conn, name, image, flavor, network, keypair): server = conn.compute.create_server( name=name, image_id=image.id, flavor_id=flavor.id, - networks=[{"uuid": network.id}], key_name=keypair.name) + networks=[{"uuid": network.id}], key_name=keypair.name, + ) + #server = conn.compute.wait_for_server(...) - #server = conn.compute.wait_for_server(server) return server def resume(): + debug = False + if len(sys.argv) > 2: + logger.info(f"Running in debug mode - won't actually create nodes") + debug = True hostlist_expr = sys.argv[1] logger.info(f"Slurmctld invoked resume {hostlist_expr}") new_nodes = expand_nodes(hostlist_expr) @@ -49,11 +77,38 @@ def resume(): conn = openstack.connection.from_config() logger.info(f"Got openstack connection {conn}") + features = get_features(hostlist_expr) + logger.info(f"Read feature information from slurm") + logger.info(f"Features: {features}") + for node in new_nodes: - logger.info(f"creating node {node}") - server = create_server(conn, node) # TODO: save id to disk so can use it instead of name on deletion (to cope with multiple instances with same name) - logger.info(f"server: {server}") - # Don't need scontrol update nodename={node} nodeaddr={server_ip} as using SlurmctldParameters=cloud_dns + # extract the openstack parameters from node features: + if node not in features: + logger.error(f"No Feature definitions found for node {node}: {features}") + os_parameters = dict(feature.split('=') for feature in features[node]) + if debug: + logger.info(f"os_parameters for {node}: {os_parameters}") + missing = set(REQUIRED_PARAMS).difference(os_parameters.keys()) + if missing: + logger.error(f"Missing {','.join(missing)} from feature definition for node {node}: {os_parameters}") + + # get openstack objects: + os_objects = { + 'image': conn.compute.find_image(os_parameters['image']), + 'flavor': conn.compute.find_flavor(os_parameters['flavor']), + 'network': conn.network.find_network(os_parameters['network']), + 'keypair': conn.compute.find_keypair(os_parameters['keypair']), + } + not_found = dict((k, v) for (k, v) in os_objects.items() if v is None) + if not_found: + raise ValueError('Could not find openstack objects for: %s' % ', '.join(not_found)) + if debug: + logger.info(f"os_objects for {node} : {os_objects}") + if not debug: + logger.info(f"creating node {node}") + server = create_server(conn, node, **os_objects) # TODO: save id to disk so can use it instead of name on deletion (to cope with multiple instances with same name) + logger.info(f"server: {server}") + # Don't need scontrol update nodename={node} nodeaddr={server_ip} as using SlurmctldParameters=cloud_dns if __name__ == "__main__": try: From d7bfa7547931d337ead4166cb420f1e3078adf53 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 28 Sep 2021 14:43:54 +0000 Subject: [PATCH 045/105] move autoscale node info to 
openhpc_slurm_partitions --- .../common/inventory/group_vars/all/autoscale.yml | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index 9c0eefc9d..f9fafa715 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -1,11 +1,5 @@ autoscale_clouds: ~/.config/openstack/clouds.yaml -# TODO: change below to be defined somewhere else, poss as part of slurm config for partition?? -autoscale_image: ohpc-compute-210909-1316.qcow2 -autoscale_network: stackhpc-ipv4-geneve -autoscale_flavor: general.v1.small -autoscale_keypair: centos-at-steveb-ansible - -openhpc_suspend_exc_nodes: "{{ (groups['compute'] + groups.get('login', [])) }}" # i.e. all tf-defined nodes in the partition # TODO: fixme: hardcoded "compute" +autoscale_openhpc_suspend_exc_nodes: "{{ (groups['compute'] + groups.get('login', [])) }}" # i.e. all tf-defined nodes in the partition # TODO: fixme: hardcoded "compute" autoscale_openhpc_extra_config: # required parameters: SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns @@ -22,4 +16,4 @@ autoscale_openhpc_extra_config: ResumeTimeout: 300 # FIXME: maybe need to set TreeWidth to >= number of nodes (default: 50) # power_save_*interval: options are defaults but should enable changes -openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml \ No newline at end of file +openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: fix name here? From 544b1abcc2b28154f7cb562e1cbac0d7ccfdd21d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 08:21:03 +0000 Subject: [PATCH 046/105] rename openhpc vars --- .../inventory/group_vars/all/openhpc.yml | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 8bf38293f..7ce64defd 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -15,15 +15,24 @@ openhpc_slurmdbd_mysql_username: slurm openhpc_slurm_control_host: "{{ groups['control'] | first }}" openhpc_slurm_partitions: - name: "compute" -openhpc_default_packages: + +# TODO: WIP PR to change/deprecate name here: +openhpc_packages_default: - slurm-libpmi-ohpc # to allow intel mpi to work properly - ohpc-gnu9-openmpi4-perf-tools # for hpctests - openblas-gnu9-ohpc # for hpctests (HPL) -openhpc_extra_packages: [] -openhpc_packages: "{{ openhpc_default_packages + openhpc_extra_packages }}" +openhpc_packages_extra: [] +openhpc_packages: "{{ openhpc_packages_default + openhpc_packages_extra }}" + openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}" openhpc_slurm_configless: true openhpc_login_only_nodes: login -openhpc_extra_config_overrides: {} -openhpc_extra_config: "{{ {} | combine(rebuild_openhpc_extra_config, autoscale_openhpc_extra_config, openhpc_extra_config_overrides) }}" # TODO: handle case where groups aren't defined! +openhpc_config_default: "{{ rebuild_openhpc_extra_config | combine(autoscale_openhpc_extra_config) }}" # TODO: handle case where groups aren't defined! 
+openhpc_config_extra: {} +# TODO: WIP PR for openhpc_extra_config -> openhpc_config +openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra) }}" + +openhpc_suspend_exc_nodes_default: "{{ autoscale_openhpc_suspend_exc_nodes }}" # TODO: handle cases where groups aren't defined +openhpc_env_suspend_exc_nodes_extra: [] +openhpc_suspend_exc_nodes: "{{ openhpc_suspend_exc_nodes_default + openhpc_env_suspend_exc_nodes_extra }}" From 31d8e848b6e475ef3de859d580c019679a006b8b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 08:28:57 +0000 Subject: [PATCH 047/105] add vars from smslabs environment as demo --- environments/smslabs/activate | 23 ++++++++++ environments/smslabs/hooks/post.yml | 19 ++++++++ .../smslabs/inventory/group_vars/all/.gitkeep | 0 .../inventory/group_vars/all/users.yml | 3 ++ .../group_vars/openhpc/overrides.yml | 9 ++++ .../group_vars/openhpc/partitions.yml | 19 ++++++++ .../inventory/group_vars/podman/overrides.yml | 1 + environments/smslabs/inventory/groups | 43 +++++++++++++++++++ environments/smslabs/inventory/hosts | 18 ++++++++ 9 files changed, 135 insertions(+) create mode 100644 environments/smslabs/activate create mode 100644 environments/smslabs/hooks/post.yml create mode 100644 environments/smslabs/inventory/group_vars/all/.gitkeep create mode 100644 environments/smslabs/inventory/group_vars/all/users.yml create mode 100644 environments/smslabs/inventory/group_vars/openhpc/overrides.yml create mode 100755 environments/smslabs/inventory/group_vars/openhpc/partitions.yml create mode 100644 environments/smslabs/inventory/group_vars/podman/overrides.yml create mode 100644 environments/smslabs/inventory/groups create mode 100755 environments/smslabs/inventory/hosts diff --git a/environments/smslabs/activate b/environments/smslabs/activate new file mode 100644 index 000000000..e74031095 --- /dev/null +++ b/environments/smslabs/activate @@ -0,0 +1,23 @@ +export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) +echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" + +APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) +export PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" + +export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") +echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" + +export TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") +echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root" + +export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") +echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root" + +export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT") +echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" + +if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then + export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg +fi + + diff --git a/environments/smslabs/hooks/post.yml b/environments/smslabs/hooks/post.yml new file mode 100644 index 000000000..87e637f8c --- /dev/null +++ b/environments/smslabs/hooks/post.yml @@ -0,0 +1,19 @@ +- hosts: control + become: true + tasks: + - name: Prevent ansible_user's processes being killed on compute nodes at job completion + replace: + path: /etc/slurm/slurm.epilog.clean + regexp: 'if \[ \$SLURM_UID -lt 100 \] ; then' + replace: "if [[ $SLURM_UID -lt 100 || $SLURM_JOB_USER -eq {{ ansible_user }} ]] ; then" + - name: Make a /home/test directory for centos + file: + path: /home/test + state: directory + owner: 
centos + group: centos + - name: Install ewatch + git: + repo: https://github.com/sjpb/ewatch.git + dest: /home/test/ewatch + force: yes diff --git a/environments/smslabs/inventory/group_vars/all/.gitkeep b/environments/smslabs/inventory/group_vars/all/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/environments/smslabs/inventory/group_vars/all/users.yml b/environments/smslabs/inventory/group_vars/all/users.yml new file mode 100644 index 000000000..3de23fee4 --- /dev/null +++ b/environments/smslabs/inventory/group_vars/all/users.yml @@ -0,0 +1,3 @@ +users: + - name: stig + pubkey: ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDTXW9Y0r0cTW/ay6FEIlIejuRPZZ+ObzR08XFzp4x8ecCW//WSZAjo1fD/u/CQGoV552QCjWj+tP9Cy9UcsI3WLAx+n4i48oHqvpRLO1CLgJazNpQ8Bc7GveF78xhD5EoL/IpcAFKIad3CU7gb8HLRJIQpER1OsY96T9ViKe9lDWy8mk2WjoYoU1niMtmbs549Gqwl+fGNdBVUsGS5k7Xy4D/0T8TitthN3W6UbMHXVCUzdd3v9TNl7hgyeq6dCvRS6g8Vmlp2Ia0NLkrWF+bqP2RhRuqWOj71PD3auPAq0hF4yqdW9awMuZY8vBesnjE3iC2h34jvFkYaolGTfDZUa48s7yBTpjWoINUSbg105KJoPg55lWwXj58MMhvyX6hyYl3oJMiG3eq48jAAA4n80EKK4IBXrg/yjpuoDiNGqVe9hDAoT94j3+s8Smz5rohsKQVS+l266eyjo2VLUVR2NaOnw5fW86MEUyTicvHjSN4xOCGjSK2j1k6hXT7EiuM= stig@nrel-jumphost.novalocal \ No newline at end of file diff --git a/environments/smslabs/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs/inventory/group_vars/openhpc/overrides.yml new file mode 100644 index 000000000..4bed1823f --- /dev/null +++ b/environments/smslabs/inventory/group_vars/openhpc/overrides.yml @@ -0,0 +1,9 @@ +openhpc_extra_packages: + - git + - python3 +openhpc_extra_config_overrides: + SlurmctldDebug: debug + SlurmdDebug: debug + +#example_list: "{{ example_list + [7] }}" # FAILS - recursive +#example_dict: "{{ example_dict | combine({c: 4} ) }}" # FAILS - recursive diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml new file mode 100755 index 000000000..e7df7b946 --- /dev/null +++ b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml @@ -0,0 +1,19 @@ +cloud_spec: + image: ohpc-compute-210909-1316.qcow2 + flavor: general.v1.small + keypair: centos-at-steveb-ansible + network: stackhpc-ipv4-geneve + +openhpc_slurm_partitions: +- name: small + cloud_nodes: 2 + features: "{{ cloud_spec.items() | map('join', '=') }}" + default: yes + # TODO: consider adding suspend_exc: true here?? + +- name: cloud_only + cloud_nodes: 3 + ram_mb: 9996 + cpus: 2 + features: "{{ cloud_spec.items() | map('join', '=') }}" + default: no diff --git a/environments/smslabs/inventory/group_vars/podman/overrides.yml b/environments/smslabs/inventory/group_vars/podman/overrides.yml new file mode 100644 index 000000000..18e712665 --- /dev/null +++ b/environments/smslabs/inventory/group_vars/podman/overrides.yml @@ -0,0 +1 @@ +podman_cidr: 192.168.1.0/24 diff --git a/environments/smslabs/inventory/groups b/environments/smslabs/inventory/groups new file mode 100644 index 000000000..acf3ca6bc --- /dev/null +++ b/environments/smslabs/inventory/groups @@ -0,0 +1,43 @@ +[nfs:children] +openhpc + +[hpctests:children] +# Login node to use for running mpi-based testing. 
+login + +[mysql:children] +control + +[prometheus:children] +control + +[grafana:children] +control + +[alertmanager:children] +control + +[node_exporter:children] +# disabled node_exporter on control to avoid noise in syslog +login +compute + +[opendistro:children] +control + +[kibana:children] +control + +[slurm_stats:children] +control + +[filebeat:children] +slurm_stats + +# NB: [rebuild] not defined here as this template is used in CI, which does not run in openstack + +[update:children] +cluster + +[autoscale:children] +control diff --git a/environments/smslabs/inventory/hosts b/environments/smslabs/inventory/hosts new file mode 100755 index 000000000..5ab90d3b8 --- /dev/null +++ b/environments/smslabs/inventory/hosts @@ -0,0 +1,18 @@ +[all:vars] +ansible_user=centos +openhpc_cluster_name=dev + +[control] +dev-control ansible_host=10.0.3.182 server_networks='{"stackhpc-ipv4-geneve":["10.0.3.182"]}' + +[login] +dev-login-1 ansible_host=10.0.1.54 server_networks='{"stackhpc-ipv4-geneve":["10.0.1.54"]}' + +[compute] +dev-small-0 ansible_host=10.0.1.217 server_networks='{"stackhpc-ipv4-geneve":["10.0.1.217"]}' +dev-small-1 ansible_host=10.0.3.253 server_networks='{"stackhpc-ipv4-geneve":["10.0.3.253"]}' + +# Define groups for slurm parititions: +[dev_small] +dev-small-0 +dev-small-1 From 3257a8566b23a15efecde19be92d7878d2240b65 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 09:00:29 +0000 Subject: [PATCH 048/105] cope with no non-cloud nodes in suspend_exc defaults --- environments/common/inventory/group_vars/all/autoscale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index f9fafa715..06ed90034 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -1,5 +1,5 @@ autoscale_clouds: ~/.config/openstack/clouds.yaml -autoscale_openhpc_suspend_exc_nodes: "{{ (groups['compute'] + groups.get('login', [])) }}" # i.e. all tf-defined nodes in the partition # TODO: fixme: hardcoded "compute" +autoscale_openhpc_suspend_exc_nodes: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. all nodes in inventory, i.e. not in State=CLOUD initially autoscale_openhpc_extra_config: # required parameters: SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns From 75a00693539317430567d355dab3015c93e27f74 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 09:00:50 +0000 Subject: [PATCH 049/105] smslabs: more complex partition example --- .../group_vars/openhpc/partitions.yml | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml index e7df7b946..e750bef08 100755 --- a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml +++ b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml @@ -6,14 +6,21 @@ cloud_spec: openhpc_slurm_partitions: - name: small - cloud_nodes: 2 - features: "{{ cloud_spec.items() | map('join', '=') }}" default: yes - # TODO: consider adding suspend_exc: true here?? 
- -- name: cloud_only - cloud_nodes: 3 - ram_mb: 9996 - cpus: 2 + cloud_nodes: '-[2-3]' features: "{{ cloud_spec.items() | map('join', '=') }}" + +- name: burst default: no + groups: + - name: smallmem + cloud_nodes: '[0-3]' + ram_mb: 9996 + cpus: 2 + features: "{{ cloud_spec.items() | map('join', '=') }}" + - name: bigmem + cloud_nodes: '[4-6]' + ram_mb: 9992 + cpus: 2 + features: "{{ cloud_spec.items() | map('join', '=') }}" + From 4a61c5dadb07064de96f204350735ea6d91b37ae Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 09:27:14 +0000 Subject: [PATCH 050/105] use cloud_features support --- .../smslabs/inventory/group_vars/openhpc/partitions.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml index e750bef08..4d632a001 100755 --- a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml +++ b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml @@ -8,19 +8,19 @@ openhpc_slurm_partitions: - name: small default: yes cloud_nodes: '-[2-3]' - features: "{{ cloud_spec.items() | map('join', '=') }}" + cloud_features: "{{ cloud_spec }}" - name: burst default: no groups: - name: smallmem cloud_nodes: '[0-3]' + cloud_features: "{{ cloud_spec }}" ram_mb: 9996 cpus: 2 - features: "{{ cloud_spec.items() | map('join', '=') }}" - name: bigmem cloud_nodes: '[4-6]' + cloud_features: "{{ cloud_spec }}" ram_mb: 9992 cpus: 2 - features: "{{ cloud_spec.items() | map('join', '=') }}" From 74404c2bc7e729da8f20a8b03adfa76352075998 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 10:20:16 +0000 Subject: [PATCH 051/105] fix feature extraction for multiple nodes --- ansible/templates/resume.py.j2 | 2 -- 1 file changed, 2 deletions(-) diff --git a/ansible/templates/resume.py.j2 b/ansible/templates/resume.py.j2 index e36d044f6..559beca93 100644 --- a/ansible/templates/resume.py.j2 +++ b/ansible/templates/resume.py.j2 @@ -51,7 +51,6 @@ def get_features(nodenames): if line.startswith('AvailableFeatures'): feature_args = line.split('=', 1)[1] features[node] = feature_args.split(',') - break return features @@ -79,7 +78,6 @@ def resume(): features = get_features(hostlist_expr) logger.info(f"Read feature information from slurm") - logger.info(f"Features: {features}") for node in new_nodes: # extract the openstack parameters from node features: From 7d13831f82b2f4b29e307ddb63230f355c0a7930 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 10:20:37 +0000 Subject: [PATCH 052/105] smslabs: testable (default) burst partition --- .../group_vars/openhpc/partitions.yml | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml index 4d632a001..d2ed22fa3 100755 --- a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml +++ b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml @@ -1,26 +1,27 @@ -cloud_spec: +general_v1_small: image: ohpc-compute-210909-1316.qcow2 flavor: general.v1.small keypair: centos-at-steveb-ansible network: stackhpc-ipv4-geneve +general_v1_medium: + image: ohpc-compute-210909-1316.qcow2 + flavor: general.v1.medium + keypair: centos-at-steveb-ansible + network: stackhpc-ipv4-geneve + openhpc_slurm_partitions: - name: small - default: yes + default: no cloud_nodes: '-[2-3]' - cloud_features: "{{ 
cloud_spec }}" + cloud_features: "{{ general_v1_small }}" - name: burst - default: no + default: yes groups: - - name: smallmem - cloud_nodes: '[0-3]' - cloud_features: "{{ cloud_spec }}" - ram_mb: 9996 - cpus: 2 - - name: bigmem + - name: medium cloud_nodes: '[4-6]' - cloud_features: "{{ cloud_spec }}" - ram_mb: 9992 - cpus: 2 + cloud_features: "{{ general_v1_medium }}" + ram_mb: "{{ (15258 * 0.95) | int }}" + sockets_per_board: 4 From 8d627f4fedfbe7e870cecff65f577208b01961e0 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 13:43:36 +0000 Subject: [PATCH 053/105] write instance ID to StateSaveLocation on creation --- ansible/templates/resume.py.j2 | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/ansible/templates/resume.py.j2 b/ansible/templates/resume.py.j2 index 559beca93..c17890608 100644 --- a/ansible/templates/resume.py.j2 +++ b/ansible/templates/resume.py.j2 @@ -19,7 +19,7 @@ [2]: https://slurm.schedmd.com/slurm.conf.html#OPT_Features """ -import sys, subprocess, logging.handlers +import sys, os, subprocess, logging.handlers import openstack import pprint @@ -32,6 +32,13 @@ handler = logging.handlers.SysLogHandler("/dev/log") handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) logger.addHandler(handler) +def get_statesavelocation(): + """ Return the path for Slurm's StateSaveLocation """ + scontrol = subprocess.run(['scontrol', 'show', 'config'], stdout=subprocess.PIPE, universal_newlines=True) + for line in scontrol.stdout.splitlines(): + if line.startswith('StateSaveLocation'): # StateSaveLocation = /var/spool/slurm + return line.split()[-1] + def expand_nodes(hostlist_expr): scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) return scontrol.stdout.strip().split('\n') @@ -78,6 +85,8 @@ def resume(): features = get_features(hostlist_expr) logger.info(f"Read feature information from slurm") + + statedir = get_statesavelocation() for node in new_nodes: # extract the openstack parameters from node features: @@ -106,6 +115,8 @@ def resume(): logger.info(f"creating node {node}") server = create_server(conn, node, **os_objects) # TODO: save id to disk so can use it instead of name on deletion (to cope with multiple instances with same name) logger.info(f"server: {server}") + with open(os.path.join(statedir, node), 'w') as f: + f.write(server.id) # Don't need scontrol update nodename={node} nodeaddr={server_ip} as using SlurmctldParameters=cloud_dns if __name__ == "__main__": From 8b3118968e8fca2209c439a327e83ab582ccb693 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 13:59:36 +0000 Subject: [PATCH 054/105] use instance id on deletion --- ansible/templates/suspend.py.j2 | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/ansible/templates/suspend.py.j2 b/ansible/templates/suspend.py.j2 index dadfc2e4a..e296604ab 100644 --- a/ansible/templates/suspend.py.j2 +++ b/ansible/templates/suspend.py.j2 @@ -1,7 +1,7 @@ #!/opt/slurm-tools/bin/python3 """ Delete openstack instances """ -import sys, subprocess, logging, logging.handlers +import sys, os, subprocess, logging, logging.handlers import openstack import pprint @@ -9,8 +9,16 @@ import pprint logger = logging.getLogger("syslogger") logger.setLevel(logging.DEBUG) handler = logging.handlers.SysLogHandler("/dev/log") +handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) logger.addHandler(handler) +def get_statesavelocation(): + 
""" Return the path for Slurm's StateSaveLocation """ + scontrol = subprocess.run(['scontrol', 'show', 'config'], stdout=subprocess.PIPE, universal_newlines=True) + for line in scontrol.stdout.splitlines(): + if line.startswith('StateSaveLocation'): # StateSaveLocation = /var/spool/slurm + return line.split()[-1] + def expand_nodes(hostlist_expr): scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) return scontrol.stdout.strip().split('\n') @@ -28,8 +36,17 @@ def suspend(): logger.info(f"Got openstack connection {conn}") for node in remove_nodes: - logger.info(f"deleting node {node}") - delete_server(conn, node) + instance_id = False + statedir = get_statesavelocation() + instance_file = os.path.join(statedir, node) + try: + with open(instance_file) as f: + instance_id = f.read() + except FileNotFoundError: + logger.info(f"no instance file found in {statedir} for node {node}") + + logger.info(f"deleting node {instance_id or node}") + delete_server(conn, (instance_id or node)) if __name__ == "__main__": try: From a1ba9ead1f1b7ef4d900acb82514d5da1ac85ce9 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 30 Sep 2021 08:37:42 +0000 Subject: [PATCH 055/105] fixup rebuild/autoscale variable names --- .../common/inventory/group_vars/all/autoscale.yml | 3 +-- environments/common/inventory/group_vars/all/openhpc.yml | 9 ++++----- environments/common/inventory/group_vars/all/rebuild.yml | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index 06ed90034..26edefea1 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -1,6 +1,5 @@ -autoscale_clouds: ~/.config/openstack/clouds.yaml autoscale_openhpc_suspend_exc_nodes: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. all nodes in inventory, i.e. not in State=CLOUD initially -autoscale_openhpc_extra_config: +autoscale_openhpc_config: # required parameters: SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns CommunicationParameters: NoAddrCache diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 7ce64defd..fc540fd4e 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -28,11 +28,10 @@ openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}" openhpc_slurm_configless: true openhpc_login_only_nodes: login -openhpc_config_default: "{{ rebuild_openhpc_extra_config | combine(autoscale_openhpc_extra_config) }}" # TODO: handle case where groups aren't defined! +openhpc_config_default: "{{ rebuild_openhpc_config | combine(autoscale_openhpc_config) }}" # TODO: handle case where groups aren't defined! 
openhpc_config_extra: {} -# TODO: WIP PR for openhpc_extra_config -> openhpc_config openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra) }}" -openhpc_suspend_exc_nodes_default: "{{ autoscale_openhpc_suspend_exc_nodes }}" # TODO: handle cases where groups aren't defined -openhpc_env_suspend_exc_nodes_extra: [] -openhpc_suspend_exc_nodes: "{{ openhpc_suspend_exc_nodes_default + openhpc_env_suspend_exc_nodes_extra }}" +openhpc_suspend_exc_nodes_default: "{{ autoscale_openhpc_suspend_exc_nodes }}" +openhpc_suspend_exc_nodes_extra: [] +openhpc_suspend_exc_nodes: "{{ openhpc_suspend_exc_nodes_default + openhpc_suspend_exc_nodes_extra }}" diff --git a/environments/common/inventory/group_vars/all/rebuild.yml b/environments/common/inventory/group_vars/all/rebuild.yml index 4026a0e21..b1162ffc3 100644 --- a/environments/common/inventory/group_vars/all/rebuild.yml +++ b/environments/common/inventory/group_vars/all/rebuild.yml @@ -1,3 +1,3 @@ -rebuild_openhpc_extra_config: +rebuild_openhpc_config: RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml \ No newline at end of file From ebf3dd9265b721c3739e096eb9b8212a5fbee5f8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 30 Sep 2021 12:49:50 +0000 Subject: [PATCH 056/105] create autoscale role with auto-modification of openhpc_slurm_partitions --- ansible/.gitignore | 2 + ansible/roles/autoscale/.travis.yml | 29 +++++++ ansible/roles/autoscale/README.md | 74 ++++++++++++++++++ ansible/roles/autoscale/defaults/main.yml | 2 + .../openhpc_partitions.cpython-36.pyc | Bin 0 -> 1462 bytes .../filter_plugins/openhpc_partitions.py | 48 ++++++++++++ ansible/roles/autoscale/meta.empty/main.yml | 52 ++++++++++++ ansible/roles/autoscale/tasks/main.yml | 19 +++++ ansible/roles/autoscale/tasks/validate.yml | 5 ++ .../autoscale}/templates/resume.py.j2 | 0 .../autoscale}/templates/suspend.py.j2 | 0 ansible/slurm.yml | 28 +++---- ansible/validate.yml | 9 +++ .../group_vars/openhpc/partitions.yml | 15 ++-- 14 files changed, 256 insertions(+), 27 deletions(-) create mode 100644 ansible/roles/autoscale/.travis.yml create mode 100644 ansible/roles/autoscale/README.md create mode 100644 ansible/roles/autoscale/defaults/main.yml create mode 100644 ansible/roles/autoscale/filter_plugins/__pycache__/openhpc_partitions.cpython-36.pyc create mode 100644 ansible/roles/autoscale/filter_plugins/openhpc_partitions.py create mode 100644 ansible/roles/autoscale/meta.empty/main.yml create mode 100644 ansible/roles/autoscale/tasks/main.yml create mode 100644 ansible/roles/autoscale/tasks/validate.yml rename ansible/{ => roles/autoscale}/templates/resume.py.j2 (100%) rename ansible/{ => roles/autoscale}/templates/suspend.py.j2 (100%) diff --git a/ansible/.gitignore b/ansible/.gitignore index bf07028ab..fd78abade 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -22,3 +22,5 @@ roles/* !roles/block_devices/** !roles/basic_users/ !roles/basic_users/** +!roles/autoscale/ +!roles/autoscale/** \ No newline at end of file diff --git a/ansible/roles/autoscale/.travis.yml b/ansible/roles/autoscale/.travis.yml new file mode 100644 index 000000000..36bbf6208 --- /dev/null +++ b/ansible/roles/autoscale/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # 
Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/ansible/roles/autoscale/README.md b/ansible/roles/autoscale/README.md new file mode 100644 index 000000000..63c6c13ab --- /dev/null +++ b/ansible/roles/autoscale/README.md @@ -0,0 +1,74 @@ +# autoscale + +Support autoscaling nodes on OpenStack clouds, i.e. creating nodes when necessary to service the queue and deleting them when they are no longer needed. + +This is implemented using Slurm's ["elastic computing"](https://slurm.schedmd.com/elastic_computing.html) features which are based on Slurm's [power saving](https://slurm.schedmd.com/power_save.html) features. + + +NOTES TODO: +- Won't get monitoring for autoscaling nodes +- Describe autoscale vs `State=CLOUD` and powersaving enablement. +- Describe groups. +- Describe cpu/memory info requirements (inc. for mixed partitions) +- Describe what happens on failure. + + +## Requirements + +- Role `stackhpc.slurm_openstack_tools.pytools`. Installs [slurm-openstack-tools](github.com/stackhpc/slurm-openstack-tools) which provides a venv with the `openstacksdk`. +- Role `stackhpc.openhpc` to create a Slurm cluster. +- This role should be run on the Slurm controller only, i.e. add the `control` group to the `autoscale` group to activate this functionality. + +## Role Variables + +- `openhpc_slurm_partitions`: This role modifies what the partitions/groups defined [openhpc_slurm_partitions](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) in the by `stackhpc.openhpc` role accept: + - `cloud_nodes`: Optional. As per the `stackhpc.openhpc` docs this defines nodes in a ["CLOUD" state](https://slurm.schedmd.com/slurm.conf.html#OPT_CLOUD), i.e. treated as powered down/not existing when the Slurm control daemon starts. The value is a suffix for the group/partition's node names in Slurm's hostlist expression format (e.g. `-[11-25]`) and therefore defines the number of CLOUD-state nodes. + - `cloud_instances`: Required if `cloud_nodes` is defined. A dict defining the `flavor`, `image`, `keypair` and `network` to use for CLOUD-state instances in this partition/group. Values for these parameters may be either names (if unique in the cloud) or IDs. + +Some examples are given below. + +### Processor/memory information +Non-CLOUD-state nodes in a group/partition are defined by the hosts in an inventory group named `_` as per `stackhpc.openhpc` [docs](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) and processor/memory information is automatically retrieved from them. + +- If a group/partition contains both CLOUD and non-CLOUD nodes the processor/memory information for the CLOUD nodes is assumed to match that retrieved for the non-CLOUD nodes. +- If a group/partition only contains CLOUD-state nodes (i.e. no matching inventory group or it is empty) then processor/memory information must be specified using the `ram_mb`, `sockets`, `cores_per_socket` and `threads_per_core` options. 
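+
+As a fuller sketch, a cloud-only group combining these options might look like the following (all values are illustrative and simply mirror the smslabs demo environment added later in this series):
+
+```yaml
+openhpc_slurm_partitions:
+  - name: burst
+    cloud_nodes: '-[1-4]'      # hostlist-style suffix defining the CLOUD-state node names
+    cloud_instances:
+      image: ohpc-compute-210909-1316.qcow2
+      flavor: general.v1.medium
+      keypair: centos-at-steveb-ansible
+      network: stackhpc-ipv4-geneve
+    # processor/memory info must be given explicitly as this group has no non-CLOUD nodes:
+    ram_mb: "{{ (15258 * 0.95) | int }}"
+    sockets: 1
+    cores_per_socket: 4
+    threads_per_core: 1
+```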
+ + + + + + ```yaml + cloud_instances: + flavor: general.v1.medium + image: ohpc-compute-210909-1316.qcow2 + keypair: centos-at-steveb-ansible + network: "{{ autoscale_network }}" + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. + + + + +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/ansible/roles/autoscale/defaults/main.yml b/ansible/roles/autoscale/defaults/main.yml new file mode 100644 index 000000000..e85b2db21 --- /dev/null +++ b/ansible/roles/autoscale/defaults/main.yml @@ -0,0 +1,2 @@ +--- +# defaults file for autoscale diff --git a/ansible/roles/autoscale/filter_plugins/__pycache__/openhpc_partitions.cpython-36.pyc b/ansible/roles/autoscale/filter_plugins/__pycache__/openhpc_partitions.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..100545137c74508e603d7c79d3de87c9f5cc88b6 GIT binary patch literal 1462 zcmaJ>PjBNy6rUM8w(F#@D%G-E5E!IYh?XV@ZcC`P+7`4DWJTLmC5x5U^-QvM9owB5 zmo{=FMEU{P3m3F^xFAlP_#|`X#8>EvH%`+=fEa7uyz#uhzxQT7SzUEkUOo+fvl03e zEe#X+hcHDS2t^d9XpDV~HBVCFlX^9LgEoDW+SIu~e&Y*7SEze|sLQOG;oFq#qfYBD zuz)&PS&Z|Xi-EF+X?U7*)riO8h$;J+ofl!um7Ov9ZO)G)tTkXXU^*}#!4!W0Is3gr zJLuW3Q+)Pwg%b#?!XrE*@bAYdsR;Zb)R>W}LCuOBTYQ5yV72*|5+{UOmGKjz_A6tC zzsJ-`3~9>73@27aXyY!L;TfKqmC3(bl3uAy&~aaZ9OyJ_Yd?$-Xf*pZy^9i?w&?1G z-hq|GnL&)&7DguvQL=^%cUYp$V4xa2hdEn7J60QeBStFj8g-~V|j@hKyFEh%9 zamMJD-;C0{q(PQbCj8gun>Z6P%pxWx??2BAmc1yVK%^xf2SvzbEaN;A&%N5F*PeXZ z_C&#=co;L`wU^Aby`xfkvGB&R5OFqI9A2AmcQ5gWllyHimTdg*(`8jHAs%=Nca<|_ zp)5HQ*9A17X1a*`x`1y30pQTH^>4@&PoaYaa1 zQutWmQ{Do-np__CUgTrei&!Re(W?{R42xouveRCeiTEgGJ)Wmb^uiJ(q7e4OIF*bC zMOuy^z1{+~8@#&3`K8-WVYzf!G3Oq1#ze z#)wI-)z#YeJfhF*0CsinD>KRCOc{bn_2%C0*H0hs?d}AR2m1%xgGal;_QAp4zH-6k zkZ}fOhAVR>Sw`WeXp}n*<=(`tOwFrCU2WyoOQ74xU%e`85K76#4SxKVj_a-eN$QSH zeggENqHnS!Omf2Ly3iz_xkwXPW=ll5g!-7A3s zgx~};C5YspPuL zxdo)-DEq1x=Y3!$ah8N1E5aGSt)(=8%F2%t7RlPse|c0+e;(u38i)%QNL>wL66cM- E0Z#me!T') + if 'cloud_nodes' in group: + if 'cloud_instances' not in group: + raise errors.AnsibleFilterError(f"`openhpc_slurm_partitions` group '{group_name}' specifies 'cloud_nodes' but is missing 'cloud_instances'.") + missing_attrs = ','.join(set(REQUIRED_INSTANCE_ATTRS).difference(group['cloud_instances'])) + if missing_attrs: + raise errors.AnsibleFilterError(f"`openhpc_slurm_partitions` group '{group_name}' item 'cloud_instances' is missing items: {missing_attrs}.") + if 'features' not in group: + group['features'] = [] + group['features'].extend(['%s=%s' % (k, v) for (k, v) in group['cloud_instances'].items()]) + + return partitions + +class FilterModule(object): + + def filters(self): + return { + 'modify_autoscale_partitions': modify_autoscale_partitions, + } diff --git 
a/ansible/roles/autoscale/meta.empty/main.yml b/ansible/roles/autoscale/meta.empty/main.yml new file mode 100644 index 000000000..c572acc9f --- /dev/null +++ b/ansible/roles/autoscale/meta.empty/main.yml @@ -0,0 +1,52 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.1 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. + # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. 
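For orientation, a rough sketch of what the `modify_autoscale_partitions` filter above (applied by the `set_fact` task in `tasks/main.yml` below) does to a partition/group definition — values are illustrative, taken from the smslabs demo:

    - name: burst
      cloud_nodes: '-[1-4]'
      cloud_instances:
        image: ohpc-compute-210909-1316.qcow2
        flavor: general.v1.medium
        keypair: centos-at-steveb-ansible
        network: stackhpc-ipv4-geneve

becomes, after validating that `image`, `flavor`, `keypair` and `network` are all present, a definition whose `features` list carries one `key=value` string per `cloud_instances` entry:

    - name: burst
      cloud_nodes: '-[1-4]'
      cloud_instances: { ... }   # left in place
      features:
        - image=ohpc-compute-210909-1316.qcow2
        - flavor=general.v1.medium
        - keypair=centos-at-steveb-ansible
        - network=stackhpc-ipv4-geneve

The ResumeProgram later reads these features back via `scontrol show node` to work out what to boot for each CLOUD node.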
diff --git a/ansible/roles/autoscale/tasks/main.yml b/ansible/roles/autoscale/tasks/main.yml new file mode 100644 index 000000000..86ff4c438 --- /dev/null +++ b/ansible/roles/autoscale/tasks/main.yml @@ -0,0 +1,19 @@ +--- + +- name: Create SuspendProgram + template: + src: suspend.py.j2 + dest: /opt/slurm-tools/bin/suspend + mode: u=rx,go= + tags: suspend + when: "'autoscale' in group_names" +- name: Create ResumeProgram # TODO: FIXME: add to slurm-tools + template: + src: resume.py.j2 + dest: /opt/slurm-tools/bin/resume + mode: u=rx,go= + #was: mode: u=rwx,g=rx,o=rx # is OK as clouds.yaml is protected + tags: resume +- name: Modify openhpc_slurm_partitions + set_fact: + openhpc_slurm_partitions: "{{ openhpc_slurm_partitions | modify_autoscale_partitions }}" diff --git a/ansible/roles/autoscale/tasks/validate.yml b/ansible/roles/autoscale/tasks/validate.yml new file mode 100644 index 000000000..5a56fa019 --- /dev/null +++ b/ansible/roles/autoscale/tasks/validate.yml @@ -0,0 +1,5 @@ +--- + +- name: Check openhpc_slurm_partitions information + debug: + msg: "{{ openhpc_slurm_partitions | modify_autoscale_partitions | to_nice_yaml }}" diff --git a/ansible/templates/resume.py.j2 b/ansible/roles/autoscale/templates/resume.py.j2 similarity index 100% rename from ansible/templates/resume.py.j2 rename to ansible/roles/autoscale/templates/resume.py.j2 diff --git a/ansible/templates/suspend.py.j2 b/ansible/roles/autoscale/templates/suspend.py.j2 similarity index 100% rename from ansible/templates/suspend.py.j2 rename to ansible/roles/autoscale/templates/suspend.py.j2 diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 73fe05216..174a3cf55 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -27,7 +27,7 @@ file: path: /etc/openstack state: directory - owner: slurm # TODO: check if this works for rebuild too? + owner: slurm # TODO: check this works for rebuild too group: slurm mode: u=rX,go= - name: Copy out clouds.yaml @@ -35,29 +35,19 @@ src: "{{ openhpc_rebuild_clouds }}" # TODO: name is wrong really! dest: /etc/openstack/clouds.yaml mode: u=r,go= - owner: slurm # TODO: check if this works for rebuild too? + owner: slurm # TODO: check this works for rebuild too group: slurm - - name: Setup slurm tools # this adds reboot script only at present + - name: Setup slurm tools include_role: name: stackhpc.slurm_openstack_tools.pytools # TODO: could just move this into the repo? vars: # TODO: debug - pytools_editable: true # git repo in /opt/slurm-tools/src/slurm-openstack-tools - become_user: slurm # TODO: check if this works for rebuild too? 
- become_flags: '-s /bin/bash' - - name: Create SuspendProgram # TODO: FIXME: add to slurm-tools - template: - src: suspend.py.j2 - dest: /opt/slurm-tools/bin/suspend - mode: u=rwx,g=rx,o=rx # is OK as clouds.yaml is protected - tags: suspend - when: "'autoscale' in group_names" - - name: Create ResumeProgram # TODO: FIXME: add to slurm-tools - template: - src: resume.py.j2 - dest: /opt/slurm-tools/bin/resume - mode: u=rwx,g=rx,o=rx # is OK as clouds.yaml is protected - tags: resume + become_user: slurm # TODO: check this works for rebuild too + become_flags: '-s /bin/bash' # as has shell specified as /sbin/nologin + - name: Configure autoscale programs and parameters + include_role: + name: autoscale when: "'autoscale' in group_names" + # TODO: rebuild - name: Setup slurm hosts: openhpc diff --git a/ansible/validate.yml b/ansible/validate.yml index 0c0ba8f38..805f66164 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -2,6 +2,15 @@ # Fail early if configuration is invalid +- name: Validate autoscale configuration + hosts: autoscale + tags: autoscale + tasks: + - import_role: + name: autoscale + tasks_from: validate.yml + tags: validate + - name: Validate podman configuration hosts: podman tags: podman diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml index d2ed22fa3..1180bf9e5 100755 --- a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml +++ b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml @@ -14,14 +14,13 @@ openhpc_slurm_partitions: - name: small default: no cloud_nodes: '-[2-3]' - cloud_features: "{{ general_v1_small }}" + cloud_instances: "{{ general_v1_small }}" - name: burst default: yes - groups: - - name: medium - cloud_nodes: '[4-6]' - cloud_features: "{{ general_v1_medium }}" - ram_mb: "{{ (15258 * 0.95) | int }}" - sockets_per_board: 4 - + cloud_nodes: '-[1-4]' + cloud_instances: "{{ general_v1_medium }}" + ram_mb: "{{ (15258 * 0.95) | int }}" + sockets: 1 + cores_per_socket: 4 + threads_per_core: 1 From 0bde5fcb16377b5250587309f584922b62a41f74 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 30 Sep 2021 14:57:51 +0000 Subject: [PATCH 057/105] set autoscale defaults with merged options --- .../inventory/group_vars/all/autoscale.yml | 52 +++++++++++++------ .../inventory/group_vars/all/openhpc.yml | 9 ++-- .../inventory/group_vars/all/rebuild.yml | 5 +- 3 files changed, 44 insertions(+), 22 deletions(-) diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index 26edefea1..328938a97 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -1,18 +1,38 @@ -autoscale_openhpc_suspend_exc_nodes: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. all nodes in inventory, i.e. not in State=CLOUD initially -autoscale_openhpc_config: - # required parameters: - SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns - CommunicationParameters: NoAddrCache +# recommended: +autoscale_suspend_exc_nodes: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. 
all non-CLOUD nodes, and prevent login-only slurmd nodes getting powered down +autoscale_private_data: # PrivateData + - cloud # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud + +# for debugging, may want to amend in production: +autoscale_debug_flags: + - PowerSave # https://slurm.schedmd.com/slurm.conf.html#OPT_Power +autoscale_slurmctld_syslog_debug: info # https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldSyslogDebug + +# likely to need tuning: +autoscale_suspend_time: 120 # https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime +autoscale_suspend_timeout: 30 # https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout +autoscale_resume_timeout: 300 # https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout +autoscale_power_save_interval: 10 # https://slurm.schedmd.com/slurm.conf.html#OPT_power_save_interval +autoscale_power_save_min_interval: 0 # https://slurm.schedmd.com/slurm.conf.html#OPT_power_save_min_intervals + +openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: fix name here? + +_autoscale_openhpc_config: SuspendProgram: /opt/slurm-tools/bin/suspend ResumeProgram: /opt/slurm-tools/bin/resume - # recommended: - PrivateData: cloud # shows cloud node state - # TODO: for testing only, not production: - DebugFlags: PowerSave - SlurmctldSyslogDebug: info - SuspendTime: 120 - SuspendTimeout: 30 - ResumeTimeout: 300 - # FIXME: maybe need to set TreeWidth to >= number of nodes (default: 50) - # power_save_*interval: options are defaults but should enable changes -openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: fix name here? + SlurmctldParameters: + - idle_on_node_suspend # https://slurm.schedmd.com/slurm.conf.html#OPT_idle_on_node_suspend + - cloud_dns # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud_dns + # - "power_save_interval={{ autoscale_power_save_interval}}" # seems to break if you set this + # - "power_save_min_interval={{ autoscale_power_save_min_interval }}" + CommunicationParameters: + - NoAddrCache # https://slurm.schedmd.com/slurm.conf.html#OPT_NoAddrCache + PrivateData: "{{ autoscale_private_data }}" + DebugFlags: "{{ autoscale_debug_flags }}" + SlurmctldSyslogDebug: "{{ autoscale_slurmctld_syslog_debug }}" + SuspendTime: "{{ autoscale_suspend_time }}" + SuspendTimeout: "{{ autoscale_suspend_timeout }}" + ResumeTimeout: "{{ autoscale_resume_timeout }}" +# See also TreeWidth but shouldn't needs setting with cloud_dns + +autoscale_openhpc_config: "{{ _autoscale_openhpc_config if groups.get('autoscale', []) else {} }}" diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index fc540fd4e..18b819808 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -28,10 +28,11 @@ openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}" openhpc_slurm_configless: true openhpc_login_only_nodes: login -openhpc_config_default: "{{ rebuild_openhpc_config | combine(autoscale_openhpc_config) }}" # TODO: handle case where groups aren't defined! 
+openhpc_config_default: + SlurmctldParameters: + - enable_configless # required as we might override SlurmctldParameters elsewhere openhpc_config_extra: {} -openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra) }}" +openhpc_config: "{{ openhpc_config_default | combine(rebuild_openhpc_config, autoscale_openhpc_config, openhpc_config_extra, list_merge='append') }}" -openhpc_suspend_exc_nodes_default: "{{ autoscale_openhpc_suspend_exc_nodes }}" openhpc_suspend_exc_nodes_extra: [] -openhpc_suspend_exc_nodes: "{{ openhpc_suspend_exc_nodes_default + openhpc_suspend_exc_nodes_extra }}" +openhpc_suspend_exc_nodes: "{{ autoscale_suspend_exc_nodes + openhpc_suspend_exc_nodes_extra }}" diff --git a/environments/common/inventory/group_vars/all/rebuild.yml b/environments/common/inventory/group_vars/all/rebuild.yml index b1162ffc3..ba631a07c 100644 --- a/environments/common/inventory/group_vars/all/rebuild.yml +++ b/environments/common/inventory/group_vars/all/rebuild.yml @@ -1,3 +1,4 @@ -rebuild_openhpc_config: +_rebuild_openhpc_config: RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild -openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml \ No newline at end of file +rebuild_openhpc_config: "{{ _rebuild_openhpc_config if groups.get('rebuild', []) else {} }}" +openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: rename? From 37a1070b693ce69585f82ca65f0245ec420353e2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 30 Sep 2021 16:15:42 +0000 Subject: [PATCH 058/105] enable rebuild from controller --- ansible/.gitignore | 4 +- ansible/roles/rebuild/README.md | 38 +++++ ansible/roles/rebuild/defaults/main.yml | 2 + ansible/roles/rebuild/tasks/main.yml | 9 ++ ansible/roles/rebuild/templates/rebuild.py.j2 | 147 ++++++++++++++++++ ansible/slurm.yml | 13 +- .../inventory/group_vars/all/rebuild.yml | 4 +- environments/smslabs/inventory/groups | 3 +- 8 files changed, 212 insertions(+), 8 deletions(-) create mode 100644 ansible/roles/rebuild/README.md create mode 100644 ansible/roles/rebuild/defaults/main.yml create mode 100644 ansible/roles/rebuild/tasks/main.yml create mode 100644 ansible/roles/rebuild/templates/rebuild.py.j2 diff --git a/ansible/.gitignore b/ansible/.gitignore index fd78abade..0ccc6a74f 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -23,4 +23,6 @@ roles/* !roles/basic_users/ !roles/basic_users/** !roles/autoscale/ -!roles/autoscale/** \ No newline at end of file +!roles/autoscale/** +!roles/rebuild/ +!roles/rebuild/** diff --git a/ansible/roles/rebuild/README.md b/ansible/roles/rebuild/README.md new file mode 100644 index 000000000..225dd44b9 --- /dev/null +++ b/ansible/roles/rebuild/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. + +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. 
+ +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/ansible/roles/rebuild/defaults/main.yml b/ansible/roles/rebuild/defaults/main.yml new file mode 100644 index 000000000..0a0383df4 --- /dev/null +++ b/ansible/roles/rebuild/defaults/main.yml @@ -0,0 +1,2 @@ +--- +# defaults file for rebuild diff --git a/ansible/roles/rebuild/tasks/main.yml b/ansible/roles/rebuild/tasks/main.yml new file mode 100644 index 000000000..68acdd606 --- /dev/null +++ b/ansible/roles/rebuild/tasks/main.yml @@ -0,0 +1,9 @@ +--- +- name: Create RebootProgram # TODO: FIXME: add to slurm-tools + template: + src: rebuild.py.j2 + dest: /opt/slurm-tools/bin/rebuild + mode: u=rx,go= + owner: slurm + group: slurm + tags: resume diff --git a/ansible/roles/rebuild/templates/rebuild.py.j2 b/ansible/roles/rebuild/templates/rebuild.py.j2 new file mode 100644 index 000000000..e080d763e --- /dev/null +++ b/ansible/roles/rebuild/templates/rebuild.py.j2 @@ -0,0 +1,147 @@ +#!/opt/slurm-tools/bin/python3 +# -*- coding: utf-8 -*- + +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import logging.handlers +import os +from os import path +import sys +import socket +import subprocess + +import openstack +import pbr.version + +__version__ = pbr.version.VersionInfo("slurm-openstack-tools").version_string() + +MAX_REASON_LENGTH = 1000 + +# configure logging to syslog - by default only "info" +# and above categories appear +logger = logging.getLogger("syslogger") +logger.setLevel(logging.DEBUG) +handler = logging.handlers.SysLogHandler("/dev/log") +handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) +logger.addHandler(handler) + +def get_statesavelocation(): + """ Return the path for Slurm's StateSaveLocation """ + scontrol = subprocess.run(['scontrol', 'show', 'config'], stdout=subprocess.PIPE, universal_newlines=True) + for line in scontrol.stdout.splitlines(): + if line.startswith('StateSaveLocation'): # StateSaveLocation = /var/spool/slurm + return line.split()[-1] + +def get_openstack_server_id(node): + + statedir = get_statesavelocation() + instance_file = os.path.join(statedir, node) + try: + with open(instance_file) as f: + instance_id = f.readline().strip() + return instance_id + except FileNotFoundError: + logger.error(f"no instance file found in {statedir} for node {node}") + return None + +def get_sinfo_path(): + # TODO(johngarbutt): get this from environment or config file? 
+ sinfo_alt_path = "/usr/local/software/slurm/current/bin/sinfo" + if path.exists(sinfo_alt_path): + return sinfo_alt_path + return "sinfo" + + +def get_reboot_reason(node): + sinfo_path = get_sinfo_path() + # see why we're being rebooted: + sinfo = subprocess.run( + [ + sinfo_path, + "--noheader", + "--nodes=%s" % node, + "-O", + "Reason:%i" % MAX_REASON_LENGTH, + ], + stdout=subprocess.PIPE, + universal_newlines=True, + ) + return sinfo.stdout.strip() + + +def get_image_from_reason(reason): + tokens = reason.split() + image = None + if len(tokens) > 1: + image_tokens = tokens[1].split(":") + if len(image_tokens) == 2 and image_tokens[0] == "image": + if image_tokens[1]: + image = image_tokens[1] + logger.info(f"user requested image: {image}") + return image + + +def rebuild_openstack_server(server_id, reason): + # Validate server_id + conn = openstack.connection.from_config() + try: + server = conn.get_server(server_id) + except openstack.exceptions.ResourceNotFound: + logger.error(f"server id {server_id} is not valid") + return None + + image_name_or_uuid = get_image_from_reason(reason) + if not image_name_or_uuid: + image_name_or_uuid = server.image.id + logger.info(f"couldn't parse image from reason '{reason}', falling back to existing image: {image_name_or_uuid}") + + image = conn.image.find_image(image_name_or_uuid) # doesn't throw exception + if image is None: + logger.error(f"image {image_name_or_uuid} either not found or not unique") + return None + + # Note that OpenStack will power down the server as part of the rebuild + logger.info(f"rebuilding server {server_id} with image {image.id}") + conn.rebuild_server(server_id, image.id) + +def reboot_openstack_server(server_id): + conn = openstack.connection.from_config() + server = conn.get_server(server_id) + logger.info(f"rebooting server %{server_id} with image %{image_uuid}") + conn.reboot_server(server_id, 'SOFT') + +def expand_nodes(hostlist_expr): + scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) + return scontrol.stdout.strip().split('\n') + +def rebuild_or_reboot(): + """ Rebuild or reboot an OpenStack node from the controller. """ + + hostlist_expr = sys.argv[1] + logger.info(f"Slurmctld invoked RebootProgram {hostlist_expr}") + for node in expand_nodes(hostlist_expr): + server_uuid = get_openstack_server_id(node) + if not server_uuid: + continue # can just try next one (but really should now exit > 0 even if others succeed) + reason = get_reboot_reason(node) + if not reason.startswith("rebuild"): + reboot_openstack_server(server_uuid) # TODO: support selecting soft or hard reboot via reason? + else: + rebuild_openstack_server(server_uuid, reason) + +if __name__ == "__main__": + try: + rebuild_or_reboot() + except: + logger.exception('Exception in main:') + raise \ No newline at end of file diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 174a3cf55..8d825a64b 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -35,19 +35,22 @@ src: "{{ openhpc_rebuild_clouds }}" # TODO: name is wrong really! dest: /etc/openstack/clouds.yaml mode: u=r,go= - owner: slurm # TODO: check this works for rebuild too + owner: slurm group: slurm - name: Setup slurm tools include_role: - name: stackhpc.slurm_openstack_tools.pytools # TODO: could just move this into the repo? 
- vars: # TODO: debug - become_user: slurm # TODO: check this works for rebuild too + name: stackhpc.slurm_openstack_tools.pytools + vars: + become_user: slurm become_flags: '-s /bin/bash' # as has shell specified as /sbin/nologin - name: Configure autoscale programs and parameters include_role: name: autoscale when: "'autoscale' in group_names" - # TODO: rebuild + - name: Configure rebuild programs and parameters + include_role: + name: rebuild + when: "'rebuild' in group_names" - name: Setup slurm hosts: openhpc diff --git a/environments/common/inventory/group_vars/all/rebuild.yml b/environments/common/inventory/group_vars/all/rebuild.yml index ba631a07c..57e30ad5f 100644 --- a/environments/common/inventory/group_vars/all/rebuild.yml +++ b/environments/common/inventory/group_vars/all/rebuild.yml @@ -1,4 +1,6 @@ _rebuild_openhpc_config: - RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild + RebootProgram: /opt/slurm-tools/bin/rebuild + SlurmctldParameters: + - reboot_from_controller rebuild_openhpc_config: "{{ _rebuild_openhpc_config if groups.get('rebuild', []) else {} }}" openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: rename? diff --git a/environments/smslabs/inventory/groups b/environments/smslabs/inventory/groups index acf3ca6bc..1f4e97615 100644 --- a/environments/smslabs/inventory/groups +++ b/environments/smslabs/inventory/groups @@ -34,7 +34,8 @@ control [filebeat:children] slurm_stats -# NB: [rebuild] not defined here as this template is used in CI, which does not run in openstack +[rebuild:children] +control [update:children] cluster From 138de0a52d6e2d51c23313ad800fa18d076ce4db Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 1 Oct 2021 08:47:52 +0000 Subject: [PATCH 059/105] make suspend less picky about instance ID file format --- ansible/roles/autoscale/templates/suspend.py.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/autoscale/templates/suspend.py.j2 b/ansible/roles/autoscale/templates/suspend.py.j2 index e296604ab..d52bd4d03 100644 --- a/ansible/roles/autoscale/templates/suspend.py.j2 +++ b/ansible/roles/autoscale/templates/suspend.py.j2 @@ -41,7 +41,7 @@ def suspend(): instance_file = os.path.join(statedir, node) try: with open(instance_file) as f: - instance_id = f.read() + instance_id = f.readline().strip() except FileNotFoundError: logger.info(f"no instance file found in {statedir} for node {node}") From dee0807ce15255b30c3f9e6cf801750f5b2ffd5e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 1 Oct 2021 14:16:12 +0000 Subject: [PATCH 060/105] use existing compute-based rebuild --- ansible/slurm.yml | 6 +----- environments/common/inventory/group_vars/all/rebuild.yml | 4 +--- environments/smslabs/inventory/groups | 1 + 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 8d825a64b..2a73c3a84 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -37,7 +37,7 @@ mode: u=r,go= owner: slurm group: slurm - - name: Setup slurm tools + - name: Setup slurm tools # this installs RebootProgram for rebuild too include_role: name: stackhpc.slurm_openstack_tools.pytools vars: @@ -47,10 +47,6 @@ include_role: name: autoscale when: "'autoscale' in group_names" - - name: Configure rebuild programs and parameters - include_role: - name: rebuild - when: "'rebuild' in group_names" - name: Setup slurm hosts: openhpc diff --git a/environments/common/inventory/group_vars/all/rebuild.yml b/environments/common/inventory/group_vars/all/rebuild.yml index 
57e30ad5f..ba631a07c 100644 --- a/environments/common/inventory/group_vars/all/rebuild.yml +++ b/environments/common/inventory/group_vars/all/rebuild.yml @@ -1,6 +1,4 @@ _rebuild_openhpc_config: - RebootProgram: /opt/slurm-tools/bin/rebuild - SlurmctldParameters: - - reboot_from_controller + RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild rebuild_openhpc_config: "{{ _rebuild_openhpc_config if groups.get('rebuild', []) else {} }}" openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: rename? diff --git a/environments/smslabs/inventory/groups b/environments/smslabs/inventory/groups index 1f4e97615..6fde43dfa 100644 --- a/environments/smslabs/inventory/groups +++ b/environments/smslabs/inventory/groups @@ -36,6 +36,7 @@ slurm_stats [rebuild:children] control +compute [update:children] cluster From 993d413a35cb46bcc5e8082267de35633c36eb5d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 1 Oct 2021 19:53:22 +0000 Subject: [PATCH 061/105] move suspend/resume program into slurm_openstack_tools --- .../roles/autoscale/templates/resume.py.j2 | 127 ------------------ .../roles/autoscale/templates/suspend.py.j2 | 56 -------- .../inventory/group_vars/all/autoscale.yml | 4 +- .../inventory/group_vars/rebuild/override.yml | 1 + 4 files changed, 3 insertions(+), 185 deletions(-) delete mode 100644 ansible/roles/autoscale/templates/resume.py.j2 delete mode 100644 ansible/roles/autoscale/templates/suspend.py.j2 create mode 100644 environments/smslabs/inventory/group_vars/rebuild/override.yml diff --git a/ansible/roles/autoscale/templates/resume.py.j2 b/ansible/roles/autoscale/templates/resume.py.j2 deleted file mode 100644 index c17890608..000000000 --- a/ansible/roles/autoscale/templates/resume.py.j2 +++ /dev/null @@ -1,127 +0,0 @@ -#!/opt/slurm-tools/bin/python3 -""" A Slurm ResumeProgram to create OpenStack instances. - - Usage: - - resume HOSTLIST_EXPRESSION [debug] - - where: - HOSTLIST_EXPRESSION: Name(s) of node(s) to create, using Slurm's hostlist expression, as per [1]. - debug: Any 2nd argument puts this in debug mode which is more verbose but does not actually create nodes. - - Output and exceptions are written to the syslog. - - The flavor, image, network and keypair to be used must be defined as node Features [2] in the format "parameter=value". - - OpenStack credentials must be available to this script (e.g. 
via an application credential in /etc/openstack/clouds.yaml readable by the slurm user) - - [1]: https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeProgram - [2]: https://slurm.schedmd.com/slurm.conf.html#OPT_Features -""" - -import sys, os, subprocess, logging.handlers -import openstack -import pprint - -REQUIRED_PARAMS = ('image', 'flavor', 'keypair', 'network') - -# configure logging to syslog - by default only "info" and above categories appear -logger = logging.getLogger("syslogger") -logger.setLevel(logging.DEBUG) -handler = logging.handlers.SysLogHandler("/dev/log") -handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) -logger.addHandler(handler) - -def get_statesavelocation(): - """ Return the path for Slurm's StateSaveLocation """ - scontrol = subprocess.run(['scontrol', 'show', 'config'], stdout=subprocess.PIPE, universal_newlines=True) - for line in scontrol.stdout.splitlines(): - if line.startswith('StateSaveLocation'): # StateSaveLocation = /var/spool/slurm - return line.split()[-1] - -def expand_nodes(hostlist_expr): - scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) - return scontrol.stdout.strip().split('\n') - -def get_features(nodenames): - """ Retrieve the features specified for given node(s). - - Returns a dict with a key/value pair for each node. Keys are node names, values are lists of strings, one string per feature. - """ - - scontrol = subprocess.run(['scontrol', 'show', 'node', nodenames], stdout=subprocess.PIPE, universal_newlines=True) - features = {} - for line in scontrol.stdout.splitlines(): - line = line.strip() - if line.startswith('NodeName'): # NodeName=dev-small-cloud-1 CoresPerSocket=1 - node = line.split()[0].split('=')[1] - if line.startswith('AvailableFeatures'): - feature_args = line.split('=', 1)[1] - features[node] = feature_args.split(',') - - return features - -def create_server(conn, name, image, flavor, network, keypair): - - server = conn.compute.create_server( - name=name, image_id=image.id, flavor_id=flavor.id, - networks=[{"uuid": network.id}], key_name=keypair.name, - ) - #server = conn.compute.wait_for_server(...) 
- - return server - -def resume(): - debug = False - if len(sys.argv) > 2: - logger.info(f"Running in debug mode - won't actually create nodes") - debug = True - hostlist_expr = sys.argv[1] - logger.info(f"Slurmctld invoked resume {hostlist_expr}") - new_nodes = expand_nodes(hostlist_expr) - - conn = openstack.connection.from_config() - logger.info(f"Got openstack connection {conn}") - - features = get_features(hostlist_expr) - logger.info(f"Read feature information from slurm") - - statedir = get_statesavelocation() - - for node in new_nodes: - # extract the openstack parameters from node features: - if node not in features: - logger.error(f"No Feature definitions found for node {node}: {features}") - os_parameters = dict(feature.split('=') for feature in features[node]) - if debug: - logger.info(f"os_parameters for {node}: {os_parameters}") - missing = set(REQUIRED_PARAMS).difference(os_parameters.keys()) - if missing: - logger.error(f"Missing {','.join(missing)} from feature definition for node {node}: {os_parameters}") - - # get openstack objects: - os_objects = { - 'image': conn.compute.find_image(os_parameters['image']), - 'flavor': conn.compute.find_flavor(os_parameters['flavor']), - 'network': conn.network.find_network(os_parameters['network']), - 'keypair': conn.compute.find_keypair(os_parameters['keypair']), - } - not_found = dict((k, v) for (k, v) in os_objects.items() if v is None) - if not_found: - raise ValueError('Could not find openstack objects for: %s' % ', '.join(not_found)) - if debug: - logger.info(f"os_objects for {node} : {os_objects}") - if not debug: - logger.info(f"creating node {node}") - server = create_server(conn, node, **os_objects) # TODO: save id to disk so can use it instead of name on deletion (to cope with multiple instances with same name) - logger.info(f"server: {server}") - with open(os.path.join(statedir, node), 'w') as f: - f.write(server.id) - # Don't need scontrol update nodename={node} nodeaddr={server_ip} as using SlurmctldParameters=cloud_dns - -if __name__ == "__main__": - try: - resume() - except: - logger.exception('Exception in main:') - raise diff --git a/ansible/roles/autoscale/templates/suspend.py.j2 b/ansible/roles/autoscale/templates/suspend.py.j2 deleted file mode 100644 index d52bd4d03..000000000 --- a/ansible/roles/autoscale/templates/suspend.py.j2 +++ /dev/null @@ -1,56 +0,0 @@ -#!/opt/slurm-tools/bin/python3 -""" Delete openstack instances """ - -import sys, os, subprocess, logging, logging.handlers -import openstack -import pprint - -# configure logging to syslog - by default only "info" and above categories appear -logger = logging.getLogger("syslogger") -logger.setLevel(logging.DEBUG) -handler = logging.handlers.SysLogHandler("/dev/log") -handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) -logger.addHandler(handler) - -def get_statesavelocation(): - """ Return the path for Slurm's StateSaveLocation """ - scontrol = subprocess.run(['scontrol', 'show', 'config'], stdout=subprocess.PIPE, universal_newlines=True) - for line in scontrol.stdout.splitlines(): - if line.startswith('StateSaveLocation'): # StateSaveLocation = /var/spool/slurm - return line.split()[-1] - -def expand_nodes(hostlist_expr): - scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) - return scontrol.stdout.strip().split('\n') - -def delete_server(conn, name): - server = conn.compute.find_server(name) - conn.compute.delete_server(server) - -def suspend(): - hostlist_expr = 
sys.argv[1] - logger.info(f"Slurmctld invoked suspend {hostlist_expr}") - remove_nodes = expand_nodes(hostlist_expr) - - conn = openstack.connection.from_config() - logger.info(f"Got openstack connection {conn}") - - for node in remove_nodes: - instance_id = False - statedir = get_statesavelocation() - instance_file = os.path.join(statedir, node) - try: - with open(instance_file) as f: - instance_id = f.readline().strip() - except FileNotFoundError: - logger.info(f"no instance file found in {statedir} for node {node}") - - logger.info(f"deleting node {instance_id or node}") - delete_server(conn, (instance_id or node)) - -if __name__ == "__main__": - try: - suspend() - except: - logger.exception('Exception in main:') - raise diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index 328938a97..cc27f9b26 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -18,8 +18,8 @@ autoscale_power_save_min_interval: 0 # https://slurm.schedmd.com/slurm.conf.html openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: fix name here? _autoscale_openhpc_config: - SuspendProgram: /opt/slurm-tools/bin/suspend - ResumeProgram: /opt/slurm-tools/bin/resume + SuspendProgram: /opt/slurm-tools/bin/slurm-openstack-suspend + ResumeProgram: /opt/slurm-tools/bin/slurm-openstack-resume SlurmctldParameters: - idle_on_node_suspend # https://slurm.schedmd.com/slurm.conf.html#OPT_idle_on_node_suspend - cloud_dns # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud_dns diff --git a/environments/smslabs/inventory/group_vars/rebuild/override.yml b/environments/smslabs/inventory/group_vars/rebuild/override.yml new file mode 100644 index 000000000..178ab7848 --- /dev/null +++ b/environments/smslabs/inventory/group_vars/rebuild/override.yml @@ -0,0 +1 @@ +pytools_gitref: feature/autoscale From 53e27fdb955b7c144381e4be79ef0023065020a8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 1 Oct 2021 20:11:39 +0000 Subject: [PATCH 062/105] use autoscale defaults in role via set_fact --- ansible/roles/autoscale/defaults/main.yml | 38 +++++++++++++++++- ansible/roles/autoscale/tasks/main.yml | 17 ++------ .../inventory/group_vars/all/autoscale.yml | 40 ++----------------- .../inventory/group_vars/all/openhpc.yml | 6 +-- 4 files changed, 44 insertions(+), 57 deletions(-) diff --git a/ansible/roles/autoscale/defaults/main.yml b/ansible/roles/autoscale/defaults/main.yml index e85b2db21..73cd8792f 100644 --- a/ansible/roles/autoscale/defaults/main.yml +++ b/ansible/roles/autoscale/defaults/main.yml @@ -1,2 +1,36 @@ ---- -# defaults file for autoscale +# recommended: +autoscale_private_data: # PrivateData + - cloud # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud + +# useful for debugging, may want to amend in production: +autoscale_debug_flags: + - PowerSave # https://slurm.schedmd.com/slurm.conf.html#OPT_Power +autoscale_slurmctld_syslog_debug: info # https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldSyslogDebug + +# likely to need tuning: +autoscale_suspend_time: 120 # https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime +autoscale_suspend_timeout: 30 # https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout +autoscale_resume_timeout: 300 # https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout +# autoscale_power_save_interval: 10 # https://slurm.schedmd.com/slurm.conf.html#OPT_power_save_interval +# autoscale_power_save_min_interval: 0 
# https://slurm.schedmd.com/slurm.conf.html#OPT_power_save_min_intervals + +# likely to need defining: +autoscale_suspend_exc_nodes: [] + +autoscale_openhpc_config: + SuspendProgram: /opt/slurm-tools/bin/slurm-openstack-suspend + ResumeProgram: /opt/slurm-tools/bin/slurm-openstack-resume + SlurmctldParameters: + - idle_on_node_suspend # https://slurm.schedmd.com/slurm.conf.html#OPT_idle_on_node_suspend + - cloud_dns # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud_dns + # - "power_save_interval={{ autoscale_power_save_interval}}" # seems to break if you set this + # - "power_save_min_interval={{ autoscale_power_save_min_interval }}" + CommunicationParameters: + - NoAddrCache # https://slurm.schedmd.com/slurm.conf.html#OPT_NoAddrCache + PrivateData: "{{ autoscale_private_data }}" + DebugFlags: "{{ autoscale_debug_flags }}" + SlurmctldSyslogDebug: "{{ autoscale_slurmctld_syslog_debug }}" + SuspendTime: "{{ autoscale_suspend_time }}" + SuspendTimeout: "{{ autoscale_suspend_timeout }}" + ResumeTimeout: "{{ autoscale_resume_timeout }}" +# See also TreeWidth but shouldn't needs setting with cloud_dns diff --git a/ansible/roles/autoscale/tasks/main.yml b/ansible/roles/autoscale/tasks/main.yml index 86ff4c438..1ad1bbefc 100644 --- a/ansible/roles/autoscale/tasks/main.yml +++ b/ansible/roles/autoscale/tasks/main.yml @@ -1,19 +1,8 @@ --- -- name: Create SuspendProgram - template: - src: suspend.py.j2 - dest: /opt/slurm-tools/bin/suspend - mode: u=rx,go= - tags: suspend - when: "'autoscale' in group_names" -- name: Create ResumeProgram # TODO: FIXME: add to slurm-tools - template: - src: resume.py.j2 - dest: /opt/slurm-tools/bin/resume - mode: u=rx,go= - #was: mode: u=rwx,g=rx,o=rx # is OK as clouds.yaml is protected - tags: resume - name: Modify openhpc_slurm_partitions set_fact: openhpc_slurm_partitions: "{{ openhpc_slurm_partitions | modify_autoscale_partitions }}" +- name: Merge autoscale configuration + set_fact: + openhpc_config: "{{ autoscale_openhpc_config | combine(openhpc_config, list_merge='append') }}" diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index cc27f9b26..14c3ef38a 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -1,38 +1,4 @@ -# recommended: -autoscale_suspend_exc_nodes: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. all non-CLOUD nodes, and prevent login-only slurmd nodes getting powered down -autoscale_private_data: # PrivateData - - cloud # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud - -# for debugging, may want to amend in production: -autoscale_debug_flags: - - PowerSave # https://slurm.schedmd.com/slurm.conf.html#OPT_Power -autoscale_slurmctld_syslog_debug: info # https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldSyslogDebug - -# likely to need tuning: -autoscale_suspend_time: 120 # https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime -autoscale_suspend_timeout: 30 # https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout -autoscale_resume_timeout: 300 # https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout -autoscale_power_save_interval: 10 # https://slurm.schedmd.com/slurm.conf.html#OPT_power_save_interval -autoscale_power_save_min_interval: 0 # https://slurm.schedmd.com/slurm.conf.html#OPT_power_save_min_intervals +openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: rename? 
-openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: fix name here? - -_autoscale_openhpc_config: - SuspendProgram: /opt/slurm-tools/bin/slurm-openstack-suspend - ResumeProgram: /opt/slurm-tools/bin/slurm-openstack-resume - SlurmctldParameters: - - idle_on_node_suspend # https://slurm.schedmd.com/slurm.conf.html#OPT_idle_on_node_suspend - - cloud_dns # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud_dns - # - "power_save_interval={{ autoscale_power_save_interval}}" # seems to break if you set this - # - "power_save_min_interval={{ autoscale_power_save_min_interval }}" - CommunicationParameters: - - NoAddrCache # https://slurm.schedmd.com/slurm.conf.html#OPT_NoAddrCache - PrivateData: "{{ autoscale_private_data }}" - DebugFlags: "{{ autoscale_debug_flags }}" - SlurmctldSyslogDebug: "{{ autoscale_slurmctld_syslog_debug }}" - SuspendTime: "{{ autoscale_suspend_time }}" - SuspendTimeout: "{{ autoscale_suspend_timeout }}" - ResumeTimeout: "{{ autoscale_resume_timeout }}" -# See also TreeWidth but shouldn't needs setting with cloud_dns - -autoscale_openhpc_config: "{{ _autoscale_openhpc_config if groups.get('autoscale', []) else {} }}" +# TODO: should this get moved?? +autoscale_suspend_exc_nodes: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. all non-CLOUD nodes, and prevent login-only slurmd nodes getting powered down diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 18b819808..2139d140b 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -25,14 +25,12 @@ openhpc_packages_extra: [] openhpc_packages: "{{ openhpc_packages_default + openhpc_packages_extra }}" openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}" -openhpc_slurm_configless: true openhpc_login_only_nodes: login - openhpc_config_default: SlurmctldParameters: - - enable_configless # required as we might override SlurmctldParameters elsewhere + - enable_configless openhpc_config_extra: {} -openhpc_config: "{{ openhpc_config_default | combine(rebuild_openhpc_config, autoscale_openhpc_config, openhpc_config_extra, list_merge='append') }}" +openhpc_config: "{{ openhpc_config_default | combine(rebuild_openhpc_config, openhpc_config_extra, list_merge='append') }}" openhpc_suspend_exc_nodes_extra: [] openhpc_suspend_exc_nodes: "{{ autoscale_suspend_exc_nodes + openhpc_suspend_exc_nodes_extra }}" From 051649985ab6f0bf8b684749c559aacb828a7fbf Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 1 Oct 2021 20:36:00 +0000 Subject: [PATCH 063/105] improve autoscale vars/defaults/docs --- ansible/roles/autoscale/README.md | 15 +++++++++++++-- ansible/roles/autoscale/defaults/main.yml | 10 ++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/ansible/roles/autoscale/README.md b/ansible/roles/autoscale/README.md index 63c6c13ab..69e3ddeb3 100644 --- a/ansible/roles/autoscale/README.md +++ b/ansible/roles/autoscale/README.md @@ -11,7 +11,7 @@ NOTES TODO: - Describe groups. - Describe cpu/memory info requirements (inc. for mixed partitions) - Describe what happens on failure. - +- Note that DNS is REQUIRED for this. ## Requirements @@ -24,8 +24,19 @@ NOTES TODO: - `openhpc_slurm_partitions`: This role modifies what the partitions/groups defined [openhpc_slurm_partitions](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) in the by `stackhpc.openhpc` role accept: - `cloud_nodes`: Optional. 
As per the `stackhpc.openhpc` docs this defines nodes in a ["CLOUD" state](https://slurm.schedmd.com/slurm.conf.html#OPT_CLOUD), i.e. treated as powered down/not existing when the Slurm control daemon starts. The value is a suffix for the group/partition's node names in Slurm's hostlist expression format (e.g. `-[11-25]`) and therefore defines the number of CLOUD-state nodes. - `cloud_instances`: Required if `cloud_nodes` is defined. A dict defining the `flavor`, `image`, `keypair` and `network` to use for CLOUD-state instances in this partition/group. Values for these parameters may be either names (if unique in the cloud) or IDs. + + Some examples are given below. + +- `autoscale_show_suspended_nodes`: Optional, default `true`. Whether to show suspended/powered-down nodes in `sinfo` etc. See `slurm.conf` parameter [PrivateData - cloud](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_cloud). + +The following variables have defaults useful for debugging autoscaling, but may be altered for production: +- `autoscale_debug_powersaving`: Optional, default `true`. Log additional information for powersaving, see `slurm.conf` parameter [DebugFlags - PowerSave](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_PowerSave_2). +- `autoscale_slurmctld_syslog_debug`: Optional, default `info`. Syslog logging level. See `slurm.conf` parameter [SlurmctldSyslogDebug](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SlurmctldSyslogDebug). -Some examples are given below. +The following variables are likely to need tuning for the specific site/instances: +- `autoscale_suspend_time`: Optional, default 120s TODO https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime +- `autoscale_suspend_timeout`: Optional, default 30s TODO https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout +- `autoscale_resume_timeout`: Optional, default 300s TODO https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout ### Processor/memory information Non-CLOUD-state nodes in a group/partition are defined by the hosts in an inventory group named `_` as per `stackhpc.openhpc` [docs](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) and processor/memory information is automatically retrieved from them. 
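For illustration of the `cloud_nodes`/`cloud_instances` options described above, a minimal sketch of a partition entry is shown below. It uses the node-name-suffix form of `cloud_nodes` current at this point in the series (later commits switch to full hostlist expressions), and every value here (flavor, image, keypair, network) is a hypothetical placeholder rather than a value taken from the patches:

```yaml
openhpc_slurm_partitions:
  - name: small
    # suffix in Slurm hostlist-expression form: defines 15 CLOUD-state nodes
    # whose names end in -[11-25]
    cloud_nodes: '-[11-25]'
    cloud_instances:
      flavor: general.v1.small               # hypothetical flavor name or ID
      image: ohpc-compute-210909-1316.qcow2  # hypothetical image name or ID
      keypair: deploy-keypair                # hypothetical keypair name
      network: cluster-net                   # hypothetical network name or ID
```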
diff --git a/ansible/roles/autoscale/defaults/main.yml b/ansible/roles/autoscale/defaults/main.yml index 73cd8792f..d8a2b3bac 100644 --- a/ansible/roles/autoscale/defaults/main.yml +++ b/ansible/roles/autoscale/defaults/main.yml @@ -1,10 +1,8 @@ # recommended: -autoscale_private_data: # PrivateData - - cloud # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud +autoscale_show_suspended_nodes: true # useful for debugging, may want to amend in production: -autoscale_debug_flags: - - PowerSave # https://slurm.schedmd.com/slurm.conf.html#OPT_Power +autoscale_debug_powersaving: true autoscale_slurmctld_syslog_debug: info # https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldSyslogDebug # likely to need tuning: @@ -27,8 +25,8 @@ autoscale_openhpc_config: # - "power_save_min_interval={{ autoscale_power_save_min_interval }}" CommunicationParameters: - NoAddrCache # https://slurm.schedmd.com/slurm.conf.html#OPT_NoAddrCache - PrivateData: "{{ autoscale_private_data }}" - DebugFlags: "{{ autoscale_debug_flags }}" + PrivateData: "{{ ['cloud'] if autoscale_show_suspended_nodes else [] }}" + DebugFlags: "{{ ['PowerSave'] if autoscale_debug_powersaving else [] }}" # NB: Seems to have disappeared in latest Slurm SlurmctldSyslogDebug: "{{ autoscale_slurmctld_syslog_debug }}" SuspendTime: "{{ autoscale_suspend_time }}" SuspendTimeout: "{{ autoscale_suspend_timeout }}" From 04198d5966347bff2a60d7db4a2ba2e3aac4d071 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 5 Oct 2021 13:58:15 +0000 Subject: [PATCH 064/105] use set_fact merging on rebuild and fix venv deployment --- ansible/roles/rebuild/README.md | 33 ++-- ansible/roles/rebuild/defaults/main.yml | 4 +- ansible/roles/rebuild/tasks/main.yml | 12 +- ansible/roles/rebuild/templates/rebuild.py.j2 | 147 ------------------ ansible/slurm.yml | 8 +- .../inventory/group_vars/all/openhpc.yml | 5 +- .../inventory/group_vars/all/rebuild.yml | 3 - 7 files changed, 34 insertions(+), 178 deletions(-) delete mode 100644 ansible/roles/rebuild/templates/rebuild.py.j2 diff --git a/ansible/roles/rebuild/README.md b/ansible/roles/rebuild/README.md index 225dd44b9..f846bbeb3 100644 --- a/ansible/roles/rebuild/README.md +++ b/ansible/roles/rebuild/README.md @@ -1,38 +1,47 @@ -Role Name -========= +rebuild +======= -A brief description of the role goes here. +Enable the compute nodes to be reimaged from Slurm. To use this functionality add the `control` and `compute` groups to the `rebuild` group. + +Once `ansible/slurm.yml` has run, node(s) can be reimaged using: + + scontrol reboot [ASAP] [nextstate=] reason="rebuild image:" [] + +where: +- `` is the name (if unique) or ID of an image in OpenStack. +- `` is a Slurm hostlist expression defining the nodes to reimage. +- `ASAP` means the rebuild will happen as soon as existing jobs on the node(s) complete - no new jobs will be scheduled on it. +- If `nextstate=...` is not given nodes remain in DRAIN state after the rebuild. Requirements ------------ -Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. +- This role must be run before the `stackhpc.openhpc` role's `runtime.yml` playbook as it modifies the `openhpc_config` variable. +- OpenStack credentials on the compute nodes, e.g. at `/etc/openstack/clouds.yaml` which are readable by the root user. 
It is recommended these credentials are an [application credential](https://docs.openstack.org/keystone/latest/user/application_credentials.html). This can be created in Horizon via Identity > Application Credentials > +Create Application Credential. The usual role required is `member`. Using access rules has been found not to work at present. Note that the downloaded credential can be encrpyted using `ansible-vault` to allow commit to source control. It will automatically be decrypted when copied onto the compute nodes. +- An image which when booted adds that node to the Slurm cluster. E.g. see `packer/README.md`. Role Variables -------------- -A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. +None normally required. Dependencies ------------ -A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. +See above. Example Playbook ---------------- -Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: +See `ansible/slurm.yml` - - hosts: servers - roles: - - { role: username.rolename, x: 42 } License ------- -BSD +Apache v2 Author Information ------------------ -An optional section for the role authors to include contact information, or a website (HTML is not allowed). +StackHPC Ltd. diff --git a/ansible/roles/rebuild/defaults/main.yml b/ansible/roles/rebuild/defaults/main.yml index 0a0383df4..5e532ef24 100644 --- a/ansible/roles/rebuild/defaults/main.yml +++ b/ansible/roles/rebuild/defaults/main.yml @@ -1,2 +1,4 @@ --- -# defaults file for rebuild + +rebuild_openhpc_config: + RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild diff --git a/ansible/roles/rebuild/tasks/main.yml b/ansible/roles/rebuild/tasks/main.yml index 68acdd606..a191e807a 100644 --- a/ansible/roles/rebuild/tasks/main.yml +++ b/ansible/roles/rebuild/tasks/main.yml @@ -1,9 +1,5 @@ --- -- name: Create RebootProgram # TODO: FIXME: add to slurm-tools - template: - src: rebuild.py.j2 - dest: /opt/slurm-tools/bin/rebuild - mode: u=rx,go= - owner: slurm - group: slurm - tags: resume + +- name: Merge rebuild configuration + set_fact: + openhpc_config: "{{ rebuild_openhpc_config | combine(openhpc_config, list_merge='append') }}" diff --git a/ansible/roles/rebuild/templates/rebuild.py.j2 b/ansible/roles/rebuild/templates/rebuild.py.j2 deleted file mode 100644 index e080d763e..000000000 --- a/ansible/roles/rebuild/templates/rebuild.py.j2 +++ /dev/null @@ -1,147 +0,0 @@ -#!/opt/slurm-tools/bin/python3 -# -*- coding: utf-8 -*- - -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import logging.handlers -import os -from os import path -import sys -import socket -import subprocess - -import openstack -import pbr.version - -__version__ = pbr.version.VersionInfo("slurm-openstack-tools").version_string() - -MAX_REASON_LENGTH = 1000 - -# configure logging to syslog - by default only "info" -# and above categories appear -logger = logging.getLogger("syslogger") -logger.setLevel(logging.DEBUG) -handler = logging.handlers.SysLogHandler("/dev/log") -handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) -logger.addHandler(handler) - -def get_statesavelocation(): - """ Return the path for Slurm's StateSaveLocation """ - scontrol = subprocess.run(['scontrol', 'show', 'config'], stdout=subprocess.PIPE, universal_newlines=True) - for line in scontrol.stdout.splitlines(): - if line.startswith('StateSaveLocation'): # StateSaveLocation = /var/spool/slurm - return line.split()[-1] - -def get_openstack_server_id(node): - - statedir = get_statesavelocation() - instance_file = os.path.join(statedir, node) - try: - with open(instance_file) as f: - instance_id = f.readline().strip() - return instance_id - except FileNotFoundError: - logger.error(f"no instance file found in {statedir} for node {node}") - return None - -def get_sinfo_path(): - # TODO(johngarbutt): get this from environment or config file? - sinfo_alt_path = "/usr/local/software/slurm/current/bin/sinfo" - if path.exists(sinfo_alt_path): - return sinfo_alt_path - return "sinfo" - - -def get_reboot_reason(node): - sinfo_path = get_sinfo_path() - # see why we're being rebooted: - sinfo = subprocess.run( - [ - sinfo_path, - "--noheader", - "--nodes=%s" % node, - "-O", - "Reason:%i" % MAX_REASON_LENGTH, - ], - stdout=subprocess.PIPE, - universal_newlines=True, - ) - return sinfo.stdout.strip() - - -def get_image_from_reason(reason): - tokens = reason.split() - image = None - if len(tokens) > 1: - image_tokens = tokens[1].split(":") - if len(image_tokens) == 2 and image_tokens[0] == "image": - if image_tokens[1]: - image = image_tokens[1] - logger.info(f"user requested image: {image}") - return image - - -def rebuild_openstack_server(server_id, reason): - # Validate server_id - conn = openstack.connection.from_config() - try: - server = conn.get_server(server_id) - except openstack.exceptions.ResourceNotFound: - logger.error(f"server id {server_id} is not valid") - return None - - image_name_or_uuid = get_image_from_reason(reason) - if not image_name_or_uuid: - image_name_or_uuid = server.image.id - logger.info(f"couldn't parse image from reason '{reason}', falling back to existing image: {image_name_or_uuid}") - - image = conn.image.find_image(image_name_or_uuid) # doesn't throw exception - if image is None: - logger.error(f"image {image_name_or_uuid} either not found or not unique") - return None - - # Note that OpenStack will power down the server as part of the rebuild - logger.info(f"rebuilding server {server_id} with image {image.id}") - conn.rebuild_server(server_id, image.id) - -def reboot_openstack_server(server_id): - conn = openstack.connection.from_config() - server = conn.get_server(server_id) - logger.info(f"rebooting server %{server_id} with image %{image_uuid}") - conn.reboot_server(server_id, 'SOFT') - -def expand_nodes(hostlist_expr): - scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) - return scontrol.stdout.strip().split('\n') - -def rebuild_or_reboot(): - """ Rebuild or reboot an OpenStack node from the controller. 
""" - - hostlist_expr = sys.argv[1] - logger.info(f"Slurmctld invoked RebootProgram {hostlist_expr}") - for node in expand_nodes(hostlist_expr): - server_uuid = get_openstack_server_id(node) - if not server_uuid: - continue # can just try next one (but really should now exit > 0 even if others succeed) - reason = get_reboot_reason(node) - if not reason.startswith("rebuild"): - reboot_openstack_server(server_uuid) # TODO: support selecting soft or hard reboot via reason? - else: - rebuild_openstack_server(server_uuid, reason) - -if __name__ == "__main__": - try: - rebuild_or_reboot() - except: - logger.exception('Exception in main:') - raise \ No newline at end of file diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 2a73c3a84..f7c60543f 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -37,16 +37,16 @@ mode: u=r,go= owner: slurm group: slurm - - name: Setup slurm tools # this installs RebootProgram for rebuild too + - name: Setup Python/Slurm tools include_role: name: stackhpc.slurm_openstack_tools.pytools - vars: - become_user: slurm - become_flags: '-s /bin/bash' # as has shell specified as /sbin/nologin - name: Configure autoscale programs and parameters include_role: name: autoscale when: "'autoscale' in group_names" + - name: Configure rebuild programs and parameters + include_role: + name: rebuild - name: Setup slurm hosts: openhpc diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 2139d140b..d9b9eca49 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -27,10 +27,9 @@ openhpc_packages: "{{ openhpc_packages_default + openhpc_packages_extra }}" openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}" openhpc_login_only_nodes: login openhpc_config_default: - SlurmctldParameters: - - enable_configless + SlurmctldParameters: enable_configless openhpc_config_extra: {} -openhpc_config: "{{ openhpc_config_default | combine(rebuild_openhpc_config, openhpc_config_extra, list_merge='append') }}" +openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}" openhpc_suspend_exc_nodes_extra: [] openhpc_suspend_exc_nodes: "{{ autoscale_suspend_exc_nodes + openhpc_suspend_exc_nodes_extra }}" diff --git a/environments/common/inventory/group_vars/all/rebuild.yml b/environments/common/inventory/group_vars/all/rebuild.yml index ba631a07c..0be916def 100644 --- a/environments/common/inventory/group_vars/all/rebuild.yml +++ b/environments/common/inventory/group_vars/all/rebuild.yml @@ -1,4 +1 @@ -_rebuild_openhpc_config: - RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild -rebuild_openhpc_config: "{{ _rebuild_openhpc_config if groups.get('rebuild', []) else {} }}" openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: rename? 
From 60e74a89de141d3c6fbffb9c7085ad5b9a0f57f3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 5 Oct 2021 15:39:26 +0000 Subject: [PATCH 065/105] use openhpc role's extra_nodes feature --- .../autoscale/filter_plugins/openhpc_partitions.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py b/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py index d424e419b..1e9a778d1 100644 --- a/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py +++ b/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py @@ -28,15 +28,21 @@ def modify_autoscale_partitions(partitions): for part in partitions: for group in part.get('groups', [part]): group_name = group.get('name', '') + extra_nodes = group.get('extra_nodes', []) + if 'cloud_nodes' in group: if 'cloud_instances' not in group: raise errors.AnsibleFilterError(f"`openhpc_slurm_partitions` group '{group_name}' specifies 'cloud_nodes' but is missing 'cloud_instances'.") missing_attrs = ','.join(set(REQUIRED_INSTANCE_ATTRS).difference(group['cloud_instances'])) if missing_attrs: raise errors.AnsibleFilterError(f"`openhpc_slurm_partitions` group '{group_name}' item 'cloud_instances' is missing items: {missing_attrs}.") - if 'features' not in group: - group['features'] = [] - group['features'].extend(['%s=%s' % (k, v) for (k, v) in group['cloud_instances'].items()]) + cloud_names = group['cloud_nodes'] + # TODO: check for cloud nodes overlapping real ones? + + features = ['%s=%s' % (k, v) for (k, v) in group['cloud_instances'].items()] + cloud_nodes = f'NodeName={cloud_names} State=CLOUD Features={features}' + + extra_nodes.append(cloud_nodes) return partitions From 1ee10e9931c9478a48922d09d6d5495a647515fc Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Oct 2021 13:09:06 +0000 Subject: [PATCH 066/105] fix actually generataing cloud_node info --- .../filter_plugins/openhpc_partitions.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py b/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py index 1e9a778d1..46d648177 100644 --- a/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py +++ b/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py @@ -21,28 +21,32 @@ REQUIRED_INSTANCE_ATTRS=('flavor', 'image', 'keypair', 'network') -def modify_autoscale_partitions(partitions): +def modify_autoscale_partitions(partitions, flavors): """ TODO: docs """ for part in partitions: for group in part.get('groups', [part]): group_name = group.get('name', '') - extra_nodes = group.get('extra_nodes', []) if 'cloud_nodes' in group: if 'cloud_instances' not in group: raise errors.AnsibleFilterError(f"`openhpc_slurm_partitions` group '{group_name}' specifies 'cloud_nodes' but is missing 'cloud_instances'.") - missing_attrs = ','.join(set(REQUIRED_INSTANCE_ATTRS).difference(group['cloud_instances'])) + missing_attrs = ', '.join(set(REQUIRED_INSTANCE_ATTRS).difference(group['cloud_instances'])) if missing_attrs: raise errors.AnsibleFilterError(f"`openhpc_slurm_partitions` group '{group_name}' item 'cloud_instances' is missing items: {missing_attrs}.") cloud_names = group['cloud_nodes'] # TODO: check for cloud nodes overlapping real ones? 
features = ['%s=%s' % (k, v) for (k, v) in group['cloud_instances'].items()] - cloud_nodes = f'NodeName={cloud_names} State=CLOUD Features={features}' - - extra_nodes.append(cloud_nodes) + cloud_nodes = { + 'NodeName': cloud_names, + 'State':'CLOUD', + 'Features': ','.join(features), + } + + group['extra_nodes'] = group.get('extra_nodes', []) + group['extra_nodes'].append(cloud_nodes) return partitions From 8a956677af8867dd9c60f814cfff71766eb25530 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Oct 2021 13:36:05 +0000 Subject: [PATCH 067/105] retrieve cloud_node instance cpu/mem from openstack --- .../openhpc_partitions.cpython-36.pyc | Bin 1462 -> 0 bytes .../filter_plugins/openhpc_partitions.py | 19 +++++++++++++++--- ansible/roles/autoscale/tasks/main.yml | 16 ++++++++++++++- 3 files changed, 31 insertions(+), 4 deletions(-) delete mode 100644 ansible/roles/autoscale/filter_plugins/__pycache__/openhpc_partitions.cpython-36.pyc diff --git a/ansible/roles/autoscale/filter_plugins/__pycache__/openhpc_partitions.cpython-36.pyc b/ansible/roles/autoscale/filter_plugins/__pycache__/openhpc_partitions.cpython-36.pyc deleted file mode 100644 index 100545137c74508e603d7c79d3de87c9f5cc88b6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1462 zcmaJ>PjBNy6rUM8w(F#@D%G-E5E!IYh?XV@ZcC`P+7`4DWJTLmC5x5U^-QvM9owB5 zmo{=FMEU{P3m3F^xFAlP_#|`X#8>EvH%`+=fEa7uyz#uhzxQT7SzUEkUOo+fvl03e zEe#X+hcHDS2t^d9XpDV~HBVCFlX^9LgEoDW+SIu~e&Y*7SEze|sLQOG;oFq#qfYBD zuz)&PS&Z|Xi-EF+X?U7*)riO8h$;J+ofl!um7Ov9ZO)G)tTkXXU^*}#!4!W0Is3gr zJLuW3Q+)Pwg%b#?!XrE*@bAYdsR;Zb)R>W}LCuOBTYQ5yV72*|5+{UOmGKjz_A6tC zzsJ-`3~9>73@27aXyY!L;TfKqmC3(bl3uAy&~aaZ9OyJ_Yd?$-Xf*pZy^9i?w&?1G z-hq|GnL&)&7DguvQL=^%cUYp$V4xa2hdEn7J60QeBStFj8g-~V|j@hKyFEh%9 zamMJD-;C0{q(PQbCj8gun>Z6P%pxWx??2BAmc1yVK%^xf2SvzbEaN;A&%N5F*PeXZ z_C&#=co;L`wU^Aby`xfkvGB&R5OFqI9A2AmcQ5gWllyHimTdg*(`8jHAs%=Nca<|_ zp)5HQ*9A17X1a*`x`1y30pQTH^>4@&PoaYaa1 zQutWmQ{Do-np__CUgTrei&!Re(W?{R42xouveRCeiTEgGJ)Wmb^uiJ(q7e4OIF*bC zMOuy^z1{+~8@#&3`K8-WVYzf!G3Oq1#ze z#)wI-)z#YeJfhF*0CsinD>KRCOc{bn_2%C0*H0hs?d}AR2m1%xgGal;_QAp4zH-6k zkZ}fOhAVR>Sw`WeXp}n*<=(`tOwFrCU2WyoOQ74xU%e`85K76#4SxKVj_a-eN$QSH zeggENqHnS!Omf2Ly3iz_xkwXPW=ll5g!-7A3s zgx~};C5YspPuL zxdo)-DEq1x=Y3!$ah8N1E5aGSt)(=8%F2%t7RlPse|c0+e;(u38i)%QNL>wL66cM- E0Z#me!T') @@ -38,17 +43,25 @@ def modify_autoscale_partitions(partitions, flavors): cloud_names = group['cloud_nodes'] # TODO: check for cloud nodes overlapping real ones? 
+ flavor = [f for f in flavors if f['name'] == group['cloud_instances']['flavor']] + if len(flavor) != 1: + raise errors.AnsibleFilterError(f'expected one flavor matching {group["cloud_instances"]["flavor"]}, found {len(flavor)}: {flavor}') + flavor = flavor[0] + ram_mb = int(flavor['ram'] * group.get('ram_multiplier', openhpc_ram_multiplier)) # ram in flavor in MB, so no units conversion needed + features = ['%s=%s' % (k, v) for (k, v) in group['cloud_instances'].items()] cloud_nodes = { 'NodeName': cloud_names, 'State':'CLOUD', 'Features': ','.join(features), + 'CPUs': flavor['vcpus'], + 'RealMemory': group.get('ram_mb', ram_mb) } group['extra_nodes'] = group.get('extra_nodes', []) group['extra_nodes'].append(cloud_nodes) - return partitions + return openhpc_slurm_partitions class FilterModule(object): diff --git a/ansible/roles/autoscale/tasks/main.yml b/ansible/roles/autoscale/tasks/main.yml index 1ad1bbefc..cbaca6b13 100644 --- a/ansible/roles/autoscale/tasks/main.yml +++ b/ansible/roles/autoscale/tasks/main.yml @@ -1,8 +1,22 @@ --- +- name: Get cloud_node specs + shell: + cmd: "openstack flavor show --format json {{ item.cloud_instances.flavor }}" + delegate_to: localhost + run_once: true + loop: "{{ openhpc_slurm_partitions }}" + when: "'cloud_instances' in item" + register: _os_flavors + become: no +- name: Manipulate flavor information + set_fact: + flavor_info: "{{ _os_flavors.results | map(attribute='stdout') | map('from_json') }}" # list of json info +- debug: + var: flavor_info - name: Modify openhpc_slurm_partitions set_fact: - openhpc_slurm_partitions: "{{ openhpc_slurm_partitions | modify_autoscale_partitions }}" + openhpc_slurm_partitions: "{{ openhpc_slurm_partitions | modify_autoscale_partitions(flavor_info, openhpc_ram_multiplier) }}" - name: Merge autoscale configuration set_fact: openhpc_config: "{{ autoscale_openhpc_config | combine(openhpc_config, list_merge='append') }}" From a96e68c9de4e87dc10bd60aa64716cd9ff85adbe Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Oct 2021 14:04:51 +0000 Subject: [PATCH 068/105] WIP autoscale README --- ansible/roles/autoscale/README.md | 94 ++++++++++++++++--------------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/ansible/roles/autoscale/README.md b/ansible/roles/autoscale/README.md index 69e3ddeb3..aa4eed164 100644 --- a/ansible/roles/autoscale/README.md +++ b/ansible/roles/autoscale/README.md @@ -4,82 +4,86 @@ Support autoscaling nodes on OpenStack clouds, i.e. creating nodes when necessar This is implemented using Slurm's ["elastic computing"](https://slurm.schedmd.com/elastic_computing.html) features which are based on Slurm's [power saving](https://slurm.schedmd.com/power_save.html) features. - -NOTES TODO: -- Won't get monitoring for autoscaling nodes -- Describe autoscale vs `State=CLOUD` and powersaving enablement. -- Describe groups. -- Describe cpu/memory info requirements (inc. for mixed partitions) -- Describe what happens on failure. -- Note that DNS is REQUIRED for this. +Add the `control` group to the `autoscale` group to activate this functionality in the `ansible/slurm.yml` playbook. Note some role variables are likely to need configuring. By default, node creation and deletion will be logged in the control node's syslog. ## Requirements -- Role `stackhpc.slurm_openstack_tools.pytools`. Installs [slurm-openstack-tools](github.com/stackhpc/slurm-openstack-tools) which provides a venv with the `openstacksdk`. +- Working DNS. 
+- Active OpenStack credentials on localhost (e.g a sourced `openrc.sh` in the shell running ansible). +- Role `stackhpc.slurm_openstack_tools.pytools`. Installs [slurm-openstack-tools](github.com/stackhpc/slurm-openstack-tools) which provides a venv with the `openstacksdk` and the resume/suspend scripts. - Role `stackhpc.openhpc` to create a Slurm cluster. -- This role should be run on the Slurm controller only, i.e. add the `control` group to the `autoscale` group to activate this functionality. +- This role should be run on the Slurm controller only. ## Role Variables -- `openhpc_slurm_partitions`: This role modifies what the partitions/groups defined [openhpc_slurm_partitions](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) in the by `stackhpc.openhpc` role accept: - - `cloud_nodes`: Optional. As per the `stackhpc.openhpc` docs this defines nodes in a ["CLOUD" state](https://slurm.schedmd.com/slurm.conf.html#OPT_CLOUD), i.e. treated as powered down/not existing when the Slurm control daemon starts. The value is a suffix for the group/partition's node names in Slurm's hostlist expression format (e.g. `-[11-25]`) and therefore defines the number of CLOUD-state nodes. - - `cloud_instances`: Required if `cloud_nodes` is defined. A dict defining the `flavor`, `image`, `keypair` and `network` to use for CLOUD-state instances in this partition/group. Values for these parameters may be either names (if unique in the cloud) or IDs. - - Some examples are given below. - -- `autoscale_show_suspended_nodes`: Optional, default `true`. Whether to show suspended/powered-down nodes in `sinfo` etc. See `slurm.conf` parameter [PrivateData - cloud](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_cloud). - -The following variables have defaults useful for debugging autoscaling, but may be altered for production: -- `autoscale_debug_powersaving`: Optional, default `true`. Log additional information for powersaving, see `slurm.conf` parameter [DebugFlags - PowerSave](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_PowerSave_2). -- `autoscale_slurmctld_syslog_debug`: Optional, default `info`. Syslog logging level. See `slurm.conf` parameter [SlurmctldSyslogDebug](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SlurmctldSyslogDebug). +### openhpc_slurm_partitions +This role modifies what the [openhpc_slurm_partitions variable](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) in the `stackhpc.openhpc` role accepts. Partition/group definitions may additionally include: +- `cloud_nodes`: Optional. Slurm hostlist expression (e.g. `'small-[8,10-16]'`) defining names of nodes to be defined in a ["CLOUD" state](https://slurm.schedmd.com/slurm.conf.html#OPT_CLOUD), i.e. not operational when the Slurm control daemon starts. +- `cloud_instances`: Required if `cloud_nodes` is defined. A mapping with keys `flavor`, `image`, `keypair` and `network` defining the OpenStack ID or names of properties for the CLOUD-state instances. 
-The following variables are likely to need tuning for the specific site/instances: -- `autoscale_suspend_time`: Optional, default 120s TODO https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime -- `autoscale_suspend_timeout`: Optional, default 30s TODO https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout -- `autoscale_resume_timeout`: Optional, default 300s TODO https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout +Partitions/groups defining `cloud_nodes` may or may not also contain non-CLOUD state nodes (i.e. nodes in a matching inventory group). For CLOUD-state nodes, memory and CPU information is retrieved from OpenStack for the specified flavors. The `stackhpc.openhpc` group/partition options `ram_mb` and `ram_multiplier` and role variable `openhpc_ram_multiplier` are handled exactly as for non-CLOUD state nodes. This implies that if CLOUD and non-CLOUD state nodes are mixed in a single group all nodes must be homogenous in terms of processors/memory. -### Processor/memory information -Non-CLOUD-state nodes in a group/partition are defined by the hosts in an inventory group named `_` as per `stackhpc.openhpc` [docs](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) and processor/memory information is automatically retrieved from them. +Some examples are given below. Note that currently monitoring is not enabled for CLOUD-state nodes. -- If a group/partition contains both CLOUD and non-CLOUD nodes the processor/memory information for the CLOUD nodes is assumed to match that retrieved for the non-CLOUD nodes. -- If a group/partition only contains CLOUD-state nodes (i.e. no matching inventory group or it is empty) then processor/memory information must be specified using the `ram_mb`, `sockets`, `cores_per_socket` and `threads_per_core` options. +### Other variables +TODO: what about suspend_excl +The following variables are likely to need tuning for the specific site/instances: +- `autoscale_suspend_time`: Optional, default 120s. See `slurm.conf` parameter [SuspendTime](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SuspendTime). +- `autoscale_suspend_timeout`: Optional, default 30s. See `slurm.conf` parameter [SuspendTimeout](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SuspendTimeout). +- `autoscale_resume_timeout`: Optional, default 300s See `slurm.conf` parameter [ResumeTimeout](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_ResumeTimeout). +The following variables have defaults useful for debugging autoscaling, but may be altered for production: +- `autoscale_show_suspended_nodes`: Optional, default `true`. Whether to show suspended/powered-down nodes in `sinfo` etc. See `slurm.conf` parameter [PrivateData - cloud](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_cloud). +- `autoscale_debug_powersaving`: Optional, default `true`. Log additional information for powersaving, see `slurm.conf` parameter [DebugFlags - PowerSave](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_PowerSave_2). +- `autoscale_slurmctld_syslog_debug`: Optional, default `info`. Syslog logging level. See `slurm.conf` parameter [SlurmctldSyslogDebug](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SlurmctldSyslogDebug). +### Examples - - ```yaml - cloud_instances: - flavor: general.v1.medium - image: ohpc-compute-210909-1316.qcow2 - keypair: centos-at-steveb-ansible - network: "{{ autoscale_network }}" +Below is an example of partition definition, e.g. 
in `environments//inventory/group_vars/openhpc/overrides.yml`. Not shown here the inventory group `dev_small` contains 2 (non-CLOUD state) nodes. The "small" partition is the default and contains 2 non-CLOUD and 2 CLOUD nodes. The "burst" partition contains only CLOUD-state nodes. -A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. +```yaml +openhpc_cluster_name: dev +general_v1_small: + image: ohpc-compute-210909-1316.qcow2 + flavor: general.v1.small + keypair: centos-at-steveb-ansible + network: stackhpc-ipv4-geneve +general_v1_medium: + image: ohpc-compute-210909-1316.qcow2 + flavor: general.v1.medium + keypair: centos-at-steveb-ansible + network: stackhpc-ipv4-geneve +openhpc_slurm_partitions: +- name: small + default: yes + cloud_nodes: dev-small-[2-3] + cloud_instances: "{{ general_v1_small }}" +- name: burst + default: no + cloud_nodes: 'burst-[0-3]' + cloud_instances: "{{ general_v1_medium }}" +``` Dependencies ------------ -A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. +TODO: A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. Example Playbook ---------------- -Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: - - - hosts: servers - roles: - - { role: username.rolename, x: 42 } +See ansible/slurm.yml License ------- -BSD +Apache v2 Author Information ------------------ -An optional section for the role authors to include contact information, or a website (HTML is not allowed). +StackHPC Ltd. 
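The `cloud_nodes` values in the partition example above are Slurm hostlist expressions. As a small sketch of how such an expression resolves to individual node names (assuming `scontrol` from a Slurm installation is on `PATH`; the expressions are just those from the example):

```python
import subprocess

def expand_hostlist(expr):
    """Expand a Slurm hostlist expression, e.g. 'burst-[0-3]', into node names.

    Assumes `scontrol` (from a Slurm installation) is available on PATH.
    """
    out = subprocess.run(
        ["scontrol", "show", "hostnames", expr],
        check=True, capture_output=True, text=True,
    )
    return out.stdout.split()

# With the partitions above this gives ['dev-small-2', 'dev-small-3'] and
# ['burst-0', 'burst-1', 'burst-2', 'burst-3'].
print(expand_hostlist("dev-small-[2-3]"))
print(expand_hostlist("burst-[0-3]"))
```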
From 20d98fcb1bb22eebc7c494ed0ecfe84d61cecde3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Oct 2021 14:05:20 +0000 Subject: [PATCH 069/105] smslabs: update demo partition --- .../inventory/group_vars/openhpc/partitions.yml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml index 1180bf9e5..95fa1d839 100755 --- a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml +++ b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml @@ -12,15 +12,12 @@ general_v1_medium: openhpc_slurm_partitions: - name: small - default: no - cloud_nodes: '-[2-3]' + default: yes + cloud_nodes: dev-small-[2-3] cloud_instances: "{{ general_v1_small }}" - name: burst - default: yes - cloud_nodes: '-[1-4]' + default: no + cloud_nodes: 'burst-[0-3]' cloud_instances: "{{ general_v1_medium }}" - ram_mb: "{{ (15258 * 0.95) | int }}" - sockets: 1 - cores_per_socket: 4 - threads_per_core: 1 + From 173fe3e34d25f1c6a0a23d4ccb878322c5cde55c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Oct 2021 14:06:36 +0000 Subject: [PATCH 070/105] add install tag to first run of stackhpc.openhpc:install.yml --- ansible/slurm.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index f7c60543f..6a76aaf85 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -23,6 +23,7 @@ import_role: name: stackhpc.openhpc tasks_from: install.yml + tags: install - name: Create /etc/openstack file: path: /etc/openstack From 474c8384ccc0752d9843998a1dc69e2b89be9390 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Oct 2021 14:15:20 +0000 Subject: [PATCH 071/105] fix changed_when --- ansible/roles/autoscale/tasks/main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/roles/autoscale/tasks/main.yml b/ansible/roles/autoscale/tasks/main.yml index cbaca6b13..6dbc37325 100644 --- a/ansible/roles/autoscale/tasks/main.yml +++ b/ansible/roles/autoscale/tasks/main.yml @@ -9,6 +9,7 @@ when: "'cloud_instances' in item" register: _os_flavors become: no + changed_when: false - name: Manipulate flavor information set_fact: flavor_info: "{{ _os_flavors.results | map(attribute='stdout') | map('from_json') }}" # list of json info From 8054f7793c06263683920129873e740aac2636d6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Oct 2021 08:31:44 +0000 Subject: [PATCH 072/105] add autoscale_clouds --- ansible/roles/autoscale/README.md | 2 ++ ansible/roles/autoscale/defaults/main.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/ansible/roles/autoscale/README.md b/ansible/roles/autoscale/README.md index aa4eed164..eda01b774 100644 --- a/ansible/roles/autoscale/README.md +++ b/ansible/roles/autoscale/README.md @@ -27,6 +27,8 @@ Some examples are given below. Note that currently monitoring is not enabled for ### Other variables +- `autoscale_clouds`: Optional, path to a `clouds.yaml` file containing a single cloud. Defaults to `~/.config/openstack/clouds.yaml`. It is recommended this is an [application credential](https://docs.openstack.org/keystone/latest/user/application_credentials.html). This can be created in Horizon via Identity > Application Credentials > +Create Application Credential. The usual role required is `member`. Using access rules has been found not to work at present. 
Note that the downloaded credential can be encrpyted using `ansible-vault` to allow it to be committed to source control. It will automatically be decrypted when copied onto the compute nodes. + TODO: what about suspend_excl The following variables are likely to need tuning for the specific site/instances: - `autoscale_suspend_time`: Optional, default 120s. See `slurm.conf` parameter [SuspendTime](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SuspendTime). diff --git a/ansible/roles/autoscale/defaults/main.yml b/ansible/roles/autoscale/defaults/main.yml index d8a2b3bac..c1f76c899 100644 --- a/ansible/roles/autoscale/defaults/main.yml +++ b/ansible/roles/autoscale/defaults/main.yml @@ -1,3 +1,5 @@ +autoscale_clouds: ~/.config/openstack/clouds.yaml + # recommended: autoscale_show_suspended_nodes: true From 8c1b4be7a4ed8a23178582defb765ccc9d21e0f4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Oct 2021 08:48:07 +0000 Subject: [PATCH 073/105] move suspend_excl_nodes definition from openhpc role to here --- ansible/roles/autoscale/README.md | 40 ++++++++----------- ansible/roles/autoscale/defaults/main.yml | 1 + .../inventory/group_vars/all/autoscale.yml | 8 ++-- .../inventory/group_vars/all/openhpc.yml | 3 -- 4 files changed, 22 insertions(+), 30 deletions(-) diff --git a/ansible/roles/autoscale/README.md b/ansible/roles/autoscale/README.md index eda01b774..4e1ac8336 100644 --- a/ansible/roles/autoscale/README.md +++ b/ansible/roles/autoscale/README.md @@ -10,35 +10,33 @@ Add the `control` group to the `autoscale` group to activate this functionality - Working DNS. - Active OpenStack credentials on localhost (e.g a sourced `openrc.sh` in the shell running ansible). -- Role `stackhpc.slurm_openstack_tools.pytools`. Installs [slurm-openstack-tools](github.com/stackhpc/slurm-openstack-tools) which provides a venv with the `openstacksdk` and the resume/suspend scripts. +- Role `stackhpc.slurm_openstack_tools.pytools`. Installs [slurm-openstack-tools](github.com/stackhpc/slurm-openstack-tools) which provides a venv with the `openstacksdk` and the required resume/suspend scripts. - Role `stackhpc.openhpc` to create a Slurm cluster. - This role should be run on the Slurm controller only. ## Role Variables -### openhpc_slurm_partitions -This role modifies what the [openhpc_slurm_partitions variable](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) in the `stackhpc.openhpc` role accepts. Partition/group definitions may additionally include: -- `cloud_nodes`: Optional. Slurm hostlist expression (e.g. `'small-[8,10-16]'`) defining names of nodes to be defined in a ["CLOUD" state](https://slurm.schedmd.com/slurm.conf.html#OPT_CLOUD), i.e. not operational when the Slurm control daemon starts. -- `cloud_instances`: Required if `cloud_nodes` is defined. A mapping with keys `flavor`, `image`, `keypair` and `network` defining the OpenStack ID or names of properties for the CLOUD-state instances. - -Partitions/groups defining `cloud_nodes` may or may not also contain non-CLOUD state nodes (i.e. nodes in a matching inventory group). For CLOUD-state nodes, memory and CPU information is retrieved from OpenStack for the specified flavors. The `stackhpc.openhpc` group/partition options `ram_mb` and `ram_multiplier` and role variable `openhpc_ram_multiplier` are handled exactly as for non-CLOUD state nodes. This implies that if CLOUD and non-CLOUD state nodes are mixed in a single group all nodes must be homogenous in terms of processors/memory. 
- -Some examples are given below. Note that currently monitoring is not enabled for CLOUD-state nodes. - -### Other variables - - `autoscale_clouds`: Optional, path to a `clouds.yaml` file containing a single cloud. Defaults to `~/.config/openstack/clouds.yaml`. It is recommended this is an [application credential](https://docs.openstack.org/keystone/latest/user/application_credentials.html). This can be created in Horizon via Identity > Application Credentials > +Create Application Credential. The usual role required is `member`. Using access rules has been found not to work at present. Note that the downloaded credential can be encrpyted using `ansible-vault` to allow it to be committed to source control. It will automatically be decrypted when copied onto the compute nodes. -TODO: what about suspend_excl The following variables are likely to need tuning for the specific site/instances: - `autoscale_suspend_time`: Optional, default 120s. See `slurm.conf` parameter [SuspendTime](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SuspendTime). - `autoscale_suspend_timeout`: Optional, default 30s. See `slurm.conf` parameter [SuspendTimeout](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SuspendTimeout). - `autoscale_resume_timeout`: Optional, default 300s See `slurm.conf` parameter [ResumeTimeout](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_ResumeTimeout). -The following variables have defaults useful for debugging autoscaling, but may be altered for production: +The following variables may need altering for production: - `autoscale_show_suspended_nodes`: Optional, default `true`. Whether to show suspended/powered-down nodes in `sinfo` etc. See `slurm.conf` parameter [PrivateData - cloud](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_cloud). - `autoscale_debug_powersaving`: Optional, default `true`. Log additional information for powersaving, see `slurm.conf` parameter [DebugFlags - PowerSave](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_PowerSave_2). - `autoscale_slurmctld_syslog_debug`: Optional, default `info`. Syslog logging level. See `slurm.conf` parameter [SlurmctldSyslogDebug](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SlurmctldSyslogDebug). +- `autoscale_suspend_exc_nodes`: Optional. List of nodenames (or Slurm hostlist expressions) to exclude from "power saving", i.e. they will not be autoscaled away. + +## stackhpc.openhpc role variables +This role modifies what the [openhpc_slurm_partitions variable](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) in the `stackhpc.openhpc` role accepts. Partition/group definitions may additionally include: +- `cloud_nodes`: Optional. Slurm hostlist expression (e.g. `'small-[8,10-16]'`) defining names of nodes to be defined in a ["CLOUD" state](https://slurm.schedmd.com/slurm.conf.html#OPT_CLOUD), i.e. not operational when the Slurm control daemon starts. +- `cloud_instances`: Required if `cloud_nodes` is defined. A mapping with keys `flavor`, `image`, `keypair` and `network` defining the OpenStack ID or names of properties for the CLOUD-state instances. + +Partitions/groups defining `cloud_nodes` may or may not also contain non-CLOUD state nodes (i.e. nodes in a matching inventory group). For CLOUD-state nodes, memory and CPU information is retrieved from OpenStack for the specified flavors. 
The `stackhpc.openhpc` group/partition options `ram_mb` and `ram_multiplier` and role variable `openhpc_ram_multiplier` are handled exactly as for non-CLOUD state nodes. This implies that if CLOUD and non-CLOUD state nodes are mixed in a single group all nodes must be homogenous in terms of processors/memory. + +Some examples are given below. Note that currently monitoring is not enabled for CLOUD-state nodes. ### Examples @@ -70,22 +68,18 @@ openhpc_slurm_partitions: cloud_instances: "{{ general_v1_medium }}" ``` -Dependencies ------------- +# Dependencies -TODO: A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. +`stackhpc.openhpc` role as described above. -Example Playbook ----------------- +# Example Playbook See ansible/slurm.yml -License -------- +# License Apache v2 -Author Information ------------------- +# Author Information StackHPC Ltd. diff --git a/ansible/roles/autoscale/defaults/main.yml b/ansible/roles/autoscale/defaults/main.yml index c1f76c899..70916ddc2 100644 --- a/ansible/roles/autoscale/defaults/main.yml +++ b/ansible/roles/autoscale/defaults/main.yml @@ -33,4 +33,5 @@ autoscale_openhpc_config: SuspendTime: "{{ autoscale_suspend_time }}" SuspendTimeout: "{{ autoscale_suspend_timeout }}" ResumeTimeout: "{{ autoscale_resume_timeout }}" + SuspendExcNodes: "{{ autoscale_suspend_exc_nodes | join(',') }}" # See also TreeWidth but shouldn't needs setting with cloud_dns diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index 14c3ef38a..b4816f571 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -1,4 +1,4 @@ -openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: rename? - -# TODO: should this get moved?? -autoscale_suspend_exc_nodes: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. all non-CLOUD nodes, and prevent login-only slurmd nodes getting powered down +autoscale_rebuild_clouds: ~/.config/openstack/clouds.yaml +autoscale_suspend_exc_nodes_default: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. 
all non-CLOUD nodes, and prevent login-only slurmd nodes getting powered down +autoscale_suspend_exc_nodes_extra: [] +autoscale_suspend_exc_nodes: "{{ autoscale_suspend_exc_nodes_default + autoscale_suspend_exc_nodes_extra }}" diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index d9b9eca49..e587c7884 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -30,6 +30,3 @@ openhpc_config_default: SlurmctldParameters: enable_configless openhpc_config_extra: {} openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}" - -openhpc_suspend_exc_nodes_extra: [] -openhpc_suspend_exc_nodes: "{{ autoscale_suspend_exc_nodes + openhpc_suspend_exc_nodes_extra }}" From dfc859e47344568773666745be6f754e2c7911c0 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Oct 2021 08:49:53 +0000 Subject: [PATCH 074/105] use separate tasks for rebuild and autoscale and move rebuild role into appliance --- ansible/roles/autoscale/tasks/main.yml | 30 +++++++++++- ansible/roles/rebuild/tasks/main.yml | 19 ++++++++ ansible/slurm.yml | 46 ++++++------------- .../inventory/group_vars/all/rebuild.yml | 2 +- 4 files changed, 61 insertions(+), 36 deletions(-) diff --git a/ansible/roles/autoscale/tasks/main.yml b/ansible/roles/autoscale/tasks/main.yml index 6dbc37325..6283410f4 100644 --- a/ansible/roles/autoscale/tasks/main.yml +++ b/ansible/roles/autoscale/tasks/main.yml @@ -1,4 +1,29 @@ --- +- name: Install slurm packages to create slurm user + import_role: + name: stackhpc.openhpc + tasks_from: install.yml + tags: install + +- name: Create /etc/openstack + file: + path: /etc/openstack + state: directory + owner: slurm + group: slurm + mode: u=rX,go= + +- name: Copy out clouds.yaml + copy: + src: "{{ autoscale_clouds }}" + dest: /etc/openstack/clouds.yaml + mode: u=r,go= + owner: slurm + group: slurm + +- name: Setup Python/Slurm tools + include_role: + name: stackhpc.slurm_openstack_tools.pytools - name: Get cloud_node specs shell: @@ -10,14 +35,15 @@ register: _os_flavors become: no changed_when: false + - name: Manipulate flavor information set_fact: flavor_info: "{{ _os_flavors.results | map(attribute='stdout') | map('from_json') }}" # list of json info -- debug: - var: flavor_info + - name: Modify openhpc_slurm_partitions set_fact: openhpc_slurm_partitions: "{{ openhpc_slurm_partitions | modify_autoscale_partitions(flavor_info, openhpc_ram_multiplier) }}" + - name: Merge autoscale configuration set_fact: openhpc_config: "{{ autoscale_openhpc_config | combine(openhpc_config, list_merge='append') }}" diff --git a/ansible/roles/rebuild/tasks/main.yml b/ansible/roles/rebuild/tasks/main.yml index a191e807a..281b46043 100644 --- a/ansible/roles/rebuild/tasks/main.yml +++ b/ansible/roles/rebuild/tasks/main.yml @@ -1,4 +1,23 @@ --- +- block: + - name: Create /etc/openstack + file: + path: /etc/openstack + state: directory + owner: root + group: root + mode: '0400' + - name: Copy out clouds.yaml + copy: + src: "{{ openhpc_rebuild_clouds }}" + dest: /etc/openstack/clouds.yaml + owner: root + group: root + mode: '0400' + - name: Setup slurm tools + include_role: + name: stackhpc.slurm_openstack_tools.pytools + when: openhpc_enable.batch - name: Merge rebuild configuration set_fact: diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 6a76aaf85..de4386e53 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -9,45 +9,25 
@@ - include_role: name: geerlingguy.mysql -- name: Enable Slurm/OpenStack integrations - hosts: - - rebuild - - autoscale - become: true +- name: Setup Slurm-driven reimage on OpenStack + hosts: rebuild + become: yes tags: - rebuild + - openhpc + tasks: + - import_role: + name: rebuild + +- name: Setup autoscaling on OpenStack + hosts: autoscale + become: yes + tags: - autoscale - openhpc tasks: - - name: Install slurm packages to create slurm user - import_role: - name: stackhpc.openhpc - tasks_from: install.yml - tags: install - - name: Create /etc/openstack - file: - path: /etc/openstack - state: directory - owner: slurm # TODO: check this works for rebuild too - group: slurm - mode: u=rX,go= - - name: Copy out clouds.yaml - copy: - src: "{{ openhpc_rebuild_clouds }}" # TODO: name is wrong really! - dest: /etc/openstack/clouds.yaml - mode: u=r,go= - owner: slurm - group: slurm - - name: Setup Python/Slurm tools - include_role: - name: stackhpc.slurm_openstack_tools.pytools - - name: Configure autoscale programs and parameters - include_role: + - import_role: name: autoscale - when: "'autoscale' in group_names" - - name: Configure rebuild programs and parameters - include_role: - name: rebuild - name: Setup slurm hosts: openhpc diff --git a/environments/common/inventory/group_vars/all/rebuild.yml b/environments/common/inventory/group_vars/all/rebuild.yml index 0be916def..e40ffe66c 100644 --- a/environments/common/inventory/group_vars/all/rebuild.yml +++ b/environments/common/inventory/group_vars/all/rebuild.yml @@ -1 +1 @@ -openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: rename? +openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml From a236d36b1884ea444ac271f960c81074d8f952f1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Oct 2021 09:38:00 +0000 Subject: [PATCH 075/105] move rebuild role back into collection --- ansible/slurm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index de4386e53..766d54f71 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -17,7 +17,7 @@ - openhpc tasks: - import_role: - name: rebuild + name: stackhpc.slurm_openstack_tools.rebuild - name: Setup autoscaling on OpenStack hosts: autoscale From 62b6cf240e5f0772cff0bcda74665128a9719cb6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Oct 2021 10:13:58 +0000 Subject: [PATCH 076/105] move autoscale into collection --- ansible/roles/autoscale/.travis.yml | 29 ------- ansible/roles/autoscale/README.md | 85 ------------------- ansible/roles/autoscale/defaults/main.yml | 37 -------- .../filter_plugins/openhpc_partitions.py | 71 ---------------- ansible/roles/autoscale/meta.empty/main.yml | 52 ------------ ansible/roles/autoscale/tasks/main.yml | 49 ----------- ansible/roles/autoscale/tasks/validate.yml | 5 -- ansible/slurm.yml | 2 +- 8 files changed, 1 insertion(+), 329 deletions(-) delete mode 100644 ansible/roles/autoscale/.travis.yml delete mode 100644 ansible/roles/autoscale/README.md delete mode 100644 ansible/roles/autoscale/defaults/main.yml delete mode 100644 ansible/roles/autoscale/filter_plugins/openhpc_partitions.py delete mode 100644 ansible/roles/autoscale/meta.empty/main.yml delete mode 100644 ansible/roles/autoscale/tasks/main.yml delete mode 100644 ansible/roles/autoscale/tasks/validate.yml diff --git a/ansible/roles/autoscale/.travis.yml b/ansible/roles/autoscale/.travis.yml deleted file mode 100644 index 36bbf6208..000000000 --- a/ansible/roles/autoscale/.travis.yml +++ /dev/null @@ -1,29 +0,0 @@ 
---- -language: python -python: "2.7" - -# Use the new container infrastructure -sudo: false - -# Install ansible -addons: - apt: - packages: - - python-pip - -install: - # Install ansible - - pip install ansible - - # Check ansible version - - ansible --version - - # Create ansible.cfg with correct roles_path - - printf '[defaults]\nroles_path=../' >ansible.cfg - -script: - # Basic role syntax check - - ansible-playbook tests/test.yml -i tests/inventory --syntax-check - -notifications: - webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/ansible/roles/autoscale/README.md b/ansible/roles/autoscale/README.md deleted file mode 100644 index 4e1ac8336..000000000 --- a/ansible/roles/autoscale/README.md +++ /dev/null @@ -1,85 +0,0 @@ -# autoscale - -Support autoscaling nodes on OpenStack clouds, i.e. creating nodes when necessary to service the queue and deleting them when they are no longer needed. - -This is implemented using Slurm's ["elastic computing"](https://slurm.schedmd.com/elastic_computing.html) features which are based on Slurm's [power saving](https://slurm.schedmd.com/power_save.html) features. - -Add the `control` group to the `autoscale` group to activate this functionality in the `ansible/slurm.yml` playbook. Note some role variables are likely to need configuring. By default, node creation and deletion will be logged in the control node's syslog. - -## Requirements - -- Working DNS. -- Active OpenStack credentials on localhost (e.g a sourced `openrc.sh` in the shell running ansible). -- Role `stackhpc.slurm_openstack_tools.pytools`. Installs [slurm-openstack-tools](github.com/stackhpc/slurm-openstack-tools) which provides a venv with the `openstacksdk` and the required resume/suspend scripts. -- Role `stackhpc.openhpc` to create a Slurm cluster. -- This role should be run on the Slurm controller only. - -## Role Variables - -- `autoscale_clouds`: Optional, path to a `clouds.yaml` file containing a single cloud. Defaults to `~/.config/openstack/clouds.yaml`. It is recommended this is an [application credential](https://docs.openstack.org/keystone/latest/user/application_credentials.html). This can be created in Horizon via Identity > Application Credentials > +Create Application Credential. The usual role required is `member`. Using access rules has been found not to work at present. Note that the downloaded credential can be encrpyted using `ansible-vault` to allow it to be committed to source control. It will automatically be decrypted when copied onto the compute nodes. - -The following variables are likely to need tuning for the specific site/instances: -- `autoscale_suspend_time`: Optional, default 120s. See `slurm.conf` parameter [SuspendTime](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SuspendTime). -- `autoscale_suspend_timeout`: Optional, default 30s. See `slurm.conf` parameter [SuspendTimeout](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SuspendTimeout). -- `autoscale_resume_timeout`: Optional, default 300s See `slurm.conf` parameter [ResumeTimeout](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_ResumeTimeout). - -The following variables may need altering for production: -- `autoscale_show_suspended_nodes`: Optional, default `true`. Whether to show suspended/powered-down nodes in `sinfo` etc. See `slurm.conf` parameter [PrivateData - cloud](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_cloud). 
-- `autoscale_debug_powersaving`: Optional, default `true`. Log additional information for powersaving, see `slurm.conf` parameter [DebugFlags - PowerSave](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_PowerSave_2). -- `autoscale_slurmctld_syslog_debug`: Optional, default `info`. Syslog logging level. See `slurm.conf` parameter [SlurmctldSyslogDebug](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SlurmctldSyslogDebug). -- `autoscale_suspend_exc_nodes`: Optional. List of nodenames (or Slurm hostlist expressions) to exclude from "power saving", i.e. they will not be autoscaled away. - -## stackhpc.openhpc role variables -This role modifies what the [openhpc_slurm_partitions variable](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) in the `stackhpc.openhpc` role accepts. Partition/group definitions may additionally include: -- `cloud_nodes`: Optional. Slurm hostlist expression (e.g. `'small-[8,10-16]'`) defining names of nodes to be defined in a ["CLOUD" state](https://slurm.schedmd.com/slurm.conf.html#OPT_CLOUD), i.e. not operational when the Slurm control daemon starts. -- `cloud_instances`: Required if `cloud_nodes` is defined. A mapping with keys `flavor`, `image`, `keypair` and `network` defining the OpenStack ID or names of properties for the CLOUD-state instances. - -Partitions/groups defining `cloud_nodes` may or may not also contain non-CLOUD state nodes (i.e. nodes in a matching inventory group). For CLOUD-state nodes, memory and CPU information is retrieved from OpenStack for the specified flavors. The `stackhpc.openhpc` group/partition options `ram_mb` and `ram_multiplier` and role variable `openhpc_ram_multiplier` are handled exactly as for non-CLOUD state nodes. This implies that if CLOUD and non-CLOUD state nodes are mixed in a single group all nodes must be homogenous in terms of processors/memory. - -Some examples are given below. Note that currently monitoring is not enabled for CLOUD-state nodes. - -### Examples - -Below is an example of partition definition, e.g. in `environments//inventory/group_vars/openhpc/overrides.yml`. Not shown here the inventory group `dev_small` contains 2 (non-CLOUD state) nodes. The "small" partition is the default and contains 2 non-CLOUD and 2 CLOUD nodes. The "burst" partition contains only CLOUD-state nodes. - -```yaml -openhpc_cluster_name: dev -general_v1_small: - image: ohpc-compute-210909-1316.qcow2 - flavor: general.v1.small - keypair: centos-at-steveb-ansible - network: stackhpc-ipv4-geneve - -general_v1_medium: - image: ohpc-compute-210909-1316.qcow2 - flavor: general.v1.medium - keypair: centos-at-steveb-ansible - network: stackhpc-ipv4-geneve - -openhpc_slurm_partitions: -- name: small - default: yes - cloud_nodes: dev-small-[2-3] - cloud_instances: "{{ general_v1_small }}" - -- name: burst - default: no - cloud_nodes: 'burst-[0-3]' - cloud_instances: "{{ general_v1_medium }}" -``` - -# Dependencies - -`stackhpc.openhpc` role as described above. - -# Example Playbook - -See ansible/slurm.yml - -# License - -Apache v2 - -# Author Information - -StackHPC Ltd. 
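The role's `autoscale_openhpc_config` defaults are merged into `openhpc_config` via `combine(..., list_merge='append')`. Below is a rough Python emulation of that merge, not the actual Jinja evaluation; the dictionaries are trimmed and the `SuspendExcNodes` node names are illustrative only:

```python
# Rough emulation of Jinja's `combine(..., list_merge='append')` for the
# list-valued slurm.conf keys; values below are trimmed and illustrative.
def combine_append(base, override):
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, list) and isinstance(merged.get(key), list):
            merged[key] = merged[key] + value  # list_merge='append'
        else:
            merged[key] = value
    return merged

autoscale_openhpc_config = {
    "SuspendProgram": "/opt/slurm-tools/bin/slurm-openstack-suspend",
    "ResumeProgram": "/opt/slurm-tools/bin/slurm-openstack-resume",
    "SlurmctldParameters": ["idle_on_node_suspend", "cloud_dns"],
    # autoscale_suspend_exc_nodes | join(',') - example node names only:
    "SuspendExcNodes": ",".join(["dev-small-0", "dev-small-1", "dev-login-1"]),
}
openhpc_config = {"SlurmctldParameters": ["enable_configless"]}

merged = combine_append(autoscale_openhpc_config, openhpc_config)
# SlurmctldParameters -> ['idle_on_node_suspend', 'cloud_dns', 'enable_configless']
print(merged)
```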
diff --git a/ansible/roles/autoscale/defaults/main.yml b/ansible/roles/autoscale/defaults/main.yml deleted file mode 100644 index 70916ddc2..000000000 --- a/ansible/roles/autoscale/defaults/main.yml +++ /dev/null @@ -1,37 +0,0 @@ -autoscale_clouds: ~/.config/openstack/clouds.yaml - -# recommended: -autoscale_show_suspended_nodes: true - -# useful for debugging, may want to amend in production: -autoscale_debug_powersaving: true -autoscale_slurmctld_syslog_debug: info # https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldSyslogDebug - -# likely to need tuning: -autoscale_suspend_time: 120 # https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime -autoscale_suspend_timeout: 30 # https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout -autoscale_resume_timeout: 300 # https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout -# autoscale_power_save_interval: 10 # https://slurm.schedmd.com/slurm.conf.html#OPT_power_save_interval -# autoscale_power_save_min_interval: 0 # https://slurm.schedmd.com/slurm.conf.html#OPT_power_save_min_intervals - -# likely to need defining: -autoscale_suspend_exc_nodes: [] - -autoscale_openhpc_config: - SuspendProgram: /opt/slurm-tools/bin/slurm-openstack-suspend - ResumeProgram: /opt/slurm-tools/bin/slurm-openstack-resume - SlurmctldParameters: - - idle_on_node_suspend # https://slurm.schedmd.com/slurm.conf.html#OPT_idle_on_node_suspend - - cloud_dns # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud_dns - # - "power_save_interval={{ autoscale_power_save_interval}}" # seems to break if you set this - # - "power_save_min_interval={{ autoscale_power_save_min_interval }}" - CommunicationParameters: - - NoAddrCache # https://slurm.schedmd.com/slurm.conf.html#OPT_NoAddrCache - PrivateData: "{{ ['cloud'] if autoscale_show_suspended_nodes else [] }}" - DebugFlags: "{{ ['PowerSave'] if autoscale_debug_powersaving else [] }}" # NB: Seems to have disappeared in latest Slurm - SlurmctldSyslogDebug: "{{ autoscale_slurmctld_syslog_debug }}" - SuspendTime: "{{ autoscale_suspend_time }}" - SuspendTimeout: "{{ autoscale_suspend_timeout }}" - ResumeTimeout: "{{ autoscale_resume_timeout }}" - SuspendExcNodes: "{{ autoscale_suspend_exc_nodes | join(',') }}" -# See also TreeWidth but shouldn't needs setting with cloud_dns diff --git a/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py b/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py deleted file mode 100644 index 061fa01ae..000000000 --- a/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2021 StackHPC Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -# NB: To test this from the repo root run: -# ansible-playbook -i tests/inventory -i tests/inventory-mock-groups tests/filter.yml - -from ansible import errors -import jinja2 -import re - -REQUIRED_INSTANCE_ATTRS=('flavor', 'image', 'keypair', 'network') - -def modify_autoscale_partitions(openhpc_slurm_partitions, flavors, openhpc_ram_multiplier): - """ TODO: docs - - partitions: openhpc_slurm_partitions variable from stackhpc.openhpc role - flavors: List of dicts with info from `openstack flavor show`. Must contain keys 'ram' and 'vcpus' - openhpc_ram_multiplier: openhpc_ram_multiplier variable from stackhpc.openhpc role - - """ - - for part in openhpc_slurm_partitions: - for group in part.get('groups', [part]): - group_name = group.get('name', '') - - if 'cloud_nodes' in group: - if 'cloud_instances' not in group: - raise errors.AnsibleFilterError(f"`openhpc_slurm_partitions` group '{group_name}' specifies 'cloud_nodes' but is missing 'cloud_instances'.") - missing_attrs = ', '.join(set(REQUIRED_INSTANCE_ATTRS).difference(group['cloud_instances'])) - if missing_attrs: - raise errors.AnsibleFilterError(f"`openhpc_slurm_partitions` group '{group_name}' item 'cloud_instances' is missing items: {missing_attrs}.") - cloud_names = group['cloud_nodes'] - # TODO: check for cloud nodes overlapping real ones? - - flavor = [f for f in flavors if f['name'] == group['cloud_instances']['flavor']] - if len(flavor) != 1: - raise errors.AnsibleFilterError(f'expected one flavor matching {group["cloud_instances"]["flavor"]}, found {len(flavor)}: {flavor}') - flavor = flavor[0] - ram_mb = int(flavor['ram'] * group.get('ram_multiplier', openhpc_ram_multiplier)) # ram in flavor in MB, so no units conversion needed - - features = ['%s=%s' % (k, v) for (k, v) in group['cloud_instances'].items()] - cloud_nodes = { - 'NodeName': cloud_names, - 'State':'CLOUD', - 'Features': ','.join(features), - 'CPUs': flavor['vcpus'], - 'RealMemory': group.get('ram_mb', ram_mb) - } - - group['extra_nodes'] = group.get('extra_nodes', []) - group['extra_nodes'].append(cloud_nodes) - - return openhpc_slurm_partitions - -class FilterModule(object): - - def filters(self): - return { - 'modify_autoscale_partitions': modify_autoscale_partitions, - } diff --git a/ansible/roles/autoscale/meta.empty/main.yml b/ansible/roles/autoscale/meta.empty/main.yml deleted file mode 100644 index c572acc9f..000000000 --- a/ansible/roles/autoscale/meta.empty/main.yml +++ /dev/null @@ -1,52 +0,0 @@ -galaxy_info: - author: your name - description: your role description - company: your company (optional) - - # If the issue tracker for your role is not on github, uncomment the - # next line and provide a value - # issue_tracker_url: http://example.com/issue/tracker - - # Choose a valid license ID from https://spdx.org - some suggested licenses: - # - BSD-3-Clause (default) - # - MIT - # - GPL-2.0-or-later - # - GPL-3.0-only - # - Apache-2.0 - # - CC-BY-4.0 - license: license (GPL-2.0-or-later, MIT, etc) - - min_ansible_version: 2.1 - - # If this a Container Enabled role, provide the minimum Ansible Container version. - # min_ansible_container_version: - - # - # Provide a list of supported platforms, and for each platform a list of versions. - # If you don't wish to enumerate all versions for a particular platform, use 'all'. 
- # To view available platforms and versions (or releases), visit: - # https://galaxy.ansible.com/api/v1/platforms/ - # - # platforms: - # - name: Fedora - # versions: - # - all - # - 25 - # - name: SomePlatform - # versions: - # - all - # - 1.0 - # - 7 - # - 99.99 - - galaxy_tags: [] - # List tags for your role here, one per line. A tag is a keyword that describes - # and categorizes the role. Users find roles by searching for tags. Be sure to - # remove the '[]' above, if you add tags to this list. - # - # NOTE: A tag is limited to a single word comprised of alphanumeric characters. - # Maximum 20 tags per role. - -dependencies: [] - # List your role dependencies here, one per line. Be sure to remove the '[]' above, - # if you add dependencies to this list. diff --git a/ansible/roles/autoscale/tasks/main.yml b/ansible/roles/autoscale/tasks/main.yml deleted file mode 100644 index 6283410f4..000000000 --- a/ansible/roles/autoscale/tasks/main.yml +++ /dev/null @@ -1,49 +0,0 @@ ---- -- name: Install slurm packages to create slurm user - import_role: - name: stackhpc.openhpc - tasks_from: install.yml - tags: install - -- name: Create /etc/openstack - file: - path: /etc/openstack - state: directory - owner: slurm - group: slurm - mode: u=rX,go= - -- name: Copy out clouds.yaml - copy: - src: "{{ autoscale_clouds }}" - dest: /etc/openstack/clouds.yaml - mode: u=r,go= - owner: slurm - group: slurm - -- name: Setup Python/Slurm tools - include_role: - name: stackhpc.slurm_openstack_tools.pytools - -- name: Get cloud_node specs - shell: - cmd: "openstack flavor show --format json {{ item.cloud_instances.flavor }}" - delegate_to: localhost - run_once: true - loop: "{{ openhpc_slurm_partitions }}" - when: "'cloud_instances' in item" - register: _os_flavors - become: no - changed_when: false - -- name: Manipulate flavor information - set_fact: - flavor_info: "{{ _os_flavors.results | map(attribute='stdout') | map('from_json') }}" # list of json info - -- name: Modify openhpc_slurm_partitions - set_fact: - openhpc_slurm_partitions: "{{ openhpc_slurm_partitions | modify_autoscale_partitions(flavor_info, openhpc_ram_multiplier) }}" - -- name: Merge autoscale configuration - set_fact: - openhpc_config: "{{ autoscale_openhpc_config | combine(openhpc_config, list_merge='append') }}" diff --git a/ansible/roles/autoscale/tasks/validate.yml b/ansible/roles/autoscale/tasks/validate.yml deleted file mode 100644 index 5a56fa019..000000000 --- a/ansible/roles/autoscale/tasks/validate.yml +++ /dev/null @@ -1,5 +0,0 @@ ---- - -- name: Check openhpc_slurm_partitions information - debug: - msg: "{{ openhpc_slurm_partitions | modify_autoscale_partitions | to_nice_yaml }}" diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 766d54f71..e3051a2f6 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -27,7 +27,7 @@ - openhpc tasks: - import_role: - name: autoscale + name: stackhpc.slurm_openstack_tools.autoscale - name: Setup slurm hosts: openhpc From e140d6a45b50a8c137789232e4bdb84a1aa0e7e6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Oct 2021 10:15:31 +0000 Subject: [PATCH 077/105] remove autoscale validation as needed vars not available --- ansible/validate.yml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/ansible/validate.yml b/ansible/validate.yml index 805f66164..0c0ba8f38 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -2,15 +2,6 @@ # Fail early if configuration is invalid -- name: Validate autoscale configuration - hosts: autoscale - tags: autoscale - tasks: - - 
import_role: - name: autoscale - tasks_from: validate.yml - tags: validate - - name: Validate podman configuration hosts: podman tags: podman From 1132ccd1b29ea92f300de928448f2ed095d79a2c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Oct 2021 11:56:35 +0000 Subject: [PATCH 078/105] fix merging of enable_configless --- environments/common/inventory/group_vars/all/openhpc.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index e587c7884..5cc011fa6 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -27,6 +27,7 @@ openhpc_packages: "{{ openhpc_packages_default + openhpc_packages_extra }}" openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}" openhpc_login_only_nodes: login openhpc_config_default: - SlurmctldParameters: enable_configless + SlurmctldParameters: + - enable_configless openhpc_config_extra: {} openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}" From 4e7b28da38c0b2016df4852c07cb2137ffb16f6e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Oct 2021 15:20:37 +0000 Subject: [PATCH 079/105] avoid multiple package installation tasks when using autoscale --- ansible/slurm.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index e3051a2f6..14f1a0cc2 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -19,6 +19,20 @@ - import_role: name: stackhpc.slurm_openstack_tools.rebuild +- name: Preinstall Slurm packages to create slurm user + # This is an optimisation for speed as it avoids having to do this once for `control` then again for `openhpc` nodes. + hosts: openhpc + become: yes + tags: + - autoscale + - openhpc + - install + tasks: + - import_role: + name: stackhpc.openhpc + tasks_from: install.yml + when: groups.get('autoscale', []) | length > 0 + - name: Setup autoscaling on OpenStack hosts: autoscale become: yes From 99950adac059b5e46f05a2dbd44c8709915f12b5 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 8 Oct 2021 07:57:51 +0000 Subject: [PATCH 080/105] remove in-appliance rebuild role --- ansible/roles/rebuild/README.md | 47 ------------------------- ansible/roles/rebuild/defaults/main.yml | 4 --- ansible/roles/rebuild/tasks/main.yml | 24 ------------- 3 files changed, 75 deletions(-) delete mode 100644 ansible/roles/rebuild/README.md delete mode 100644 ansible/roles/rebuild/defaults/main.yml delete mode 100644 ansible/roles/rebuild/tasks/main.yml diff --git a/ansible/roles/rebuild/README.md b/ansible/roles/rebuild/README.md deleted file mode 100644 index f846bbeb3..000000000 --- a/ansible/roles/rebuild/README.md +++ /dev/null @@ -1,47 +0,0 @@ -rebuild -======= - -Enable the compute nodes to be reimaged from Slurm. To use this functionality add the `control` and `compute` groups to the `rebuild` group. - -Once `ansible/slurm.yml` has run, node(s) can be reimaged using: - - scontrol reboot [ASAP] [nextstate=] reason="rebuild image:" [] - -where: -- `` is the name (if unique) or ID of an image in OpenStack. -- `` is a Slurm hostlist expression defining the nodes to reimage. -- `ASAP` means the rebuild will happen as soon as existing jobs on the node(s) complete - no new jobs will be scheduled on it. -- If `nextstate=...` is not given nodes remain in DRAIN state after the rebuild. 
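For convenience, a minimal sketch of assembling the `scontrol reboot` invocation described above from Python; the image name and hostlist are just examples reused from this environment, and nothing is executed:

```python
import shlex

def rebuild_command(image, nodes, asap=True, nextstate="RESUME"):
    """Build the `scontrol reboot` command line described above.

    `image` is an OpenStack image name/ID and `nodes` a Slurm hostlist
    expression; this only builds the command, it does not run it.
    """
    cmd = ["scontrol", "reboot"]
    if asap:
        cmd.append("ASAP")
    if nextstate:
        cmd.append(f"nextstate={nextstate}")
    cmd += [f"reason=rebuild image:{image}", nodes]
    return cmd

print(shlex.join(rebuild_command("ohpc-compute-210909-1316.qcow2", "dev-small-[0-1]")))
```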
- -Requirements ------------- - -- This role must be run before the `stackhpc.openhpc` role's `runtime.yml` playbook as it modifies the `openhpc_config` variable. -- OpenStack credentials on the compute nodes, e.g. at `/etc/openstack/clouds.yaml` which are readable by the root user. It is recommended these credentials are an [application credential](https://docs.openstack.org/keystone/latest/user/application_credentials.html). This can be created in Horizon via Identity > Application Credentials > +Create Application Credential. The usual role required is `member`. Using access rules has been found not to work at present. Note that the downloaded credential can be encrpyted using `ansible-vault` to allow commit to source control. It will automatically be decrypted when copied onto the compute nodes. -- An image which when booted adds that node to the Slurm cluster. E.g. see `packer/README.md`. - -Role Variables --------------- - -None normally required. - -Dependencies ------------- - -See above. - -Example Playbook ----------------- - -See `ansible/slurm.yml` - - -License -------- - -Apache v2 - -Author Information ------------------- - -StackHPC Ltd. diff --git a/ansible/roles/rebuild/defaults/main.yml b/ansible/roles/rebuild/defaults/main.yml deleted file mode 100644 index 5e532ef24..000000000 --- a/ansible/roles/rebuild/defaults/main.yml +++ /dev/null @@ -1,4 +0,0 @@ ---- - -rebuild_openhpc_config: - RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild diff --git a/ansible/roles/rebuild/tasks/main.yml b/ansible/roles/rebuild/tasks/main.yml deleted file mode 100644 index 281b46043..000000000 --- a/ansible/roles/rebuild/tasks/main.yml +++ /dev/null @@ -1,24 +0,0 @@ ---- -- block: - - name: Create /etc/openstack - file: - path: /etc/openstack - state: directory - owner: root - group: root - mode: '0400' - - name: Copy out clouds.yaml - copy: - src: "{{ openhpc_rebuild_clouds }}" - dest: /etc/openstack/clouds.yaml - owner: root - group: root - mode: '0400' - - name: Setup slurm tools - include_role: - name: stackhpc.slurm_openstack_tools.pytools - when: openhpc_enable.batch - -- name: Merge rebuild configuration - set_fact: - openhpc_config: "{{ rebuild_openhpc_config | combine(openhpc_config, list_merge='append') }}" From 3f6419d39f2fc33e5dbb8818e37283261014d22e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 8 Oct 2021 07:58:23 +0000 Subject: [PATCH 081/105] fallback to working smslabs partition definition for demo --- .../smslabs/inventory/group_vars/openhpc/partitions.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml index 95fa1d839..92a606279 100755 --- a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml +++ b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml @@ -10,12 +10,21 @@ general_v1_medium: keypair: centos-at-steveb-ansible network: stackhpc-ipv4-geneve + +openhpc_ram_multiplier: 0.90 # TODO: fixme for groups openhpc_slurm_partitions: - name: small default: yes cloud_nodes: dev-small-[2-3] cloud_instances: "{{ general_v1_small }}" + # groups: # TODO: support this + # - name: small + # - name: small_cloud + # ram_multiplier: 0.90 + # cloud_nodes: dev-small-[2-3] + # cloud_instances: "{{ general_v1_small }}" + - name: burst default: no cloud_nodes: 'burst-[0-3]' From ef90759ee58ad00f9a227b35060f4968bc04773c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 12 Oct 2021 07:29:36 +0000 
Subject: [PATCH 082/105] smslabs: demo groups in openhpc_slurm_partitions --- .../inventory/group_vars/openhpc/partitions.yml | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml index 92a606279..f407239bd 100755 --- a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml +++ b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml @@ -14,16 +14,12 @@ general_v1_medium: openhpc_ram_multiplier: 0.90 # TODO: fixme for groups openhpc_slurm_partitions: - name: small - default: yes - cloud_nodes: dev-small-[2-3] - cloud_instances: "{{ general_v1_small }}" - - # groups: # TODO: support this - # - name: small - # - name: small_cloud - # ram_multiplier: 0.90 - # cloud_nodes: dev-small-[2-3] - # cloud_instances: "{{ general_v1_small }}" + groups: + - name: small + - name: small_cloud + ram_multiplier: 0.90 + cloud_nodes: dev-small-[2-3] + cloud_instances: "{{ general_v1_small }}" - name: burst default: no From 2ee93044f8c6abd5c7f7a3783f637b08490ff6fe Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 15 Oct 2021 08:04:40 +0000 Subject: [PATCH 083/105] tidy for PR --- ansible/.gitignore | 2 -- ansible/roles/podman/tasks/validate.yml | 2 +- environments/common/inventory/group_vars/all/openhpc.yml | 3 --- environments/common/inventory/groups | 3 ++- requirements.yml | 2 +- 5 files changed, 4 insertions(+), 8 deletions(-) diff --git a/ansible/.gitignore b/ansible/.gitignore index 0ccc6a74f..15ab96184 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -24,5 +24,3 @@ roles/* !roles/basic_users/** !roles/autoscale/ !roles/autoscale/** -!roles/rebuild/ -!roles/rebuild/** diff --git a/ansible/roles/podman/tasks/validate.yml b/ansible/roles/podman/tasks/validate.yml index edd877cbd..2b7bcb18d 100644 --- a/ansible/roles/podman/tasks/validate.yml +++ b/ansible/roles/podman/tasks/validate.yml @@ -12,4 +12,4 @@ assert: that: ( podman_cidr | ansible.netcommon.network_in_network(item)) == false fail_msg: "Address {{ item }} for {{ inventory_hostname }} is in podman network range {{ podman_cidr }} - set `podman_cidr` to avoid host network address ranges" - loop: "{{ ansible_all_ipv4_addresses }}" + loop: "{{ ansible_all_ipv4_addresses }}" \ No newline at end of file diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 5cc011fa6..f757eb04e 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -15,15 +15,12 @@ openhpc_slurmdbd_mysql_username: slurm openhpc_slurm_control_host: "{{ groups['control'] | first }}" openhpc_slurm_partitions: - name: "compute" - -# TODO: WIP PR to change/deprecate name here: openhpc_packages_default: - slurm-libpmi-ohpc # to allow intel mpi to work properly - ohpc-gnu9-openmpi4-perf-tools # for hpctests - openblas-gnu9-ohpc # for hpctests (HPL) openhpc_packages_extra: [] openhpc_packages: "{{ openhpc_packages_default + openhpc_packages_extra }}" - openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}" openhpc_login_only_nodes: login openhpc_config_default: diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 6811d0cc6..e9072ef18 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -76,7 +76,8 @@ cluster # All hosts to (optionally) run yum update 
on. [autoscale] -# Add control to enable autoscaling +# Add control to enable autoscaling on OpenStack. +# See ansible/collections/ansible_collections/stackhpc/slurm_openstack_tools/roles/autoscale/README.md [block_devices] # Superset of hosts to configure filesystems on - see ansible/roles/block_devices/README.md diff --git a/requirements.yml b/requirements.yml index afa1ab90e..28ea1a948 100644 --- a/requirements.yml +++ b/requirements.yml @@ -2,7 +2,7 @@ roles: - src: stackhpc.nfs - src: https://github.com/stackhpc/ansible-role-openhpc - version: feature/autoscale + version: feature/autoscale # TODO: remove once merged name: stackhpc.openhpc - src: cloudalchemy.node_exporter - src: cloudalchemy.blackbox-exporter From 6476e8280c40ee6484a5cfa42ee48ac5f9d5702e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 15 Oct 2021 10:27:14 +0000 Subject: [PATCH 084/105] fix branch for ansible_collection_slurm_openstack_tools --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index 28ea1a948..07b4ae319 100644 --- a/requirements.yml +++ b/requirements.yml @@ -17,5 +17,5 @@ collections: - name: community.grafana - name: https://github.com/stackhpc/ansible_collection_slurm_openstack_tools type: git - version: main + version: feature/autoscale # TODO: FIXME once merged ... From 2c6c642d7224f942bfd8103ac7433934092f2e37 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 31 Jan 2022 10:06:39 +0000 Subject: [PATCH 085/105] fix up autoscale test environment --- ansible/.gitignore | 2 - environments/smslabs/activate | 23 ---------- environments/smslabs/hooks/post.yml | 19 -------- .../smslabs/inventory/group_vars/all/.gitkeep | 0 .../inventory/group_vars/all/users.yml | 3 -- .../group_vars/openhpc/overrides.yml | 9 ---- .../group_vars/openhpc/partitions.yml | 28 ------------ .../inventory/group_vars/podman/overrides.yml | 1 - .../inventory/group_vars/rebuild/override.yml | 1 - environments/smslabs/inventory/groups | 45 ------------------- environments/smslabs/inventory/hosts | 18 -------- 11 files changed, 149 deletions(-) delete mode 100644 environments/smslabs/activate delete mode 100644 environments/smslabs/hooks/post.yml delete mode 100644 environments/smslabs/inventory/group_vars/all/.gitkeep delete mode 100644 environments/smslabs/inventory/group_vars/all/users.yml delete mode 100644 environments/smslabs/inventory/group_vars/openhpc/overrides.yml delete mode 100755 environments/smslabs/inventory/group_vars/openhpc/partitions.yml delete mode 100644 environments/smslabs/inventory/group_vars/podman/overrides.yml delete mode 100644 environments/smslabs/inventory/group_vars/rebuild/override.yml delete mode 100644 environments/smslabs/inventory/groups delete mode 100755 environments/smslabs/inventory/hosts diff --git a/ansible/.gitignore b/ansible/.gitignore index 15ab96184..bf07028ab 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -22,5 +22,3 @@ roles/* !roles/block_devices/** !roles/basic_users/ !roles/basic_users/** -!roles/autoscale/ -!roles/autoscale/** diff --git a/environments/smslabs/activate b/environments/smslabs/activate deleted file mode 100644 index e74031095..000000000 --- a/environments/smslabs/activate +++ /dev/null @@ -1,23 +0,0 @@ -export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) -echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" - -APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) -export 
PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" - -export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") -echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" - -export TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") -echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root" - -export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") -echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root" - -export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT") -echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" - -if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then - export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg -fi - - diff --git a/environments/smslabs/hooks/post.yml b/environments/smslabs/hooks/post.yml deleted file mode 100644 index 87e637f8c..000000000 --- a/environments/smslabs/hooks/post.yml +++ /dev/null @@ -1,19 +0,0 @@ -- hosts: control - become: true - tasks: - - name: Prevent ansible_user's processes being killed on compute nodes at job completion - replace: - path: /etc/slurm/slurm.epilog.clean - regexp: 'if \[ \$SLURM_UID -lt 100 \] ; then' - replace: "if [[ $SLURM_UID -lt 100 || $SLURM_JOB_USER -eq {{ ansible_user }} ]] ; then" - - name: Make a /home/test directory for centos - file: - path: /home/test - state: directory - owner: centos - group: centos - - name: Install ewatch - git: - repo: https://github.com/sjpb/ewatch.git - dest: /home/test/ewatch - force: yes diff --git a/environments/smslabs/inventory/group_vars/all/.gitkeep b/environments/smslabs/inventory/group_vars/all/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/environments/smslabs/inventory/group_vars/all/users.yml b/environments/smslabs/inventory/group_vars/all/users.yml deleted file mode 100644 index 3de23fee4..000000000 --- a/environments/smslabs/inventory/group_vars/all/users.yml +++ /dev/null @@ -1,3 +0,0 @@ -users: - - name: stig - pubkey: ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDTXW9Y0r0cTW/ay6FEIlIejuRPZZ+ObzR08XFzp4x8ecCW//WSZAjo1fD/u/CQGoV552QCjWj+tP9Cy9UcsI3WLAx+n4i48oHqvpRLO1CLgJazNpQ8Bc7GveF78xhD5EoL/IpcAFKIad3CU7gb8HLRJIQpER1OsY96T9ViKe9lDWy8mk2WjoYoU1niMtmbs549Gqwl+fGNdBVUsGS5k7Xy4D/0T8TitthN3W6UbMHXVCUzdd3v9TNl7hgyeq6dCvRS6g8Vmlp2Ia0NLkrWF+bqP2RhRuqWOj71PD3auPAq0hF4yqdW9awMuZY8vBesnjE3iC2h34jvFkYaolGTfDZUa48s7yBTpjWoINUSbg105KJoPg55lWwXj58MMhvyX6hyYl3oJMiG3eq48jAAA4n80EKK4IBXrg/yjpuoDiNGqVe9hDAoT94j3+s8Smz5rohsKQVS+l266eyjo2VLUVR2NaOnw5fW86MEUyTicvHjSN4xOCGjSK2j1k6hXT7EiuM= stig@nrel-jumphost.novalocal \ No newline at end of file diff --git a/environments/smslabs/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs/inventory/group_vars/openhpc/overrides.yml deleted file mode 100644 index 4bed1823f..000000000 --- a/environments/smslabs/inventory/group_vars/openhpc/overrides.yml +++ /dev/null @@ -1,9 +0,0 @@ -openhpc_extra_packages: - - git - - python3 -openhpc_extra_config_overrides: - SlurmctldDebug: debug - SlurmdDebug: debug - -#example_list: "{{ example_list + [7] }}" # FAILS - recursive -#example_dict: "{{ example_dict | combine({c: 4} ) }}" # FAILS - recursive diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml deleted file mode 100755 index f407239bd..000000000 --- a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml +++ /dev/null @@ -1,28 +0,0 @@ -general_v1_small: - image: ohpc-compute-210909-1316.qcow2 - flavor: general.v1.small - keypair: 
centos-at-steveb-ansible - network: stackhpc-ipv4-geneve - -general_v1_medium: - image: ohpc-compute-210909-1316.qcow2 - flavor: general.v1.medium - keypair: centos-at-steveb-ansible - network: stackhpc-ipv4-geneve - - -openhpc_ram_multiplier: 0.90 # TODO: fixme for groups -openhpc_slurm_partitions: -- name: small - groups: - - name: small - - name: small_cloud - ram_multiplier: 0.90 - cloud_nodes: dev-small-[2-3] - cloud_instances: "{{ general_v1_small }}" - -- name: burst - default: no - cloud_nodes: 'burst-[0-3]' - cloud_instances: "{{ general_v1_medium }}" - diff --git a/environments/smslabs/inventory/group_vars/podman/overrides.yml b/environments/smslabs/inventory/group_vars/podman/overrides.yml deleted file mode 100644 index 18e712665..000000000 --- a/environments/smslabs/inventory/group_vars/podman/overrides.yml +++ /dev/null @@ -1 +0,0 @@ -podman_cidr: 192.168.1.0/24 diff --git a/environments/smslabs/inventory/group_vars/rebuild/override.yml b/environments/smslabs/inventory/group_vars/rebuild/override.yml deleted file mode 100644 index 178ab7848..000000000 --- a/environments/smslabs/inventory/group_vars/rebuild/override.yml +++ /dev/null @@ -1 +0,0 @@ -pytools_gitref: feature/autoscale diff --git a/environments/smslabs/inventory/groups b/environments/smslabs/inventory/groups deleted file mode 100644 index 6fde43dfa..000000000 --- a/environments/smslabs/inventory/groups +++ /dev/null @@ -1,45 +0,0 @@ -[nfs:children] -openhpc - -[hpctests:children] -# Login node to use for running mpi-based testing. -login - -[mysql:children] -control - -[prometheus:children] -control - -[grafana:children] -control - -[alertmanager:children] -control - -[node_exporter:children] -# disabled node_exporter on control to avoid noise in syslog -login -compute - -[opendistro:children] -control - -[kibana:children] -control - -[slurm_stats:children] -control - -[filebeat:children] -slurm_stats - -[rebuild:children] -control -compute - -[update:children] -cluster - -[autoscale:children] -control diff --git a/environments/smslabs/inventory/hosts b/environments/smslabs/inventory/hosts deleted file mode 100755 index 5ab90d3b8..000000000 --- a/environments/smslabs/inventory/hosts +++ /dev/null @@ -1,18 +0,0 @@ -[all:vars] -ansible_user=centos -openhpc_cluster_name=dev - -[control] -dev-control ansible_host=10.0.3.182 server_networks='{"stackhpc-ipv4-geneve":["10.0.3.182"]}' - -[login] -dev-login-1 ansible_host=10.0.1.54 server_networks='{"stackhpc-ipv4-geneve":["10.0.1.54"]}' - -[compute] -dev-small-0 ansible_host=10.0.1.217 server_networks='{"stackhpc-ipv4-geneve":["10.0.1.217"]}' -dev-small-1 ansible_host=10.0.3.253 server_networks='{"stackhpc-ipv4-geneve":["10.0.3.253"]}' - -# Define groups for slurm parititions: -[dev_small] -dev-small-0 -dev-small-1 From 12e7de430f6afb8c3f42a33ad43113f9cf6427bf Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 31 Jan 2022 10:07:55 +0000 Subject: [PATCH 086/105] change autoscale group to be openstack-specific --- .github/workflows/smslabs.yml | 2 +- ansible/slurm.yml | 16 ++++------------ environments/common/layouts/everything | 3 +++ environments/smslabs-example/inventory/groups | 3 +++ 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/.github/workflows/smslabs.yml b/.github/workflows/smslabs.yml index 00217cee3..5cdd23ea8 100644 --- a/.github/workflows/smslabs.yml +++ b/.github/workflows/smslabs.yml @@ -95,7 +95,7 @@ jobs: env: ANSIBLE_FORCE_COLOR: True - - name: Build control and compute images + - name: Build login and compute images run: | . 
venv/bin/activate . environments/smslabs-example/activate diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 14f1a0cc2..f00fec45d 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -24,20 +24,20 @@ hosts: openhpc become: yes tags: - - autoscale + - openstack_autoscale - openhpc - install tasks: - import_role: name: stackhpc.openhpc tasks_from: install.yml - when: groups.get('autoscale', []) | length > 0 + when: groups.get('openstack_autoscale', []) | length > 0 - name: Setup autoscaling on OpenStack - hosts: autoscale + hosts: openstack_autoscale become: yes tags: - - autoscale + - openstack_autoscale - openhpc tasks: - import_role: @@ -49,14 +49,6 @@ tags: - openhpc tasks: - # - name: Add CentOS 8.3 Vault repo for OpenHPC hwloc dependency - # # NB: REMOVE THIS once OpenHPC works on CentOS 8.4 - # yum_repository: - # name: vault - # file: CentOS-Linux-Vault8.3 - # description: CentOS 8.3 packages from Vault - # baseurl: https://vault.centos.org/8.3.2011/BaseOS/$basearch/os/ - # gpgkey: file:///etc/pki/rpm-gpg/RPM-GPG-KEY-centosofficial - import_role: name: stackhpc.openhpc diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index c6a47453e..e120aa1f7 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -41,3 +41,6 @@ cluster [basic_users] # Add `openhpc` group to add Slurm users via creation of users on each node. + +[openstack_autoscale] +# Add `control` group to configure autoscaling on OpenStack clouds. diff --git a/environments/smslabs-example/inventory/groups b/environments/smslabs-example/inventory/groups index 2e5efeb67..b721d0e93 100644 --- a/environments/smslabs-example/inventory/groups +++ b/environments/smslabs-example/inventory/groups @@ -38,3 +38,6 @@ compute [update:children] cluster + +[openstack_autoscale:children] +control From c0370d6ff81bfaf0f0c1c6d31afe114eeb6ad46d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 17 Feb 2022 10:17:38 +0000 Subject: [PATCH 087/105] fix security groups in smslabs for idempotency --- environments/smslabs-example/terraform/nodes.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/smslabs-example/terraform/nodes.tf b/environments/smslabs-example/terraform/nodes.tf index 3bca7fb36..4b849f0bb 100644 --- a/environments/smslabs-example/terraform/nodes.tf +++ b/environments/smslabs-example/terraform/nodes.tf @@ -6,7 +6,7 @@ resource "openstack_compute_instance_v2" "control" { flavor_name = var.control_node.flavor key_pair = var.key_pair config_drive = true - security_groups = ["default", "ssh"] + security_groups = ["default", "SSH"] network { uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id # ensures nodes not created till subnet created @@ -24,7 +24,7 @@ resource "openstack_compute_instance_v2" "login" { flavor_name = each.value.flavor key_pair = var.key_pair config_drive = true - security_groups = ["default", "ssh"] + security_groups = ["default", "SSH"] network { uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id From 2192ab7648f340fdecb14ce84ebd04e162ee1ab1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 17 Feb 2022 17:08:22 +0000 Subject: [PATCH 088/105] fix smslabs env not being configless, add checks for this --- ansible/slurm.yml | 6 ++++++ .../inventory/group_vars/openhpc/overrides.yml | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index f00fec45d..a60918070 100644 --- a/ansible/slurm.yml +++ 
b/ansible/slurm.yml @@ -49,6 +49,12 @@ tags: - openhpc tasks: + - assert: + that: "'enable_configless' in openhpc_config.SlurmctldParameters | default([])" + fail_msg: | + 'enable_configless' not found in openhpc_config.SlurmctldParameters - is variable openhpc_config overridden? + Additional slurm.conf parameters should be provided using variable openhpc_config_extra. + success_msg: Checked Slurm will be configured for configless operation - import_role: name: stackhpc.openhpc diff --git a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml index 3585ae073..4cf1e5bc1 100644 --- a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml +++ b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml @@ -1,4 +1,4 @@ -openhpc_config: +openhpc_config_extra: SlurmctldDebug: debug SlurmdDebug: debug openhpc_slurm_partitions: From 0290115ffb400233a059ff8825b47777309eb07a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 17 Feb 2022 17:34:59 +0000 Subject: [PATCH 089/105] WIP for smslabs autoscale --- .github/workflows/smslabs.yml | 2 ++ environments/common/inventory/group_vars/all/openhpc.yml | 1 + 2 files changed, 3 insertions(+) diff --git a/.github/workflows/smslabs.yml b/.github/workflows/smslabs.yml index 5cdd23ea8..05a392aa0 100644 --- a/.github/workflows/smslabs.yml +++ b/.github/workflows/smslabs.yml @@ -132,3 +132,5 @@ jobs: OS_CLOUD: openstack TF_VAR_cluster_name: ci${{ github.run_id }} if: ${{ success() || cancelled() }} + +# TODO: delete images! \ No newline at end of file diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index f757eb04e..0b3912622 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -28,3 +28,4 @@ openhpc_config_default: - enable_configless openhpc_config_extra: {} openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}" +openhpc_ram_multiplier: 0.90 # TODO: DOCS: needs to be available to stackhpc.slurm_openstack_tools.autoscale role, plus lowered a bit to cope with autoscale problems From 7c33f1c1669294c787b5693a81a4637e9fe2a0f7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 21 Feb 2022 19:04:56 +0000 Subject: [PATCH 090/105] add basic autoscale to CI --- .github/workflows/smslabs.yml | 16 ++++++++-------- .../smslabs-example/ci/reimage-compute.yml | 8 ++++++++ environments/smslabs-example/hooks/post.yml | 17 ++++++++++++++--- .../inventory/group_vars/openhpc/overrides.yml | 6 ++++++ 4 files changed, 36 insertions(+), 11 deletions(-) diff --git a/.github/workflows/smslabs.yml b/.github/workflows/smslabs.yml index 05a392aa0..aed4aa433 100644 --- a/.github/workflows/smslabs.yml +++ b/.github/workflows/smslabs.yml @@ -87,14 +87,6 @@ jobs: env: ANSIBLE_FORCE_COLOR: True - - name: Run MPI-based tests - run: | - . venv/bin/activate - . environments/smslabs-example/activate - ansible-playbook -vv ansible/adhoc/hpctests.yml - env: - ANSIBLE_FORCE_COLOR: True - - name: Build login and compute images run: | . venv/bin/activate @@ -121,6 +113,14 @@ jobs: ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml env: OS_CLOUD: openstack + + - name: Run MPI-based tests, triggering autoscaling + run: | + . venv/bin/activate + . 
environments/smslabs-example/activate + ansible-playbook -vv ansible/adhoc/hpctests.yml + env: + ANSIBLE_FORCE_COLOR: True - name: Delete infrastructure run: | diff --git a/environments/smslabs-example/ci/reimage-compute.yml b/environments/smslabs-example/ci/reimage-compute.yml index 3efa4e47c..42989800a 100644 --- a/environments/smslabs-example/ci/reimage-compute.yml +++ b/environments/smslabs-example/ci/reimage-compute.yml @@ -14,6 +14,14 @@ set_fact: compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" + - name: Add compute image ID to autoscale definition + copy: + dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/openhpc/autoscale.yml" + content: | + openhpc_autoscale_image: {{ compute_build.artifact_id }} + delegate_to: localhost + - meta: end_here + - name: Request compute node rebuild via Slurm shell: cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1] diff --git a/environments/smslabs-example/hooks/post.yml b/environments/smslabs-example/hooks/post.yml index 68303c5cb..e764f99fc 100644 --- a/environments/smslabs-example/hooks/post.yml +++ b/environments/smslabs-example/hooks/post.yml @@ -4,11 +4,22 @@ tasks: - block: - name: Run sinfo - shell: 'sinfo --noheader --format="%N %P %a %l %D %t"' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name + shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name register: sinfo changed_when: false - name: Check nodes have expected slurm state assert: - that: "(sinfo.stdout_lines[0] | split)[1:] == ['small*', 'up', '60-00:00:00', '2', 'idle']" # don't know what instance names are as have CI run ID in them - fail_msg: "sinfo output not as expected: {{ sinfo.stdout }}" + that: sinfo.stdout_lines == expected_sinfo + fail_msg: | + sinfo output not as expected: + actual: + {{ sinfo.stdout_lines }} + expected: + {{ expected_sinfo }} + + vars: + expected_sinfo: + - "{{ openhpc_cluster_name }}-compute-[0-1] small* up 60-00:00:00 2 idle" + - "{{ openhpc_cluster_name }}-compute-[2-3] small* up 60-00:00:00 2 idle~" + when: "'builder' not in group_names" # won't have a slurm control daemon when in build diff --git a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml index 4cf1e5bc1..a8d82a032 100644 --- a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml +++ b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml @@ -3,3 +3,9 @@ openhpc_config_extra: SlurmdDebug: debug openhpc_slurm_partitions: - name: small + cloud_nodes: autoscale-compute-[2-3] + cloud_instances: # TODO: can we somehow check these when templating?? + flavor: general.v1.tiny + image: "{{ openhpc_autoscale_image | default('IMAGE_PLACEHOLDER') }}" + keypair: slurm-app-ci + network: "{{ server_networks.keys() | first }}" # TODO: bit hacky?? 
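Note on testing the autoscale partition configured above: with cloud_nodes/cloud_instances set, Slurm only creates the extra instances when jobs need them, so an idle cluster shows those nodes in state "idle~". Outside CI the power-save path can also be exercised by hand with scontrol. The following is only a rough sketch: the node name dev-compute-2 is a placeholder, and it assumes the suspend/resume programs are already installed on the control node.

    # powered-down cloud nodes show state "idle~"; "idle#" means powering up
    sinfo --noheader --format="%N %P %a %l %D %t"

    # force a cloud node to power up (slurmctld runs the ResumeProgram)
    sudo scontrol update NodeName=dev-compute-2 State=POWER_UP

    # and power it back down again once idle (runs the SuspendProgram)
    sudo scontrol update NodeName=dev-compute-2 State=POWER_DOWN
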
From e9a05521aa53e06eff3543229a13a93c3a65eb89 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 22 Feb 2022 13:07:55 +0000 Subject: [PATCH 091/105] fix failure during CI at 'stackhpc.slurm_openstack_tools.autoscale : Modify openhpc_slurm_partitions' --- .github/workflows/smslabs.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/smslabs.yml b/.github/workflows/smslabs.yml index aed4aa433..aac6d8d7e 100644 --- a/.github/workflows/smslabs.yml +++ b/.github/workflows/smslabs.yml @@ -86,6 +86,7 @@ jobs: ansible-playbook -vv ansible/site.yml env: ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack # required so openhpc_slurm_partitions filter used by stackhpc.slurm_openstack_tools.autoscale can use clouds.yaml file to run openstack cli to get node config - name: Build login and compute images run: | From 2414203e4a6c294017f171df984fc79f0e8c511a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 22 Feb 2022 14:38:06 +0000 Subject: [PATCH 092/105] fix cloud instance name in CI --- .../smslabs-example/inventory/group_vars/openhpc/overrides.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml index a8d82a032..c19026b0e 100644 --- a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml +++ b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml @@ -3,7 +3,7 @@ openhpc_config_extra: SlurmdDebug: debug openhpc_slurm_partitions: - name: small - cloud_nodes: autoscale-compute-[2-3] + cloud_nodes: "{{ openhpc_cluster_name }}-compute-[2-3]" cloud_instances: # TODO: can we somehow check these when templating?? flavor: general.v1.tiny image: "{{ openhpc_autoscale_image | default('IMAGE_PLACEHOLDER') }}" From e649ab869b600532d12603ddcea6c5a865969020 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 22 Feb 2022 15:39:46 +0000 Subject: [PATCH 093/105] fix cloud network definition during CI image build --- .../inventory/group_vars/openhpc/overrides.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml index c19026b0e..1b040c6cb 100644 --- a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml +++ b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml @@ -6,6 +6,6 @@ openhpc_slurm_partitions: cloud_nodes: "{{ openhpc_cluster_name }}-compute-[2-3]" cloud_instances: # TODO: can we somehow check these when templating?? flavor: general.v1.tiny - image: "{{ openhpc_autoscale_image | default('IMAGE_PLACEHOLDER') }}" + image: "{{ openhpc_autoscale_image | default('IMAGE_PLACEHOLDER') }}" # Gets set by CI after image build task. keypair: slurm-app-ci - network: "{{ server_networks.keys() | first }}" # TODO: bit hacky?? + network: "{{ hostvars[groups['control'] | first]['server_networks'].keys() | first }}" # Defined in inventory, so only defined for control during Packer build. 
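The OS_CLOUD addition above matters because templating openhpc_slurm_partitions now runs the OpenStack CLI (via clouds.yaml) to get the cloud_instances node config. Roughly the following lookups are involved; this is an illustrative assumption, with general.v1.tiny taken from the partition definition and the image ID a placeholder:

    export OS_CLOUD=openstack   # selects the entry in ~/.config/openstack/clouds.yaml

    # RAM (MB) and vCPUs used to template the cloud node definitions into slurm.conf
    openstack flavor show general.v1.tiny -f json -c ram -c vcpus

    # sanity-check that the image referenced by cloud_instances exists
    openstack image show <compute-image-id> -f value -c id -c name
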
From c9e956e161314e37ba4bf58bedeb9683e02e4168 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 22 Feb 2022 16:40:29 +0000 Subject: [PATCH 094/105] remove debugging exit --- environments/smslabs-example/ci/reimage-compute.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/environments/smslabs-example/ci/reimage-compute.yml b/environments/smslabs-example/ci/reimage-compute.yml index 42989800a..f8dd05214 100644 --- a/environments/smslabs-example/ci/reimage-compute.yml +++ b/environments/smslabs-example/ci/reimage-compute.yml @@ -20,7 +20,6 @@ content: | openhpc_autoscale_image: {{ compute_build.artifact_id }} delegate_to: localhost - - meta: end_here - name: Request compute node rebuild via Slurm shell: From 69e5d075b012f34d29068b09710c05ed2d2290f3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 22 Feb 2022 19:48:06 +0000 Subject: [PATCH 095/105] smslabs CI fix for cloud node image --- .github/workflows/smslabs.yml | 13 ++++++++++++- README.md | 16 ++++++++++++---- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/.github/workflows/smslabs.yml b/.github/workflows/smslabs.yml index aac6d8d7e..309c0eef9 100644 --- a/.github/workflows/smslabs.yml +++ b/.github/workflows/smslabs.yml @@ -97,13 +97,14 @@ jobs: env: OS_CLOUD: openstack - - name: Reimage compute nodes via slurm and check cluster still up + - name: Reimage compute nodes via slurm and check cluster still up (also adds image to partition cloud node info) run: | . venv/bin/activate . environments/smslabs-example/activate ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-compute.yml ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml env: + ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - name: Reimage login nodes via openstack and check cluster still up @@ -113,6 +114,16 @@ jobs: ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-login.yml ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml env: + ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack + + - name: Reconfigure Slurm for new cloud node image + run: | + . venv/bin/activate + . environments/smslabs-example/activate + ansible-playbook -vv ansible/slurm.yml --tags openhpc --skip-tags install + env: + ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - name: Run MPI-based tests, triggering autoscaling diff --git a/README.md b/README.md index ab9b84fc3..8db847fa7 100644 --- a/README.md +++ b/README.md @@ -98,9 +98,17 @@ NB: This section describes generic instructions - check for any environment-spec source environments//activate -2. Deploy instances - see environment-specific instructions. +1. Activate your OpenStack credentials (required if Slurm-controlled rebuild or Slurm autoscaling is enabled): -3. Generate passwords: + # either source an openrc.sh file + source path_to/openrc.sh + + # or if using a clouds.yaml file in ~/.config/openstack/clouds.yaml: + export OS_CLOUD=openstack + +1. Deploy instances - see environment-specific instructions. + +1. Generate passwords: ansible-playbook ansible/adhoc/generate-passwords.yml @@ -110,7 +118,7 @@ NB: This section describes generic instructions - check for any environment-spec See the [Ansible vault documentation](https://docs.ansible.com/ansible/latest/user_guide/vault.html) for more details. -4. Deploy the appliance: +1. 
Deploy the appliance: ansible-playbook ansible/site.yml @@ -120,7 +128,7 @@ NB: This section describes generic instructions - check for any environment-spec Tags as defined in the various sub-playbooks defined in `ansible/` may be used to only run part of the `site` tasks. -5. "Utility" playbooks for managing a running appliance are contained in `ansible/adhoc` - run these by activating the environment and using: +1. "Utility" playbooks for managing a running appliance are contained in `ansible/adhoc` - run these by activating the environment and using: ansible-playbook ansible/adhoc/ From b41b4ae09fa2c8d5c65013318f6b0638540a4df6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Feb 2022 15:56:11 +0000 Subject: [PATCH 096/105] fix HPL-solo issue due to memory mismatch with 2 static + 2 cloud nodes in smslabs CI --- .../smslabs-example/inventory/group_vars/openhpc/overrides.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml index 1b040c6cb..e92d58549 100644 --- a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml +++ b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml @@ -3,6 +3,7 @@ openhpc_config_extra: SlurmdDebug: debug openhpc_slurm_partitions: - name: small + ram_mb: 3362 # fixes wrong OpenStack flavor value for cloud_nodes cloud_nodes: "{{ openhpc_cluster_name }}-compute-[2-3]" cloud_instances: # TODO: can we somehow check these when templating?? flavor: general.v1.tiny From 0e597f8650ebc4df2776350e2172f243e6532f97 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Feb 2022 20:50:46 +0000 Subject: [PATCH 097/105] re-fix security groups in smslabs for idempotency --- environments/smslabs-example/terraform/nodes.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/smslabs-example/terraform/nodes.tf b/environments/smslabs-example/terraform/nodes.tf index 4b849f0bb..7d728e2c4 100644 --- a/environments/smslabs-example/terraform/nodes.tf +++ b/environments/smslabs-example/terraform/nodes.tf @@ -42,7 +42,7 @@ resource "openstack_compute_instance_v2" "compute" { flavor_name = var.compute_types[each.value].flavor key_pair = var.key_pair config_drive = true - security_groups = ["default", "ssh"] + security_groups = ["default", "SSH"] network { uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id From efc7ab400f6540bedfd0630d172970f1f29538f3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Feb 2022 20:51:12 +0000 Subject: [PATCH 098/105] smslabs: don't require OS_CLOUD= to be set for TF --- environments/smslabs-example/terraform/main.tf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/environments/smslabs-example/terraform/main.tf b/environments/smslabs-example/terraform/main.tf index 49a84ffce..03beb0adc 100644 --- a/environments/smslabs-example/terraform/main.tf +++ b/environments/smslabs-example/terraform/main.tf @@ -6,3 +6,7 @@ terraform { } } } + +provider "openstack" { + cloud = "openstack" +} From 776db15f740877c653b2cb2ed90f783d760c9301 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 7 Mar 2022 11:55:00 +0000 Subject: [PATCH 099/105] update smslabs CI to do build in parallel with deploy --- .github/workflows/smslabs.yml | 38 +++-------- .../smslabs-example/ci/test_reimage.yml | 64 +++++++++++++++++++ .../ci/update_cloudnode_image.yml | 22 +++++++ environments/smslabs-example/hooks/pre.yml | 26 ++++++++ 4 files 
changed, 122 insertions(+), 28 deletions(-) create mode 100644 environments/smslabs-example/ci/test_reimage.yml create mode 100644 environments/smslabs-example/ci/update_cloudnode_image.yml create mode 100644 environments/smslabs-example/hooks/pre.yml diff --git a/.github/workflows/smslabs.yml b/.github/workflows/smslabs.yml index 309c0eef9..09f79aed4 100644 --- a/.github/workflows/smslabs.yml +++ b/.github/workflows/smslabs.yml @@ -77,7 +77,7 @@ jobs: TF_VAR_cluster_name: ci${{ github.run_id }} if: ${{ always() && steps.provision.outcome == 'failure' && contains('not enough hosts available', steps.provision_failure.messages) }} - - name: Configure infrastructure + - name: Directly configure cluster and build compute + login images run: | . venv/bin/activate . environments/smslabs-example/activate @@ -85,48 +85,30 @@ jobs: ansible-playbook ansible/adhoc/generate-passwords.yml ansible-playbook -vv ansible/site.yml env: - ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack # required so openhpc_slurm_partitions filter used by stackhpc.slurm_openstack_tools.autoscale can use clouds.yaml file to run openstack cli to get node config - - - name: Build login and compute images + ANSIBLE_FORCE_COLOR: True + + - name: Test reimage of login and compute nodes run: | . venv/bin/activate . environments/smslabs-example/activate - cd packer - PACKER_LOG=1 PACKER_LOG_PATH=build.log packer build -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl + ansible all -m wait_for_connection + ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/test_reimage.yml env: OS_CLOUD: openstack - - - name: Reimage compute nodes via slurm and check cluster still up (also adds image to partition cloud node info) - run: | - . venv/bin/activate - . environments/smslabs-example/activate - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-compute.yml - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml - env: ANSIBLE_FORCE_COLOR: True - OS_CLOUD: openstack - - name: Reimage login nodes via openstack and check cluster still up - run: | - . venv/bin/activate - . environments/smslabs-example/activate - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-login.yml - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml - env: - ANSIBLE_FORCE_COLOR: True - OS_CLOUD: openstack - - - name: Reconfigure Slurm for new cloud node image + - name: Update cloud image and reconfigure Slurm run: | . venv/bin/activate . environments/smslabs-example/activate + ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/update_cloudnode_image.yml ansible-playbook -vv ansible/slurm.yml --tags openhpc --skip-tags install env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - - - name: Run MPI-based tests, triggering autoscaling + + - name: Run MPI-based tests (triggers autoscaling) run: | . venv/bin/activate . 
environments/smslabs-example/activate diff --git a/environments/smslabs-example/ci/test_reimage.yml b/environments/smslabs-example/ci/test_reimage.yml new file mode 100644 index 000000000..ae459d1a9 --- /dev/null +++ b/environments/smslabs-example/ci/test_reimage.yml @@ -0,0 +1,64 @@ +- hosts: login:!builder + become: no + tasks: + - name: Read packer build manifest + set_fact: + manifest: "{{ lookup('file', manifest_path) | from_json }}" + vars: + manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" + delegate_to: localhost + + - name: Get latest image builds + set_fact: + login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}" + compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" + + - name: Reimage login node via openstack + shell: + cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ login_build.artifact_id }}" + delegate_to: localhost + + - name: Check login node rebuild completed + shell: + cmd: openstack server show {{ inventory_hostname }} --format value -c image + register: openstack_login + delegate_to: localhost + retries: 5 + delay: 30 + until: login_build.artifact_id in openstack_login.stdout + changed_when: false + + - name: Wait for login connection + wait_for_connection: + timeout: 800 + + - name: Check slurm up after reimaging login node + import_tasks: ../hooks/check_slurm.yml + + - name: Request compute node rebuild via Slurm + shell: + cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1] + become: yes + + - name: Check compute node rebuild completed + shell: + cmd: openstack server show {{ item }} --format value -c image + register: openstack_compute + delegate_to: localhost + loop: "{{ groups['compute'] }}" + retries: 5 + delay: 30 + until: compute_build.artifact_id in openstack_compute.stdout + changed_when: false + +- hosts: compute:!builder + become: no + gather_facts: no + tasks: + - name: Wait for compute connection + wait_for_connection: + timeout: 800 + + - name: Check slurm up after reimaging login node + import_tasks: ../hooks/check_slurm.yml + run_once: true diff --git a/environments/smslabs-example/ci/update_cloudnode_image.yml b/environments/smslabs-example/ci/update_cloudnode_image.yml new file mode 100644 index 000000000..a171d4de8 --- /dev/null +++ b/environments/smslabs-example/ci/update_cloudnode_image.yml @@ -0,0 +1,22 @@ +- hosts: localhost + become: no + tasks: + - name: Read packer build manifest + set_fact: + manifest: "{{ lookup('file', manifest_path) | from_json }}" + vars: + manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" + delegate_to: localhost + + - name: Get latest image builds + set_fact: + login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}" + compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" + + - name: Add compute image ID to autoscale definition (for later autoscaling tests) + copy: + dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/openhpc/autoscale.yml" + content: | + openhpc_autoscale_image: {{ compute_build.artifact_id }} + delegate_to: localhost + run_once: true diff --git a/environments/smslabs-example/hooks/pre.yml b/environments/smslabs-example/hooks/pre.yml new file mode 100644 index 
000000000..91244983e --- /dev/null +++ b/environments/smslabs-example/hooks/pre.yml @@ -0,0 +1,26 @@ +- hosts: all + become: yes + gather_facts: no + tasks: + - name: Configure dnf proxy + community.general.ini_file: + path: /etc/dnf/dnf.conf + section: main + option: proxy + value: "{{ squid_proxy }}" + no_extra_spaces: true + +- hosts: localhost + become: false + tags: build + tasks: + - name: Build packer images + shell: + cmd: | + cd packer + PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl + chdir: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}" + when: "'builder' not in group_names" # avoid recursion! + register: packer_run + async: 2700 # 45 minutes + poll: 0 From 4b3ac9655323540d3e6731c56becf46c98fe2354 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 7 Mar 2022 13:48:22 +0000 Subject: [PATCH 100/105] add squid proxy for smslabs --- environments/smslabs-example/inventory/group_vars/all/squid.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 environments/smslabs-example/inventory/group_vars/all/squid.yml diff --git a/environments/smslabs-example/inventory/group_vars/all/squid.yml b/environments/smslabs-example/inventory/group_vars/all/squid.yml new file mode 100644 index 000000000..0e9ba5733 --- /dev/null +++ b/environments/smslabs-example/inventory/group_vars/all/squid.yml @@ -0,0 +1 @@ +squid_proxy: http://10.0.1.163:3128 From f6b3efa632782e4ac4b32c148f64f40ef349d0a2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 7 Mar 2022 13:52:07 +0000 Subject: [PATCH 101/105] move pytools to feature/ports --- environments/common/inventory/group_vars/all/pytools.yml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 environments/common/inventory/group_vars/all/pytools.yml diff --git a/environments/common/inventory/group_vars/all/pytools.yml b/environments/common/inventory/group_vars/all/pytools.yml new file mode 100644 index 000000000..e47e9b1a3 --- /dev/null +++ b/environments/common/inventory/group_vars/all/pytools.yml @@ -0,0 +1,2 @@ +# pytools_editable: false +pytools_gitref: feature/ports \ No newline at end of file From 2e3c77b6557e480f83d1585bacd7c179295b17d8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 7 Mar 2022 15:20:06 +0000 Subject: [PATCH 102/105] use CI project squid for smslabs --- environments/smslabs-example/inventory/group_vars/all/squid.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/smslabs-example/inventory/group_vars/all/squid.yml b/environments/smslabs-example/inventory/group_vars/all/squid.yml index 0e9ba5733..8524b5843 100644 --- a/environments/smslabs-example/inventory/group_vars/all/squid.yml +++ b/environments/smslabs-example/inventory/group_vars/all/squid.yml @@ -1 +1 @@ -squid_proxy: http://10.0.1.163:3128 +squid_proxy: http://10.20.2.12:3128 From 0f7dc8361a2998cb2bebab904ea291ecdf7f4147 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 7 Mar 2022 16:34:37 +0000 Subject: [PATCH 103/105] change smslabs CI workflow name for clarity --- .github/workflows/smslabs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/smslabs.yml b/.github/workflows/smslabs.yml index 09f79aed4..a05471277 100644 --- a/.github/workflows/smslabs.yml +++ b/.github/workflows/smslabs.yml @@ -7,7 +7,7 @@ on: pull_request: concurrency: stackhpc-ci # openstack project jobs: - openstack-example: + smslabs: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 From c217f2038b26da85d52b7e8ad2a730c9ec895a96 Mon Sep 17 
00:00:00 2001 From: Steve Brasier Date: Mon, 7 Mar 2022 17:07:55 +0000 Subject: [PATCH 104/105] disable dnf proxy in smslabs for debugging --- environments/smslabs-example/hooks/pre.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/environments/smslabs-example/hooks/pre.yml b/environments/smslabs-example/hooks/pre.yml index 91244983e..f400d6f9b 100644 --- a/environments/smslabs-example/hooks/pre.yml +++ b/environments/smslabs-example/hooks/pre.yml @@ -2,13 +2,13 @@ become: yes gather_facts: no tasks: - - name: Configure dnf proxy - community.general.ini_file: - path: /etc/dnf/dnf.conf - section: main - option: proxy - value: "{{ squid_proxy }}" - no_extra_spaces: true + # - name: Configure dnf proxy + # community.general.ini_file: + # path: /etc/dnf/dnf.conf + # section: main + # option: proxy + # value: "{{ squid_proxy }}" + # no_extra_spaces: true - hosts: localhost become: false From 24bfeb23a6274ac45afb1c582f35df75b5ac86c8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 7 Mar 2022 20:38:09 +0000 Subject: [PATCH 105/105] add check_slurm tasks to sms-labs CI --- .../smslabs-example/hooks/check_slurm.yml | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 environments/smslabs-example/hooks/check_slurm.yml diff --git a/environments/smslabs-example/hooks/check_slurm.yml b/environments/smslabs-example/hooks/check_slurm.yml new file mode 100644 index 000000000..b2ae67c7b --- /dev/null +++ b/environments/smslabs-example/hooks/check_slurm.yml @@ -0,0 +1,21 @@ +- name: Run sinfo + shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name + register: sinfo + changed_when: false + until: "'boot' not in sinfo.stdout_lines" + retries: 5 + delay: 10 +- name: Check nodes have expected slurm state + assert: + that: sinfo.stdout_lines == expected_sinfo + fail_msg: | + sinfo output not as expected: + actual: + {{ sinfo.stdout_lines }} + expected: + {{ expected_sinfo }} + + vars: + expected_sinfo: + - "{{ openhpc_cluster_name }}-compute-[0-1] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle" + - "{{ openhpc_cluster_name }}-compute-[2-3] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle~"
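
For reference when debugging a failure of the check above: sinfo's node-state suffixes distinguish static from cloud nodes, with a trailing "~" marking a powered-down cloud node and "#" one that is still powering up. Assuming a cluster named dev and the small partition used elsewhere in this environment, the sorted output compared against expected_sinfo would look roughly like:

    $ sinfo --noheader --format="%N %P %a %l %D %t" | sort
    dev-compute-[0-1] small* up 60-00:00:00 2 idle
    dev-compute-[2-3] small* up 60-00:00:00 2 idle~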