From a12349276c99db03b91a5975366d37cad722666f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 30 Mar 2021 16:26:32 +0000 Subject: [PATCH 001/133] WIP autoscale PoC --- ansible/autoscale.yml | 24 +++ ansible/slurm.yml | 8 + ansible/templates/resume.j2 | 22 +++ ansible/templates/suspend.j2 | 8 + .../inventory/group_vars/all/openhpc.yml | 1 + environments/sausage-autoscale/README.md | 176 ++++++++++++++++++ environments/sausage-autoscale/activate | 23 +++ environments/sausage-autoscale/ansible.cfg | 14 ++ environments/sausage-autoscale/hooks/.gitkeep | 0 .../inventory/group_vars/all/.gitkeep | 0 .../inventory/group_vars/all/secrets.yml | 43 +++++ .../inventory/group_vars/autoscale.yml | 1 + .../inventory/group_vars/openhpc.yml | 19 ++ .../inventory/group_vars/rebuild.yml | 1 + .../sausage-autoscale/inventory/groups | 17 ++ .../sausage-autoscale/inventory/hosts | 31 +++ .../terraform/.terraform.lock.hcl | 39 ++++ .../sausage-autoscale/terraform/inventory.tpl | 32 ++++ .../sausage-autoscale/terraform/main.tf | 131 +++++++++++++ .../terraform/terraform.tfvars | 10 + 20 files changed, 600 insertions(+) create mode 100644 ansible/autoscale.yml create mode 100644 ansible/templates/resume.j2 create mode 100644 ansible/templates/suspend.j2 create mode 100644 environments/sausage-autoscale/README.md create mode 100644 environments/sausage-autoscale/activate create mode 100644 environments/sausage-autoscale/ansible.cfg create mode 100644 environments/sausage-autoscale/hooks/.gitkeep create mode 100644 environments/sausage-autoscale/inventory/group_vars/all/.gitkeep create mode 100644 environments/sausage-autoscale/inventory/group_vars/all/secrets.yml create mode 100644 environments/sausage-autoscale/inventory/group_vars/autoscale.yml create mode 100644 environments/sausage-autoscale/inventory/group_vars/openhpc.yml create mode 100644 environments/sausage-autoscale/inventory/group_vars/rebuild.yml create mode 100644 environments/sausage-autoscale/inventory/groups create mode 100755 environments/sausage-autoscale/inventory/hosts create mode 100644 environments/sausage-autoscale/terraform/.terraform.lock.hcl create mode 100644 environments/sausage-autoscale/terraform/inventory.tpl create mode 100644 environments/sausage-autoscale/terraform/main.tf create mode 100644 environments/sausage-autoscale/terraform/terraform.tfvars diff --git a/ansible/autoscale.yml b/ansible/autoscale.yml new file mode 100644 index 000000000..1e179deeb --- /dev/null +++ b/ansible/autoscale.yml @@ -0,0 +1,24 @@ +- name: Copy out clouds.yaml + copy: + src: "{{ openhpc_autoscale_clouds }}" + dest: /etc/openstack/clouds.yaml + owner: slurm + group: slurm + mode: '0400' +- name: Setup slurm tools + include_role: + name: stackhpc.slurm_openstack_tools.pytools +- name: Create SuspendProgram + template: + src: suspend.j2 + dest: /opt/slurm-tools/bin/suspend.sh + owner: slurm + group: slurm + mode: u=rwx,go= +- name: Create ResumeProgram + template: + src: resume.j2 + dest: /opt/slurm-tools/bin/resume + owner: slurm + group: slurm + mode: u=rwx,go= diff --git a/ansible/slurm.yml b/ansible/slurm.yml index e7a0cb4c9..f94145f81 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -35,6 +35,14 @@ - import_role: name: stackhpc.slurm_openstack_tools.rebuild +- name: Setup autoscaling suspend/resume programs + hosts: autoscale # this is the *controller* + become: yes + tags: + - autoscale + tasks: + - import_tasks: autoscale.yml + - name: Set locked memory limits on user-facing nodes hosts: - compute diff --git 
a/ansible/templates/resume.j2 b/ansible/templates/resume.j2
new file mode 100644
index 000000000..748b5f270
--- /dev/null
+++ b/ansible/templates/resume.j2
@@ -0,0 +1,22 @@
+#!/opt/slurm-tools/bin/python3
+""" Create OpenStack instances """
+
+import sys, subprocess
+
+# configure logging to syslog - by default only "info"
+# and above categories appear
+logger = logging.getLogger("syslogger")
+logger.setLevel(logging.DEBUG)
+handler = logging.handlers.SysLogHandler("/dev/log")
+logger.addHandler(handler)
+
+def expand_nodes(hostlist_expr):
+    scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) # TODO: pass full path to binary
+    return scontrol.stdout.strip().split('\n')
+
+def launch():
+    hostlist_expr = sys.argv[1:]
+    logger.info(f"Resume invoked for %{hostexphostlist_expr}")
+    nodes = expand_nodes(hostlist_expr)
+    for node in nodes:
+        logger.info(f"TODO: Resume node %{node}")
diff --git a/ansible/templates/suspend.j2 b/ansible/templates/suspend.j2
new file mode 100644
index 000000000..1df641b6d
--- /dev/null
+++ b/ansible/templates/suspend.j2
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Example SuspendProgram
+echo "`date` Suspend invoked $0 $*" >>/var/log/power_save.log
+hosts=`scontrol show hostnames $1`
+for host in $hosts
+do
+    openstack server delete $host
+done
diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml
index 028b932f4..b6d8abacf 100644
--- a/environments/common/inventory/group_vars/all/openhpc.yml
+++ b/environments/common/inventory/group_vars/all/openhpc.yml
@@ -15,6 +15,7 @@ openhpc_slurmdbd_mysql_username: slurm
 openhpc_slurm_control_host: "{{ groups['control'] | first }}"
 openhpc_slurm_partitions:
   - name: "compute"
+    # cloud_nodes: 2
 openhpc_default_packages:
   - slurm-libpmi-ohpc # to allow intel mpi to work properly
   - ohpc-gnu9-openmpi4-perf-tools # for hpctests
diff --git a/environments/sausage-autoscale/README.md b/environments/sausage-autoscale/README.md
new file mode 100644
index 000000000..69f25a09e
--- /dev/null
+++ b/environments/sausage-autoscale/README.md
@@ -0,0 +1,176 @@
+# Sausage-Autoscale cluster
+
+Dev env for autoscaling on sausagecloud
+
+# Directory structure
+
+## terraform
+
+Contains terraform configuration to deploy infrastructure.
+
+## inventory
+
+Ansible inventory for configuring the infrastructure.
+
+# Setup
+
+In the repo root, run:
+
+    python3 -m venv venv # TODO: do we need system-site-packages?
+    . venv/bin/activate
+    pip install -U pip
+    pip install -r requirements.txt
+    ansible-galaxy install -r requirements.yml -p ansible/roles
+    ansible-galaxy collection install -r requirements.yml -p ansible/collections # don't worry about collections path warning
+
+# Activating the environment
+
+There is a small environment file that you must `source` which defines environment
+variables that reference the configuration path. This is so that we can locate
+resources relative to the environment directory.
+
+    . environments/sausage-autoscale/activate
+
+The pattern we use is that all resources referenced in the inventory
+are located in the environment directory containing the inventory that
+references them.
+
+# Common configuration
+
+Configuration is shared by specifying multiple inventories. We reference the `common`
+inventory from `ansible.cfg`, including it before the environment-specific
+inventory, located at `./inventory`.
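+
+For example, the `ansible.cfg` in this environment lists the `common` inventory first:
+
+    inventory = ../common/inventory,inventory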
+
+Inventories specified later in the list can override values set in the inventories
+that appear earlier. This allows you to override values set by the `common` inventory.
+
+Any variables that would be identical for all environments should be defined in the `common` inventory.
+
+# Passwords
+
+Prior to running any other playbooks, you need to define a set of passwords. You can
+use the `generate-passwords.yml` playbook to automate this process:
+
+```
+cd <repo root>
+ansible-playbook ansible/adhoc/generate-passwords.yml # can actually be run from anywhere once environment activated
+```
+
+This will output a set of passwords to `inventory/group_vars/all/secrets.yml`.
+Placing them in the inventory means that they will be defined for all playbooks.
+
+It is recommended to encrypt the contents of this file prior to committing to git:
+
+```
+ansible-vault encrypt inventory/group_vars/all/secrets.yml
+```
+
+You will then need to provide a password when running the playbooks e.g.:
+
+```
+ansible-playbook ../ansible/site.yml --tags grafana --ask-vault-password
+```
+
+See the [Ansible vault documentation](https://docs.ansible.com/ansible/latest/user_guide/vault.html) for more details.
+
+
+# Deploy nodes with Terraform
+
+- Modify the keypair in `terraform.tfvars` and ensure the required CentOS images are available on OpenStack.
+- Activate the virtualenv and create the instances:
+
+    . venv/bin/activate
+    cd environments/sausage-autoscale/terraform/
+    terraform apply
+
+This creates an Ansible inventory file at `inventory/hosts` in the environment directory.
+
+Note that this terraform deploys instances onto an existing network - for production use you probably want to create a network for the cluster.
+
+# Create and configure cluster with Ansible
+
+Now run one or more playbooks using:
+
+    cd <repo root>
+    ansible-playbook ansible/site.yml
+
+This provides:
+- grafana at `http://<login IP>:3000` - username `grafana`, password as set above
+- prometheus at `http://<login IP>:9090`
+
+NB: if grafana's yum repos are down you will see `Errors during downloading metadata for repository 'grafana' ...`. You can work around this using:
+
+    ssh centos@<login IP>
+    sudo rm -rf /etc/yum.repos.d/grafana.repo
+    wget https://dl.grafana.com/oss/release/grafana-7.3.1-1.x86_64.rpm
+    sudo yum install grafana-7.3.1-1.x86_64.rpm
+    exit
+    ansible-playbook -i inventory monitoring.yml -e grafana_password=<password> --skip-tags grafana_install
+
+# rebuild.yml
+
+# FIXME: outdated
+
+Enable the compute nodes of a Slurm-based OpenHPC cluster on OpenStack to be reimaged from Slurm.
+
+For full details including the Slurm commands to use see the [role's README](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools/blob/main/roles/rebuild/README.md)
+
+Ensure you have `~/.config/openstack/clouds.yaml` defining authentication for a single OpenStack cloud (see above README to change location).
+
+Then run:
+
+    ansible-playbook -i inventory rebuild.yml
+
+Note this does not rebuild the nodes, only deploys the tools to do so.
+
+# test.yml
+
+This runs MPI-based tests on the cluster:
+- `pingpong`: Runs Intel MPI Benchmark's IMB-MPI1 pingpong between a pair of (scheduler-selected) nodes. Reports zero-size message latency and maximum bandwidth.
+- `pingmatrix`: Runs a similar pingpong test but between all pairs of nodes. Reports zero-size message latency & maximum bandwidth.
+- `hpl-solo`: Runs HPL **separately** on all nodes, using 80% of memory, reporting Gflops on each node.
+
+These names can be used as tags to run only a subset of tests. 
For full details see the [role's README](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools/blob/main/roles/test/README.md).
+
+Note these are intended as post-deployment tests for a cluster to which you have root access - they are **not** intended for use on a system running production jobs:
+- Test directories are created within `openhpc_tests_rootdir` (here `/mnt/nfs/ohcp-tests`) which must be on a shared filesystem (read/write from login/control and compute nodes)
+- Generally, packages are only installed on the control/login node, and `/opt` is exported via NFS to the compute nodes.
+- The exception is the `slurm-libpmi-ohpc` package (required for `srun` with Intel MPI) which is installed on all nodes.
+
+To achieve best performance for HPL set `openhpc_tests_hpl_NB` in [test.yml](test.yml) to the appropriate HPL blocksize 'NB' for the compute node processor - for Intel CPUs see [here](https://software.intel.com/content/www/us/en/develop/documentation/mkl-linux-developer-guide/top/intel-math-kernel-library-benchmarks/intel-distribution-for-linpack-benchmark/configuring-parameters.html).
+
+Then run:
+
+    ansible-playbook ../ansible/adhoc/test.yml
+
+Results will be reported in the ansible stdout - the pingmatrix test also writes an html results file onto the ansible host.
+
+Note that you can still use the `test.yml` playbook even if the terraform/ansible in this repo wasn't used to deploy the cluster - as long as it's running OpenHPC v2. Simply create an appropriate `inventory` file, e.g.:
+
+    [all:vars]
+    ansible_user=centos
+
+    [cluster:children]
+    cluster_login
+    cluster_compute
+
+    [cluster_login]
+    slurm-control
+
+    [cluster_compute]
+    cpu-h21a5-u3-svn2
+    cpu-h21a5-u3-svn4
+    ...
+
+And run the `test.yml` playbook as described above. 
If you want to run tests only on a group from this inventory, rather than an entire partition, you can +use ``--limit`` + +Then running the tests passing this file as extra_vars: + + ansible-playbook ../ansible/test.yml --limit group-in-inventory + +# Destroying the cluster + +When finished, run: + + terraform destroy --auto-approve diff --git a/environments/sausage-autoscale/activate b/environments/sausage-autoscale/activate new file mode 100644 index 000000000..e74031095 --- /dev/null +++ b/environments/sausage-autoscale/activate @@ -0,0 +1,23 @@ +export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) +echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" + +APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) +export PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" + +export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") +echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" + +export TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") +echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root" + +export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") +echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root" + +export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT") +echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" + +if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then + export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg +fi + + diff --git a/environments/sausage-autoscale/ansible.cfg b/environments/sausage-autoscale/ansible.cfg new file mode 100644 index 000000000..c243e9958 --- /dev/null +++ b/environments/sausage-autoscale/ansible.cfg @@ -0,0 +1,14 @@ +[defaults] +any_errors_fatal = True +stdout_callback = debug +stderr_callback = debug +gathering = smart +forks = 30 +host_key_checking = False +inventory = ../common/inventory,inventory +collections_path = ../../ansible/collections +roles_path = ../../ansible/roles + +[ssh_connection] +ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +pipelining = True diff --git a/environments/sausage-autoscale/hooks/.gitkeep b/environments/sausage-autoscale/hooks/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/environments/sausage-autoscale/inventory/group_vars/all/.gitkeep b/environments/sausage-autoscale/inventory/group_vars/all/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/environments/sausage-autoscale/inventory/group_vars/all/secrets.yml b/environments/sausage-autoscale/inventory/group_vars/all/secrets.yml new file mode 100644 index 000000000..29f86e2dd --- /dev/null +++ b/environments/sausage-autoscale/inventory/group_vars/all/secrets.yml @@ -0,0 +1,43 @@ +--- +# Ansible managed +secrets_openhpc_elasticsearch_admin_password: pC2:.3c1QShckz1BxB5u +secrets_openhpc_elasticsearch_kibana_password: TVevBQ2JXTyoSW.bT_ba +secrets_openhpc_grafana_admin_password: QMiyymbFD.YC7M.39mCa +secrets_openhpc_mungekey: + content: 'YCbG2tA9UWIezb3xRRqK2vNzd6tOnGMs0kpPO+quHuqGXLsXagFjXq8Kqnd1/UIqoRW/TH3AfRNF + + yhuvemkRd3TlZLrvBZzMcQw0jAP2sI/+4hPpeIEk1kcQFVgE9A1HppLc0CxI7SskDPmY3vGwnuo7 + + 4K19jYxgPkIb9FUKCNetKgHR7L78LsbZxWUYkmvO6rCDUeLgMchFkjoi5Lwr+i1JJAxoGhT0yLmW + + D0hEenYePgsefzopwEnKEHByhnx0ROlJ86S58bh+rOnAqBWWJ8Im71NeJS58Moyrh9VLOkmRUCIj + + e0bhEKd7+/a5I4GN6KIo1oXRT74TxVHkwypSqFgAbVF5KMSsuY+5eG4JLcpTOGZYQpbAY9ICtnjM + + 
U6T6YhVXYvurVcb7N2ybub8veIwWeS98Yr2C9ZwsBzvpA2Fk3wxCFjo6vxe47U2nsezIUAUxVH7U + + V6jNMVoT4GZMQcKRsTp0zoAVAund6jMjsQ6h6Ly8EYyiKz6itTq4L5OqotZ0tUCX6xnVxtaD4LZb + + tQfZbcxPdd1C7NtTfImUsxHDp2CBIu/VDZWen/iafGaPeI83XVkC8Kk1QwhjBcRnlJEw2cK4TdBW + + 6Soy8CnNZMd92iqlqIZs7iZHu9FLyLiLCrkjaDnxM0UH0RP9CPTihiE47w874HVOQioacNX6U3Dz + + 3I0vxUAd/AF6ZrmBtOh3EekbxPtFNY7Gw3qPCbbJM6v5+XFjz//Lj1GFYzGK1DA7jhekzk5vtOWe + + k2vZcyqPYOIxFlqtm3OGt+f6V9G/xvYvRofE1EbO9qU1vqVRbW8Z7dqOR4AwomW2UlhH9G/ijLZZ + + EKqOWiCVONfMEe+Cndi/WH80R/nASx8hMJrTp0VOPtNjN+LWb/pPE/cSY9hbuA2EvqJB4gFQzmqz + + sFpQAqPVS8/+vesiKKVcnxUeMoRBx8g9CmdFTIvz5fU6M9lh7DjYoKcKx7eKtQhCAktyeI21o+Tn + + 2gyALzcxX29VCJy/8n/8qC26T9wLjN2URpO95yT2+k+Uv96R6Uj4zK4CD2c7yXm/0CmyrUWf3EPp + + VeaaWhy+KKR7T923TCEETiwSlwOynwb4lHLPmE17t8XBqYAqWGL2e8uDuLYhPAf+U5Bwt+LiXO5j + + hjg6szwpdSirDl1vpkqTDaOGdzVcTb+j8NfHDCdVOJbWu2I8sAHkjDRl+faagwxeMIGpTjoRi225 + + mj9rJdCbZxCSrwbE1r1koHrJZ+XN4AG4OrmTXdXMSLhpJuptyeNsRmvWxBe665tAxktRZ/kQUY3c + + W1zq03n3wtBkilL1wh/Fata4XrN5UZhpVSwT+7Z3gPacJVt5UjedkqpW8br+Pxw4efQExeDH2g==' +secrets_openhpc_mysql_root_password: XuY4ATpIzRje.PhkXI-t +secrets_openhpc_mysql_slurm_password: bS7GCWoTtsf4cjLo70S5 diff --git a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml new file mode 100644 index 000000000..6976f8117 --- /dev/null +++ b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml @@ -0,0 +1 @@ +openhpc_autoscale_clouds: ~/steveb-openrc.sh \ No newline at end of file diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml new file mode 100644 index 000000000..40ada5b98 --- /dev/null +++ b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml @@ -0,0 +1,19 @@ +openhpc_login_only_nodes: '' +openhpc_slurm_conf: + SlurmctldDebug: debug5 + SlurmctldLogFile: /var/log/slurmctld.log + SlurmdDebug: debug5 + SlurmdLogFile: /var/log/slurmd.log + SuspendExcNodes: "{{ groups['compute'] | join(',') }}" # i.e. all tf-defined nodes in the partition + # BELOW FOR TESTING ONLY, NOT PRODUCTION VALUES! 
+ SuspendTime: 120 + SuspendTimeout: 300 + SuspendProgram: /opt/slurm/suspend.sh + ResumeProgram: /opt/slurm/resume.sh + ResumeTimeout: 300 + SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns,power_save_interval=10,power_save_min_interval=0 + NOTES: + # enable_configless: required (as set in template) + # cloud_dns: requires working DNS + # power_save_*interval: options are defaults but should enable changes + # reboot_from_controller: should be really useful but actually we're already setup for rebuild on computes, so use that diff --git a/environments/sausage-autoscale/inventory/group_vars/rebuild.yml b/environments/sausage-autoscale/inventory/group_vars/rebuild.yml new file mode 100644 index 000000000..b2eba881a --- /dev/null +++ b/environments/sausage-autoscale/inventory/group_vars/rebuild.yml @@ -0,0 +1 @@ +openhpc_rebuild_clouds: ~/steveb-openrc.sh diff --git a/environments/sausage-autoscale/inventory/groups b/environments/sausage-autoscale/inventory/groups new file mode 100644 index 000000000..4cec8ab8f --- /dev/null +++ b/environments/sausage-autoscale/inventory/groups @@ -0,0 +1,17 @@ +[control:children] +login + +[nfs:children] +cluster + +[openhpc:children] +cluster + +[mysql:children] +control + +[rebuild:children] +compute + +[autoscale:children] +login # actually controller diff --git a/environments/sausage-autoscale/inventory/hosts b/environments/sausage-autoscale/inventory/hosts new file mode 100755 index 000000000..203a41376 --- /dev/null +++ b/environments/sausage-autoscale/inventory/hosts @@ -0,0 +1,31 @@ +[all:vars] +ansible_user=centos +ssh_proxy=10.0.3.100 +openhpc_cluster_name=sbscale + +[sbscale_login] +sbscale-login-0 ansible_host=10.0.3.100 server_networks='{"stackhpc":["10.0.3.100"]}' + +[sbscale_compute] +sbscale-compute-0 ansible_host=10.0.3.107 server_networks='{"stackhpc":["10.0.3.107"]}' +sbscale-compute-1 ansible_host=10.0.3.71 server_networks='{"stackhpc":["10.0.3.71"]}' + +[sbscale_compute:vars] +ansible_ssh_common_args='-o ProxyCommand="ssh centos@10.0.3.100 -W %h:%p"' + +[cluster_login:children] +sbscale_login + +# NOTE: This is hardcoded in the tests role +[cluster_compute:children] +sbscale_compute + +[login:children] +cluster_login + +[compute:children] +cluster_compute + +[cluster:children] +login +compute \ No newline at end of file diff --git a/environments/sausage-autoscale/terraform/.terraform.lock.hcl b/environments/sausage-autoscale/terraform/.terraform.lock.hcl new file mode 100644 index 000000000..8f9e2298d --- /dev/null +++ b/environments/sausage-autoscale/terraform/.terraform.lock.hcl @@ -0,0 +1,39 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/local" { + version = "2.1.0" + hashes = [ + "h1:EYZdckuGU3n6APs97nS2LxZm3dDtGqyM4qaIvsmac8o=", + "zh:0f1ec65101fa35050978d483d6e8916664b7556800348456ff3d09454ac1eae2", + "zh:36e42ac19f5d68467aacf07e6adcf83c7486f2e5b5f4339e9671f68525fc87ab", + "zh:6db9db2a1819e77b1642ec3b5e95042b202aee8151a0256d289f2e141bf3ceb3", + "zh:719dfd97bb9ddce99f7d741260b8ece2682b363735c764cac83303f02386075a", + "zh:7598bb86e0378fd97eaa04638c1a4c75f960f62f69d3662e6d80ffa5a89847fe", + "zh:ad0a188b52517fec9eca393f1e2c9daea362b33ae2eb38a857b6b09949a727c1", + "zh:c46846c8df66a13fee6eff7dc5d528a7f868ae0dcf92d79deaac73cc297ed20c", + "zh:dc1a20a2eec12095d04bf6da5321f535351a594a636912361db20eb2a707ccc4", + "zh:e57ab4771a9d999401f6badd8b018558357d3cbdf3d33cc0c4f83e818ca8e94b", + "zh:ebdcde208072b4b0f8d305ebf2bfdc62c926e0717599dcf8ec2fd8c5845031c3", + "zh:ef34c52b68933bedd0868a13ccfd59ff1c820f299760b3c02e008dc95e2ece91", + ] +} + +provider "registry.terraform.io/terraform-provider-openstack/openstack" { + version = "1.40.0" + hashes = [ + "h1:gBrsytNqUG1ZQPKys8KAvZkjesjimXb7vcrTmyFUTM0=", + "zh:278a878a256ec5447e1e64b5d9a691e3a1f7d5c247e536500c97c5b996bc2531", + "zh:5c7ae8cfe0831557c8c1988581f3fd0bdf182d15bcefbe645bb91564027e67d4", + "zh:944d75fc1e3d54df4c47e5d34007927abf4fa79e2107b05d14f11b52970a6164", + "zh:a50922d05185598a9264a25eff6f01ce7671c70a562a3ef93e9bb7a449e358b0", + "zh:adb87ad3782f1f7a5eaeedbcffa0e5559d2372502f9af91781aa13c11cf4b47b", + "zh:c0e4218259a37f16c10b4779009f0b0b5d467e4d347fc2aa3a212f1ee3a71d63", + "zh:c2eb4f40cbd78238500a3a84ba995060bfc50f770bd13732ae50b73687f3dce6", + "zh:ca8a38fe932972d0d7fdc51f84ae775648b7aff3c96b8ead085007e880ee987f", + "zh:ce4f703719d646507d6006085dc1114954c75710226df43078169b2b01993537", + "zh:e29542a492bbf55613d20b5f68ed4357cbc8bb09d61a1752d2976e5e1608879d", + "zh:e68d47b85b9da089f8f7102c23545331c15a9e6ea99875926d2ebf6e38bf2073", + "zh:fdb10cb345250d7c47e342def106bd10ef75493ef6edf15809e10e6367a0d9f6", + ] +} diff --git a/environments/sausage-autoscale/terraform/inventory.tpl b/environments/sausage-autoscale/terraform/inventory.tpl new file mode 100644 index 000000000..361a359ec --- /dev/null +++ b/environments/sausage-autoscale/terraform/inventory.tpl @@ -0,0 +1,32 @@ +[all:vars] +ansible_user=centos +ssh_proxy=${login.network[0].fixed_ip_v4} +openhpc_cluster_name=${cluster_name} + +[${cluster_name}_login] +${login.name} ansible_host=${login.network[0].fixed_ip_v4} server_networks='${jsonencode({for net in login.network: net.name => [ net.fixed_ip_v4 ] })}' + +[${cluster_name}_compute] +%{ for compute in computes ~} +${compute.name} ansible_host=${compute.network[0].fixed_ip_v4} server_networks='${jsonencode({for net in compute.network: net.name => [ net.fixed_ip_v4 ] })}' +%{ endfor ~} + +[${cluster_name}_compute:vars] +ansible_ssh_common_args='-o ProxyCommand="ssh centos@${login.network[0].fixed_ip_v4} -W %h:%p"' + +[cluster_login:children] +${cluster_name}_login + +# NOTE: This is hardcoded in the tests role +[cluster_compute:children] +${cluster_name}_compute + +[login:children] +cluster_login + +[compute:children] +cluster_compute + +[cluster:children] +login +compute \ No newline at end of file diff --git a/environments/sausage-autoscale/terraform/main.tf b/environments/sausage-autoscale/terraform/main.tf new file mode 100644 index 000000000..1523eeae5 --- /dev/null +++ b/environments/sausage-autoscale/terraform/main.tf @@ -0,0 +1,131 @@ +terraform { + required_version = ">= 0.14" + required_providers { + openstack = { + source = 
"terraform-provider-openstack/openstack" + } + } +} + +variable "environment_root" { + type = string +} + +variable "compute_names" { + default = ["compute-0", "compute-1"] +} + +variable "cluster_name" { + default = "testohpc" +} + +variable "key_pair" { + type = string +} + +variable "network" { + type = string +} + +variable "login_flavor" { + type = string +} + +variable "login_image" { + type = string +} + +variable "compute_flavor" { + type = string +} + +variable "compute_image" { + type = string +} + +resource "openstack_networking_secgroup_v2" "secgroup_slurm_login" { + name = "secgroup_slurm_login" + description = "Rules for the slurm login node" + # Fully manage with terraform + delete_default_rules = true +} + +resource "openstack_networking_secgroup_v2" "secgroup_slurm_compute" { + name = "secgroup_slurm_compute" + description = "Rules for the slurm compute node" + # Fully manage with terraform + delete_default_rules = true +} + +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_egress_v4" { + direction = "egress" + ethertype = "IPv4" + security_group_id = openstack_networking_secgroup_v2.secgroup_slurm_login.id +} + +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_ingress_tcp_v4" { + direction = "ingress" + ethertype = "IPv4" + # NOTE: You will want to lock down the ports in a production environment. This will require + # setting of static ports for the NFS server see: + # https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/6/html/storage_administration_guide/s2-nfs-nfs-firewall-config + port_range_min = 1 + protocol = "tcp" + port_range_max = 65535 + security_group_id = openstack_networking_secgroup_v2.secgroup_slurm_login.id +} + +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_compute_rule_egress_v4" { + direction = "egress" + ethertype = "IPv4" + security_group_id = openstack_networking_secgroup_v2.secgroup_slurm_compute.id +} + +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_compute_rule_ingress_tcp_v4" { + direction = "ingress" + ethertype = "IPv4" + port_range_min = 1 + protocol = "tcp" + port_range_max = 65535 + security_group_id = openstack_networking_secgroup_v2.secgroup_slurm_compute.id +} + +resource "openstack_compute_instance_v2" "login" { + + name = "${var.cluster_name}-login-0" + image_name = var.login_image + flavor_name = var.login_flavor + key_pair = var.key_pair + network { + name = var.network + } + security_groups = [openstack_networking_secgroup_v2.secgroup_slurm_login.name] +} + + +resource "openstack_compute_instance_v2" "compute" { + + for_each = toset(var.compute_names) + + name = "${var.cluster_name}-${each.value}" + image_name = var.compute_image + flavor_name = var.compute_flavor + #flavor_name = "compute-A" + key_pair = var.key_pair + network { + name = var.network + } + security_groups = [openstack_networking_secgroup_v2.secgroup_slurm_compute.name] +} + +# TODO: needs fixing for case where creation partially fails resulting in "compute.network is empty list of object" +resource "local_file" "hosts" { + content = templatefile("${path.module}/inventory.tpl", + { + "cluster_name": var.cluster_name + "login": openstack_compute_instance_v2.login, + "computes": openstack_compute_instance_v2.compute, + }, + ) + filename = "${var.environment_root}/inventory/hosts" +} \ No newline at end of file diff --git a/environments/sausage-autoscale/terraform/terraform.tfvars b/environments/sausage-autoscale/terraform/terraform.tfvars new file mode 100644 
index 000000000..04bfb7ade --- /dev/null +++ b/environments/sausage-autoscale/terraform/terraform.tfvars @@ -0,0 +1,10 @@ +compute_names = ["compute-0", "compute-1"] +cluster_name = "sbscale" +key_pair = "steveb-local" +network = "stackhpc" + +login_image = "CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64" +login_flavor = "chipolata" + +compute_image = "CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64" +compute_flavor = "chipolata" From c9c9bfc6795131a9cef6434e12e7d73e2dd944a6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 31 Mar 2021 06:57:50 +0000 Subject: [PATCH 002/133] add IMB package to allow testing --- .../sausage-autoscale/inventory/group_vars/openhpc.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml index 40ada5b98..f15cc50d4 100644 --- a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml +++ b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml @@ -1,4 +1,9 @@ openhpc_login_only_nodes: '' +openhpc_packages: + - slurm-libpmi-ohpc + - wget + - lmod-defaults-gnu9-openmpi4-ohpc + - imb-gnu9-openmpi4-ohpc openhpc_slurm_conf: SlurmctldDebug: debug5 SlurmctldLogFile: /var/log/slurmctld.log From ab14526837d954cbf992bbf9e3dc679eaec2f2c9 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 31 Mar 2021 07:36:40 +0000 Subject: [PATCH 003/133] move cloud_nodes config to right environment --- environments/common/inventory/group_vars/all/openhpc.yml | 6 ++++++ .../sausage-autoscale/inventory/group_vars/openhpc.yml | 7 +++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index b6d8abacf..f1bae139b 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -15,6 +15,7 @@ openhpc_slurmdbd_mysql_username: slurm openhpc_slurm_control_host: "{{ groups['control'] | first }}" openhpc_slurm_partitions: - name: "compute" +<<<<<<< HEAD # cloud_nodes: 2 openhpc_default_packages: - slurm-libpmi-ohpc # to allow intel mpi to work properly @@ -22,5 +23,10 @@ openhpc_default_packages: openhpc_extra_packages: [] openhpc_packages: "{{ openhpc_default_packages + openhpc_extra_packages }}" openhpc_munge_key: "{{ secrets_openhpc_mungekey | b64decode }}" +======= +openhpc_packages: + - slurm-libpmi-ohpc +slurm_munge_key: "{{ secrets_openhpc_mungekey }}" +>>>>>>> b8d9eba... move cloud_nodes config to right environment openhpc_slurm_configless: true openhpc_login_only_nodes: login \ No newline at end of file diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml index f15cc50d4..9f69b0f9f 100644 --- a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml +++ b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml @@ -1,4 +1,7 @@ openhpc_login_only_nodes: '' +openhpc_slurm_partitions: + - name: "compute" + cloud_nodes: 2 openhpc_packages: - slurm-libpmi-ohpc - wget @@ -13,8 +16,8 @@ openhpc_slurm_conf: # BELOW FOR TESTING ONLY, NOT PRODUCTION VALUES! 
SuspendTime: 120 SuspendTimeout: 300 - SuspendProgram: /opt/slurm/suspend.sh - ResumeProgram: /opt/slurm/resume.sh + SuspendProgram: /opt/slurm-tools/bin/suspend.sh + ResumeProgram: /opt/slurm-tools/bin/resume ResumeTimeout: 300 SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns,power_save_interval=10,power_save_min_interval=0 NOTES: From 67b16a4d98feabfa5b8d90afc7cb26fc2d3344b5 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 31 Mar 2021 10:03:30 +0000 Subject: [PATCH 004/133] fix /etc/openstack permissions for resume --- ansible/autoscale.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ansible/autoscale.yml b/ansible/autoscale.yml index 1e179deeb..9a4bb4879 100644 --- a/ansible/autoscale.yml +++ b/ansible/autoscale.yml @@ -5,6 +5,13 @@ owner: slurm group: slurm mode: '0400' +- name: Ensure /etc/openstack/ is readable by slurm # TODO: think this clashes with rebuild? + file: + path: /etc/openstack/ + state: directory + owner: slurm + group: slurm + mode: u=rx - name: Setup slurm tools include_role: name: stackhpc.slurm_openstack_tools.pytools @@ -22,3 +29,4 @@ owner: slurm group: slurm mode: u=rwx,go= + tags: resume From a618acac172048f7be5c295ed9542d34671f18cf Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 31 Mar 2021 10:03:54 +0000 Subject: [PATCH 005/133] fix clouds.yaml --- .../sausage-autoscale/inventory/group_vars/autoscale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml index 6976f8117..5216ebbc8 100644 --- a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml +++ b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml @@ -1 +1 @@ -openhpc_autoscale_clouds: ~/steveb-openrc.sh \ No newline at end of file +openhpc_autoscale_clouds: /home/centos/steveb-clouds.yaml \ No newline at end of file From 341a5c99668c33522a77d35ce70322ef90edde11 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 31 Mar 2021 12:50:12 +0000 Subject: [PATCH 006/133] get resume/suspend scripts working manually --- ansible/autoscale.yml | 3 +- ansible/templates/resume.j2 | 50 +++++++++++++++---- ansible/templates/suspend.j2 | 43 +++++++++++++--- .../inventory/group_vars/autoscale.yml | 7 ++- 4 files changed, 83 insertions(+), 20 deletions(-) diff --git a/ansible/autoscale.yml b/ansible/autoscale.yml index 9a4bb4879..d9dc9b057 100644 --- a/ansible/autoscale.yml +++ b/ansible/autoscale.yml @@ -18,10 +18,11 @@ - name: Create SuspendProgram template: src: suspend.j2 - dest: /opt/slurm-tools/bin/suspend.sh + dest: /opt/slurm-tools/bin/suspend owner: slurm group: slurm mode: u=rwx,go= + tags: suspend - name: Create ResumeProgram template: src: resume.j2 diff --git a/ansible/templates/resume.j2 b/ansible/templates/resume.j2 index 748b5f270..bf79c6c04 100644 --- a/ansible/templates/resume.j2 +++ b/ansible/templates/resume.j2 @@ -1,22 +1,52 @@ #!/opt/slurm-tools/bin/python3 """ Create OpenStack instances """ -import sys, subprocess +import sys, subprocess, logging.handlers +import openstack +import pprint -# configure logging to syslog - by default only "info" -# and above categories appear +# all take a name or ID: +IMAGE = "{{ openhpc_autoscale_image }}" +NETWORK = "{{ openhpc_autoscale_network }}" +FLAVOR = "{{ openhpc_autoscale_flavor }}" +KEYPAIR = "{{ openhpc_autoscale_keypair }}" + +# configure logging to syslog - by default only "info" and above categories appear logger = 
logging.getLogger("syslogger") logger.setLevel(logging.DEBUG) handler = logging.handlers.SysLogHandler("/dev/log") logger.addHandler(handler) def expand_nodes(hostlist_expr): - scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) # TODO: pass full path to binary + scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) return scontrol.stdout.strip().split('\n') -def launch(): - hostlist_expr = sys.argv[1:] - logger.info(f"Resume invoked for %{hostexphostlist_expr}") - nodes = expand_nodes(hostlist_expr) - for node in nodes: - logger.info(f"TODO: Resume node %{node}") +def create_server(conn, name): + + image = conn.compute.find_image(IMAGE) + flavor = conn.compute.find_flavor(FLAVOR) + network = conn.network.find_network(NETWORK) + keypair = conn.compute.find_keypair(KEYPAIR) + + server = conn.compute.create_server( + name=name, image_id=image.id, flavor_id=flavor.id, + networks=[{"uuid": network.id}], key_name=keypair.name) + + #server = conn.compute.wait_for_server(server) + return server + +def resume(): + hostlist_expr = sys.argv[1] + logger.info(f"Slurmctld invoked resume {hostlist_expr}") + new_nodes = expand_nodes(hostlist_expr) + + conn = openstack.connection.from_config() + logger.info(f"Got openstack connection {conn}") + + for node in new_nodes: + logger.info(f"creating node {node}") + server = create_server(conn, node) + logger.info(f"server: {server}") + +if __name__ == "__main__": + sys.exit(resume()) diff --git a/ansible/templates/suspend.j2 b/ansible/templates/suspend.j2 index 1df641b6d..02d09bc0d 100644 --- a/ansible/templates/suspend.j2 +++ b/ansible/templates/suspend.j2 @@ -1,8 +1,35 @@ -#!/bin/bash -# Example SuspendProgram -echo "`date` Suspend invoked $0 $*" >>/var/log/power_save.log -hosts=`scontrol show hostnames $1` -for host in $hosts -do - openstack server delete $host -done +#!/opt/slurm-tools/bin/python3 +""" Delete openstack instances """ + +import sys, subprocess, logging, logging.handlers +import openstack +import pprint + +# configure logging to syslog - by default only "info" and above categories appear +logger = logging.getLogger("syslogger") +logger.setLevel(logging.DEBUG) +handler = logging.handlers.SysLogHandler("/dev/log") +logger.addHandler(handler) + +def expand_nodes(hostlist_expr): + scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) + return scontrol.stdout.strip().split('\n') + +def delete_server(conn, name): + server = conn.compute.find_server(name) + conn.compute.delete_server(server) + +def suspend(): + hostlist_expr = sys.argv[1] + logger.info(f"Slurmctld invoked suspend {hostlist_expr}") + remove_nodes = expand_nodes(hostlist_expr) + + conn = openstack.connection.from_config() + logger.info(f"Got openstack connection {conn}") + + for node in remove_nodes: + logger.info(f"deleting node {node}") + delete_server(conn, node) + +if __name__ == "__main__": + sys.exit(suspend()) diff --git a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml index 5216ebbc8..a87c5afe9 100644 --- a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml +++ b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml @@ -1 +1,6 @@ -openhpc_autoscale_clouds: /home/centos/steveb-clouds.yaml \ No newline at end of file +openhpc_autoscale_clouds: 
/home/centos/steveb-clouds.yaml +# TODO: change below to be defined somewhere else +openhpc_autoscale_image: CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64 # TODO change to built image +openhpc_autoscale_network: stackhpc +openhpc_autoscale_flavor: chipolata +openhpc_autoscale_keypair: steveb-local From 99fe7adfa24a5a8b5282147770bf030a49e9e3f7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 08:59:48 +0000 Subject: [PATCH 007/133] note issue with adhoc slurm restart for combined headnode --- ansible/adhoc/restart-slurm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/adhoc/restart-slurm.yml b/ansible/adhoc/restart-slurm.yml index 41b9dcb50..cf523ddee 100644 --- a/ansible/adhoc/restart-slurm.yml +++ b/ansible/adhoc/restart-slurm.yml @@ -20,7 +20,7 @@ name: slurmctld state: restarted -- hosts: compute,login +- hosts: compute,login # FIXME: doesn't work if using `login` as combined slurmctld become: yes gather_facts: no tasks: From a956a549bb61505986e6fd2a1fc810bdca1c5a00 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 09:06:31 +0000 Subject: [PATCH 008/133] fix openhpc variables for autoscale --- .../sausage-autoscale/inventory/group_vars/openhpc.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml index 9f69b0f9f..7a9795419 100644 --- a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml +++ b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml @@ -16,11 +16,12 @@ openhpc_slurm_conf: # BELOW FOR TESTING ONLY, NOT PRODUCTION VALUES! SuspendTime: 120 SuspendTimeout: 300 - SuspendProgram: /opt/slurm-tools/bin/suspend.sh + SuspendProgram: /opt/slurm-tools/bin/suspend ResumeProgram: /opt/slurm-tools/bin/resume ResumeTimeout: 300 SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns,power_save_interval=10,power_save_min_interval=0 - NOTES: + # FIXME: need to set TreeWidth to >= number of nodes (default: 50) + # NOTES: # enable_configless: required (as set in template) # cloud_dns: requires working DNS # power_save_*interval: options are defaults but should enable changes From 4ea81c5fc015a899b9c0cf91df85c57a9b50656c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 09:06:47 +0000 Subject: [PATCH 009/133] set new image ID --- .../sausage-autoscale/inventory/group_vars/autoscale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml index a87c5afe9..275b938e7 100644 --- a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml +++ b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml @@ -1,6 +1,6 @@ openhpc_autoscale_clouds: /home/centos/steveb-clouds.yaml # TODO: change below to be defined somewhere else -openhpc_autoscale_image: CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64 # TODO change to built image +openhpc_autoscale_image: 1c3025f4-8384-4f3f-946e-8ce3b8e32292 openhpc_autoscale_network: stackhpc openhpc_autoscale_flavor: chipolata openhpc_autoscale_keypair: steveb-local From 354c67a36886ef605dd70aa3a18e57ee6dca085a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 09:07:19 +0000 Subject: [PATCH 010/133] set autoscale branch for openhpc role requirements --- requirements.yml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/requirements.yml b/requirements.yml index c945cb931..90f280107 100644 --- a/requirements.yml +++ b/requirements.yml @@ -2,6 +2,7 @@ roles: - src: stackhpc.nfs - src: stackhpc.openhpc + version: feature/autoscale - src: cloudalchemy.node_exporter - src: cloudalchemy.blackbox-exporter - src: cloudalchemy.prometheus From c74a271c11b2587bda26a8f3baabb45060517021 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 16:17:59 +0000 Subject: [PATCH 011/133] fix /etc/openstack for autoscale --- ansible/autoscale.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ansible/autoscale.yml b/ansible/autoscale.yml index d9dc9b057..2d5fba68a 100644 --- a/ansible/autoscale.yml +++ b/ansible/autoscale.yml @@ -1,3 +1,10 @@ +- name: Ensure /etc/openstack/ exists and is readable by slurm # TODO: think this clashes with rebuild? + file: + path: /etc/openstack/ + state: directory + owner: slurm + group: slurm + mode: u=rx - name: Copy out clouds.yaml copy: src: "{{ openhpc_autoscale_clouds }}" @@ -5,13 +12,6 @@ owner: slurm group: slurm mode: '0400' -- name: Ensure /etc/openstack/ is readable by slurm # TODO: think this clashes with rebuild? - file: - path: /etc/openstack/ - state: directory - owner: slurm - group: slurm - mode: u=rx - name: Setup slurm tools include_role: name: stackhpc.slurm_openstack_tools.pytools From 73eed3962b5d3ecf590aee357a7f6a1b210ad4a4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 16:18:40 +0000 Subject: [PATCH 012/133] remove SlurmctldParameters unsupported in slurm 20.02.5 --- environments/sausage-autoscale/inventory/group_vars/openhpc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml index 7a9795419..5dc8bedfe 100644 --- a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml +++ b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml @@ -19,7 +19,7 @@ openhpc_slurm_conf: SuspendProgram: /opt/slurm-tools/bin/suspend ResumeProgram: /opt/slurm-tools/bin/resume ResumeTimeout: 300 - SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns,power_save_interval=10,power_save_min_interval=0 + SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns # FIXME: need to set TreeWidth to >= number of nodes (default: 50) # NOTES: # enable_configless: required (as set in template) From 967d1077e9ffebc6a281716c117854a7390cec32 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 16:19:48 +0000 Subject: [PATCH 013/133] use openhpc_munge_key parameter --- environments/common/inventory/group_vars/all/openhpc.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index f1bae139b..b6d8abacf 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -15,7 +15,6 @@ openhpc_slurmdbd_mysql_username: slurm openhpc_slurm_control_host: "{{ groups['control'] | first }}" openhpc_slurm_partitions: - name: "compute" -<<<<<<< HEAD # cloud_nodes: 2 openhpc_default_packages: - slurm-libpmi-ohpc # to allow intel mpi to work properly @@ -23,10 +22,5 @@ openhpc_default_packages: openhpc_extra_packages: [] openhpc_packages: "{{ openhpc_default_packages + openhpc_extra_packages }}" openhpc_munge_key: "{{ secrets_openhpc_mungekey | b64decode }}" -======= 
-openhpc_packages: - - slurm-libpmi-ohpc -slurm_munge_key: "{{ secrets_openhpc_mungekey }}" ->>>>>>> b8d9eba... move cloud_nodes config to right environment openhpc_slurm_configless: true openhpc_login_only_nodes: login \ No newline at end of file From 94de0995710bb82d469f2bf619f6f409ccb4832c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 18:36:30 +0000 Subject: [PATCH 014/133] don't cache node ips in slurm --- environments/sausage-autoscale/inventory/group_vars/openhpc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml index 5dc8bedfe..266f1fe65 100644 --- a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml +++ b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml @@ -20,6 +20,7 @@ openhpc_slurm_conf: ResumeProgram: /opt/slurm-tools/bin/resume ResumeTimeout: 300 SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns + CommunicationParameters: NoAddrCache # FIXME: need to set TreeWidth to >= number of nodes (default: 50) # NOTES: # enable_configless: required (as set in template) From 99793ad8214e799bb8029c217c42f7c24fd907b1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 Apr 2021 18:38:02 +0000 Subject: [PATCH 015/133] tune slurm debug info for powersave only --- .../sausage-autoscale/inventory/group_vars/openhpc.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml index 266f1fe65..c7415bb45 100644 --- a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml +++ b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml @@ -10,7 +10,8 @@ openhpc_packages: openhpc_slurm_conf: SlurmctldDebug: debug5 SlurmctldLogFile: /var/log/slurmctld.log - SlurmdDebug: debug5 + # SlurmdDebug: debug5 + DebugFlags: PowerSave SlurmdLogFile: /var/log/slurmd.log SuspendExcNodes: "{{ groups['compute'] | join(',') }}" # i.e. all tf-defined nodes in the partition # BELOW FOR TESTING ONLY, NOT PRODUCTION VALUES! 
From 1a3fd48c012057b2c3f7b1bca9d6c586245cff16 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 6 Apr 2021 12:40:19 +0000 Subject: [PATCH 016/133] use default security groups --- .../sausage-autoscale/terraform/main.tf | 49 ------------------- 1 file changed, 49 deletions(-) diff --git a/environments/sausage-autoscale/terraform/main.tf b/environments/sausage-autoscale/terraform/main.tf index 1523eeae5..fb9931dec 100644 --- a/environments/sausage-autoscale/terraform/main.tf +++ b/environments/sausage-autoscale/terraform/main.tf @@ -43,53 +43,6 @@ variable "compute_image" { type = string } -resource "openstack_networking_secgroup_v2" "secgroup_slurm_login" { - name = "secgroup_slurm_login" - description = "Rules for the slurm login node" - # Fully manage with terraform - delete_default_rules = true -} - -resource "openstack_networking_secgroup_v2" "secgroup_slurm_compute" { - name = "secgroup_slurm_compute" - description = "Rules for the slurm compute node" - # Fully manage with terraform - delete_default_rules = true -} - -resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_egress_v4" { - direction = "egress" - ethertype = "IPv4" - security_group_id = openstack_networking_secgroup_v2.secgroup_slurm_login.id -} - -resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_ingress_tcp_v4" { - direction = "ingress" - ethertype = "IPv4" - # NOTE: You will want to lock down the ports in a production environment. This will require - # setting of static ports for the NFS server see: - # https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/6/html/storage_administration_guide/s2-nfs-nfs-firewall-config - port_range_min = 1 - protocol = "tcp" - port_range_max = 65535 - security_group_id = openstack_networking_secgroup_v2.secgroup_slurm_login.id -} - -resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_compute_rule_egress_v4" { - direction = "egress" - ethertype = "IPv4" - security_group_id = openstack_networking_secgroup_v2.secgroup_slurm_compute.id -} - -resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_compute_rule_ingress_tcp_v4" { - direction = "ingress" - ethertype = "IPv4" - port_range_min = 1 - protocol = "tcp" - port_range_max = 65535 - security_group_id = openstack_networking_secgroup_v2.secgroup_slurm_compute.id -} - resource "openstack_compute_instance_v2" "login" { name = "${var.cluster_name}-login-0" @@ -99,7 +52,6 @@ resource "openstack_compute_instance_v2" "login" { network { name = var.network } - security_groups = [openstack_networking_secgroup_v2.secgroup_slurm_login.name] } @@ -115,7 +67,6 @@ resource "openstack_compute_instance_v2" "compute" { network { name = var.network } - security_groups = [openstack_networking_secgroup_v2.secgroup_slurm_compute.name] } # TODO: needs fixing for case where creation partially fails resulting in "compute.network is empty list of object" From b9921610d71adea7911263d06895f291ba89231b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 6 Apr 2021 12:40:54 +0000 Subject: [PATCH 017/133] remove ssh proxying from inventory --- environments/sausage-autoscale/terraform/inventory.tpl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/environments/sausage-autoscale/terraform/inventory.tpl b/environments/sausage-autoscale/terraform/inventory.tpl index 361a359ec..965f1f330 100644 --- a/environments/sausage-autoscale/terraform/inventory.tpl +++ b/environments/sausage-autoscale/terraform/inventory.tpl @@ -1,6 +1,5 @@ [all:vars] ansible_user=centos 
-ssh_proxy=${login.network[0].fixed_ip_v4} openhpc_cluster_name=${cluster_name} [${cluster_name}_login] @@ -11,13 +10,9 @@ ${login.name} ansible_host=${login.network[0].fixed_ip_v4} server_networks='${js ${compute.name} ansible_host=${compute.network[0].fixed_ip_v4} server_networks='${jsonencode({for net in compute.network: net.name => [ net.fixed_ip_v4 ] })}' %{ endfor ~} -[${cluster_name}_compute:vars] -ansible_ssh_common_args='-o ProxyCommand="ssh centos@${login.network[0].fixed_ip_v4} -W %h:%p"' - [cluster_login:children] ${cluster_name}_login -# NOTE: This is hardcoded in the tests role [cluster_compute:children] ${cluster_name}_compute From 0ebba2033ea18d5426562ddc225b2811d5127596 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 6 Apr 2021 12:41:46 +0000 Subject: [PATCH 018/133] add helloworld MPI program setup --- environments/sausage-autoscale/hooks/post.yml | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 environments/sausage-autoscale/hooks/post.yml diff --git a/environments/sausage-autoscale/hooks/post.yml b/environments/sausage-autoscale/hooks/post.yml new file mode 100644 index 000000000..fa23fb4a6 --- /dev/null +++ b/environments/sausage-autoscale/hooks/post.yml @@ -0,0 +1,54 @@ +- hosts: login + gather_facts: false + tasks: + - name: make helloworld directory + file: + path: /mnt/nfs/helloworld + state: directory + owner: centos + group: centos + become: yes + + - name: make helloworld source + copy: + dest: /mnt/nfs/helloworld/helloworld.c + content: | + #include + #include + + int main(int argc, char** argv) { + // Initialize the MPI environment + MPI_Init(NULL, NULL); + + // Get the number of processes + int world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + // Get the rank of the process + int world_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + // Get the name of the processor + char processor_name[MPI_MAX_PROCESSOR_NAME]; + int name_len; + MPI_Get_processor_name(processor_name, &name_len); + // Print off a hello world message + printf("Hello world from processor %s, rank %d out of %d processors\n", + processor_name, world_rank, world_size); + // Finalize the MPI environment. 
+ MPI_Finalize(); + } + + - name: compile helloworld + shell: + cmd: mpicc -o helloworld helloworld.c + chdir: /mnt/nfs/helloworld/ + + - name: make helloworld sbatch script + copy: + dest: /mnt/nfs/helloworld/helloworld.sh + content: | + #!/bin/bash + #SBATCH --ntasks-per-node=1 + #SBATCH --time=0:10:0 + #SBATCH --exclusive + export SLURM_MPI_TYPE=pmix_v3 + srun helloworld From 79b0516d770ccb134a8386227c48414a323a4da3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 6 Apr 2021 12:42:31 +0000 Subject: [PATCH 019/133] specify NFS server by hostname not IP --- environments/sausage-autoscale/inventory/group_vars/nfs.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 environments/sausage-autoscale/inventory/group_vars/nfs.yml diff --git a/environments/sausage-autoscale/inventory/group_vars/nfs.yml b/environments/sausage-autoscale/inventory/group_vars/nfs.yml new file mode 100644 index 000000000..68b31e8b6 --- /dev/null +++ b/environments/sausage-autoscale/inventory/group_vars/nfs.yml @@ -0,0 +1 @@ +nfs_server_default: "{{ groups['control'] | first }}" \ No newline at end of file From 9f9430a86a61945eaa89b4b43368275aef359096 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 6 Apr 2021 12:44:51 +0000 Subject: [PATCH 020/133] update to latest built image --- .../sausage-autoscale/inventory/group_vars/autoscale.yml | 2 +- environments/sausage-autoscale/terraform/terraform.tfvars | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml index 275b938e7..88f8f57d6 100644 --- a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml +++ b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml @@ -1,6 +1,6 @@ openhpc_autoscale_clouds: /home/centos/steveb-clouds.yaml # TODO: change below to be defined somewhere else -openhpc_autoscale_image: 1c3025f4-8384-4f3f-946e-8ce3b8e32292 +openhpc_autoscale_image: ohpc-compute-210406-1108.qcow2 openhpc_autoscale_network: stackhpc openhpc_autoscale_flavor: chipolata openhpc_autoscale_keypair: steveb-local diff --git a/environments/sausage-autoscale/terraform/terraform.tfvars b/environments/sausage-autoscale/terraform/terraform.tfvars index 04bfb7ade..3e4a4d92e 100644 --- a/environments/sausage-autoscale/terraform/terraform.tfvars +++ b/environments/sausage-autoscale/terraform/terraform.tfvars @@ -6,5 +6,5 @@ network = "stackhpc" login_image = "CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64" login_flavor = "chipolata" -compute_image = "CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64" +compute_image = "ohpc-compute-210406-1108.qcow2" compute_flavor = "chipolata" From 95a8ed27a076c4b589ea67f4d8a4c41325b45f51 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 6 Apr 2021 13:03:26 +0000 Subject: [PATCH 021/133] remove inventory hosts file from git --- .../sausage-autoscale/inventory/.gitignore | 1 + .../sausage-autoscale/inventory/hosts | 31 ------------------- 2 files changed, 1 insertion(+), 31 deletions(-) create mode 100644 environments/sausage-autoscale/inventory/.gitignore delete mode 100755 environments/sausage-autoscale/inventory/hosts diff --git a/environments/sausage-autoscale/inventory/.gitignore b/environments/sausage-autoscale/inventory/.gitignore new file mode 100644 index 000000000..9b0b900ab --- /dev/null +++ b/environments/sausage-autoscale/inventory/.gitignore @@ -0,0 +1 @@ +hosts \ No newline at end of file diff --git 
a/environments/sausage-autoscale/inventory/hosts b/environments/sausage-autoscale/inventory/hosts deleted file mode 100755 index 203a41376..000000000 --- a/environments/sausage-autoscale/inventory/hosts +++ /dev/null @@ -1,31 +0,0 @@ -[all:vars] -ansible_user=centos -ssh_proxy=10.0.3.100 -openhpc_cluster_name=sbscale - -[sbscale_login] -sbscale-login-0 ansible_host=10.0.3.100 server_networks='{"stackhpc":["10.0.3.100"]}' - -[sbscale_compute] -sbscale-compute-0 ansible_host=10.0.3.107 server_networks='{"stackhpc":["10.0.3.107"]}' -sbscale-compute-1 ansible_host=10.0.3.71 server_networks='{"stackhpc":["10.0.3.71"]}' - -[sbscale_compute:vars] -ansible_ssh_common_args='-o ProxyCommand="ssh centos@10.0.3.100 -W %h:%p"' - -[cluster_login:children] -sbscale_login - -# NOTE: This is hardcoded in the tests role -[cluster_compute:children] -sbscale_compute - -[login:children] -cluster_login - -[compute:children] -cluster_compute - -[cluster:children] -login -compute \ No newline at end of file From 510a1bf85a6c26967a49ae5f4b6c5a1a6237d6bc Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 8 Apr 2021 12:36:02 +0000 Subject: [PATCH 022/133] show cloud nodes even when powered off --- environments/sausage-autoscale/inventory/group_vars/openhpc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml index c7415bb45..3ac9948f8 100644 --- a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml +++ b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml @@ -22,6 +22,7 @@ openhpc_slurm_conf: ResumeTimeout: 300 SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns CommunicationParameters: NoAddrCache + PrivateData: cloud # FIXME: need to set TreeWidth to >= number of nodes (default: 50) # NOTES: # enable_configless: required (as set in template) From 9392c39f2512f5f6157de32e1a47ec886a934fda Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 8 Apr 2021 13:58:05 +0000 Subject: [PATCH 023/133] revert compute image to vanilla cento8.2 --- environments/sausage-autoscale/terraform/terraform.tfvars | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/sausage-autoscale/terraform/terraform.tfvars b/environments/sausage-autoscale/terraform/terraform.tfvars index 3e4a4d92e..04bfb7ade 100644 --- a/environments/sausage-autoscale/terraform/terraform.tfvars +++ b/environments/sausage-autoscale/terraform/terraform.tfvars @@ -6,5 +6,5 @@ network = "stackhpc" login_image = "CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64" login_flavor = "chipolata" -compute_image = "ohpc-compute-210406-1108.qcow2" +compute_image = "CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64" compute_flavor = "chipolata" From 9467973554645dfc39bb4251a7c3db88dde97dce Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 8 Sep 2021 13:55:11 +0000 Subject: [PATCH 024/133] remove sausagecloud environment --- environments/sausage-autoscale/README.md | 176 ------------------ environments/sausage-autoscale/activate | 23 --- environments/sausage-autoscale/ansible.cfg | 14 -- environments/sausage-autoscale/hooks/.gitkeep | 0 environments/sausage-autoscale/hooks/post.yml | 54 ------ .../sausage-autoscale/inventory/.gitignore | 1 - .../inventory/group_vars/all/.gitkeep | 0 .../inventory/group_vars/all/secrets.yml | 43 ----- .../inventory/group_vars/autoscale.yml | 6 - .../inventory/group_vars/nfs.yml | 1 - .../inventory/group_vars/openhpc.yml | 31 --- 
.../inventory/group_vars/rebuild.yml | 1 - .../sausage-autoscale/inventory/groups | 17 -- .../terraform/.terraform.lock.hcl | 39 ---- .../sausage-autoscale/terraform/inventory.tpl | 27 --- .../sausage-autoscale/terraform/main.tf | 82 -------- .../terraform/terraform.tfvars | 10 - 17 files changed, 525 deletions(-) delete mode 100644 environments/sausage-autoscale/README.md delete mode 100644 environments/sausage-autoscale/activate delete mode 100644 environments/sausage-autoscale/ansible.cfg delete mode 100644 environments/sausage-autoscale/hooks/.gitkeep delete mode 100644 environments/sausage-autoscale/hooks/post.yml delete mode 100644 environments/sausage-autoscale/inventory/.gitignore delete mode 100644 environments/sausage-autoscale/inventory/group_vars/all/.gitkeep delete mode 100644 environments/sausage-autoscale/inventory/group_vars/all/secrets.yml delete mode 100644 environments/sausage-autoscale/inventory/group_vars/autoscale.yml delete mode 100644 environments/sausage-autoscale/inventory/group_vars/nfs.yml delete mode 100644 environments/sausage-autoscale/inventory/group_vars/openhpc.yml delete mode 100644 environments/sausage-autoscale/inventory/group_vars/rebuild.yml delete mode 100644 environments/sausage-autoscale/inventory/groups delete mode 100644 environments/sausage-autoscale/terraform/.terraform.lock.hcl delete mode 100644 environments/sausage-autoscale/terraform/inventory.tpl delete mode 100644 environments/sausage-autoscale/terraform/main.tf delete mode 100644 environments/sausage-autoscale/terraform/terraform.tfvars diff --git a/environments/sausage-autoscale/README.md b/environments/sausage-autoscale/README.md deleted file mode 100644 index 69f25a09e..000000000 --- a/environments/sausage-autoscale/README.md +++ /dev/null @@ -1,176 +0,0 @@ -# Sausage-Autoscale cluster - -Dev env for autoscaling on sausagecloud - -# Directory structure - -## terraform - -Contains terraform configuration to deploy infrastructure. - -## inventory - -Ansible inventory for configuring the infrastructure. - -# Setup - -In the repo root, run: - - python3 -m venv venv # TODO: do we need system-site-packages? - . venv/bin/activate - pip install -U upgrade pip - pip install requirements.txt - ansible-galaxy install -r requirements.yml -p ansible/roles - ansible-galaxy collection install -r requirements.yml -p ansible/collections # don't worry about collections path warning - -# Activating the environment - -There is a small environment file that you must `source` which defines environment -variables that reference the configuration path. This is so that we can locate -resources relative the environment directory. - - . environments/sausage-autoscale/activate - -The pattern we use is that all resources referenced in the inventory -are located in the environment directory containing the inventory that -references them. - -# Common configuration - -Configuarion is shared by specifiying multiple inventories. We reference the `common` -inventory from `ansible.cfg`, including it before the environment specific -inventory, located at `./inventory`. - -Inventories specified later in the list can override values set in the inventories -that appear earlier. This allows you to override values set by the `common` inventory. - -Any variables that would be identical for all environments should be defined in the `common` inventory. - -# Passwords - -Prior to running any other playbooks, you need to define a set of passwords. 
You can -use the `generate-passwords.yml` playbook to automate this process: - -``` -cd -ansible-playbook ansible/adhoc/generate-passwords.yml # can actually be run from anywhere once environment activated -``` - -This will output a set of passwords `inventory/group_vars/all/secrets.yml`. -Placing them in the inventory means that they will be defined for all playbooks. - -It is recommended to encrypt the contents of this file prior to commiting to git: - -``` -ansible-vault encrypt inventory/group_vars/all/secrets.yml -``` - -You will then need to provide a password when running the playbooks e.g: - -``` -ansible-playbook ../ansible/site.yml --tags grafana --ask-vault-password -``` - -See the [Ansible vault documentation](https://docs.ansible.com/ansible/latest/user_guide/vault.html) for more details. - - -# Deploy nodes with Terraform - -- Modify the keypair in `main.tf` and ensure the required Centos images are available on OpenStack. -- Activate the virtualenv and create the instances: - - . venv/bin/activate - cd environments/sausage-autoscale/ - terraform apply - -This creates an ansible inventory file `./inventory`. - -Note that this terraform deploys instances onto an existing network - for production use you probably want to create a network for the cluster. - -# Create and configure cluster with Ansible - -Now run one or more playbooks using: - - cd - ansible-playbook ansible/site.yml - -This provides: -- grafana at `http://:3000` - username `grafana`, password as set above -- prometheus at `http://:9090` - -NB: if grafana's yum repos are down you will see `Errors during downloading metadata for repository 'grafana' ...`. You can work around this using: - - ssh centos@ - sudo rm -rf /etc/yum.repos.d/grafana.repo - wget https://dl.grafana.com/oss/release/grafana-7.3.1-1.x86_64.rpm - sudo yum install grafana-7.3.1-1.x86_64.rpm - exit - ansible-playbook -i inventory monitoring.yml -e grafana_password= --skip-tags grafana_install - -# rebuild.yml - -# FIXME: outdated - -Enable the compute nodes of a Slurm-based OpenHPC cluster on Openstack to be reimaged from Slurm. - -For full details including the Slurm commmands to use see the [role's README](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools/blob/main/roles/rebuild/README.md) - -Ensure you have `~/.config/openstack/clouds.yaml` defining authentication for a a single Openstack cloud (see above README to change location). - -Then run: - - ansible-playbook -i inventory rebuild.yml - -Note this does not rebuild the nodes, only deploys the tools to do so. - -# test.yml - -This runs MPI-based tests on the cluster: -- `pingpong`: Runs Intel MPI Benchmark's IMB-MPI1 pingpong between a pair of (scheduler-selected) nodes. Reports zero-size message latency and maximum bandwidth. -- `pingmatrix`: Runs a similar pingpong test but between all pairs of nodes. Reports zero-size message latency & maximum bandwidth. -- `hpl-solo`: Runs HPL **separately** on all nodes, using 80% of memory, reporting Gflops on each node. - -These names can be used as tags to run only a subset of tests. For full details see the [role's README](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools/blob/main/roles/test/README.md). 
- -Note these are intended as post-deployment tests for a cluster to which you have root access - they are **not** intended for use on a system running production jobs: -- Test directories are created within `openhpc_tests_rootdir` (here `/mnt/nfs/ohcp-tests`) which must be on a shared filesystem (read/write from login/control and compute nodes) -- Generally, packages are only installed on the control/login node, and `/opt` is exported via NFS to the compute nodes. -- The exception is the `slurm-libpmi-ohpc` package (required for `srun` with Intel MPI) which is installed on all nodes. - -To achieve best performance for HPL set `openhpc_tests_hpl_NB` in [test.yml](test.yml) to the appropriate the HPL blocksize 'NB' for the compute node processor - for Intel CPUs see [here](https://software.intel.com/content/www/us/en/develop/documentation/mkl-linux-developer-guide/top/intel-math-kernel-library-benchmarks/intel-distribution-for-linpack-benchmark/configuring-parameters.html). - -Then run: - - ansible-playbook ../ansible/adhoc/test.yml - -Results will be reported in the ansible stdout - the pingmatrix test also writes an html results file onto the ansible host. - -Note that you can still use the `test.yml` playbook even if the terraform/ansible in this repo wasn't used to deploy the cluster - as long as it's running OpenHPC v2. Simply create an appropriate `inventory` file, e.g: - - [all:vars] - ansible_user=centos - - [cluster:children] - cluster_login - cluster_compute - - [cluster_login] - slurm-control - - [cluster_compute] - cpu-h21a5-u3-svn2 - cpu-h21a5-u3-svn4 - ... - -And run the `test.yml` playbook as described above. If you want to run tests only on a group from this inventory, rather than an entire partition, you can -use ``--limit`` - -Then running the tests passing this file as extra_vars: - - ansible-playbook ../ansible/test.yml --limit group-in-inventory - -# Destroying the cluster - -When finished, run: - - terraform destroy --auto-approve diff --git a/environments/sausage-autoscale/activate b/environments/sausage-autoscale/activate deleted file mode 100644 index e74031095..000000000 --- a/environments/sausage-autoscale/activate +++ /dev/null @@ -1,23 +0,0 @@ -export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) -echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" - -APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) -export PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" - -export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") -echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" - -export TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") -echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root" - -export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") -echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root" - -export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT") -echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" - -if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then - export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg -fi - - diff --git a/environments/sausage-autoscale/ansible.cfg b/environments/sausage-autoscale/ansible.cfg deleted file mode 100644 index c243e9958..000000000 --- a/environments/sausage-autoscale/ansible.cfg +++ /dev/null @@ -1,14 +0,0 @@ -[defaults] -any_errors_fatal = True -stdout_callback = debug -stderr_callback = debug -gathering = smart -forks = 30 
-host_key_checking = False -inventory = ../common/inventory,inventory -collections_path = ../../ansible/collections -roles_path = ../../ansible/roles - -[ssh_connection] -ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null -pipelining = True diff --git a/environments/sausage-autoscale/hooks/.gitkeep b/environments/sausage-autoscale/hooks/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/environments/sausage-autoscale/hooks/post.yml b/environments/sausage-autoscale/hooks/post.yml deleted file mode 100644 index fa23fb4a6..000000000 --- a/environments/sausage-autoscale/hooks/post.yml +++ /dev/null @@ -1,54 +0,0 @@ -- hosts: login - gather_facts: false - tasks: - - name: make helloworld directory - file: - path: /mnt/nfs/helloworld - state: directory - owner: centos - group: centos - become: yes - - - name: make helloworld source - copy: - dest: /mnt/nfs/helloworld/helloworld.c - content: | - #include - #include - - int main(int argc, char** argv) { - // Initialize the MPI environment - MPI_Init(NULL, NULL); - - // Get the number of processes - int world_size; - MPI_Comm_size(MPI_COMM_WORLD, &world_size); - // Get the rank of the process - int world_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); - // Get the name of the processor - char processor_name[MPI_MAX_PROCESSOR_NAME]; - int name_len; - MPI_Get_processor_name(processor_name, &name_len); - // Print off a hello world message - printf("Hello world from processor %s, rank %d out of %d processors\n", - processor_name, world_rank, world_size); - // Finalize the MPI environment. - MPI_Finalize(); - } - - - name: compile helloworld - shell: - cmd: mpicc -o helloworld helloworld.c - chdir: /mnt/nfs/helloworld/ - - - name: make helloworld sbatch script - copy: - dest: /mnt/nfs/helloworld/helloworld.sh - content: | - #!/bin/bash - #SBATCH --ntasks-per-node=1 - #SBATCH --time=0:10:0 - #SBATCH --exclusive - export SLURM_MPI_TYPE=pmix_v3 - srun helloworld diff --git a/environments/sausage-autoscale/inventory/.gitignore b/environments/sausage-autoscale/inventory/.gitignore deleted file mode 100644 index 9b0b900ab..000000000 --- a/environments/sausage-autoscale/inventory/.gitignore +++ /dev/null @@ -1 +0,0 @@ -hosts \ No newline at end of file diff --git a/environments/sausage-autoscale/inventory/group_vars/all/.gitkeep b/environments/sausage-autoscale/inventory/group_vars/all/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/environments/sausage-autoscale/inventory/group_vars/all/secrets.yml b/environments/sausage-autoscale/inventory/group_vars/all/secrets.yml deleted file mode 100644 index 29f86e2dd..000000000 --- a/environments/sausage-autoscale/inventory/group_vars/all/secrets.yml +++ /dev/null @@ -1,43 +0,0 @@ ---- -# Ansible managed -secrets_openhpc_elasticsearch_admin_password: pC2:.3c1QShckz1BxB5u -secrets_openhpc_elasticsearch_kibana_password: TVevBQ2JXTyoSW.bT_ba -secrets_openhpc_grafana_admin_password: QMiyymbFD.YC7M.39mCa -secrets_openhpc_mungekey: - content: 'YCbG2tA9UWIezb3xRRqK2vNzd6tOnGMs0kpPO+quHuqGXLsXagFjXq8Kqnd1/UIqoRW/TH3AfRNF - - yhuvemkRd3TlZLrvBZzMcQw0jAP2sI/+4hPpeIEk1kcQFVgE9A1HppLc0CxI7SskDPmY3vGwnuo7 - - 4K19jYxgPkIb9FUKCNetKgHR7L78LsbZxWUYkmvO6rCDUeLgMchFkjoi5Lwr+i1JJAxoGhT0yLmW - - D0hEenYePgsefzopwEnKEHByhnx0ROlJ86S58bh+rOnAqBWWJ8Im71NeJS58Moyrh9VLOkmRUCIj - - e0bhEKd7+/a5I4GN6KIo1oXRT74TxVHkwypSqFgAbVF5KMSsuY+5eG4JLcpTOGZYQpbAY9ICtnjM - - 
U6T6YhVXYvurVcb7N2ybub8veIwWeS98Yr2C9ZwsBzvpA2Fk3wxCFjo6vxe47U2nsezIUAUxVH7U - - V6jNMVoT4GZMQcKRsTp0zoAVAund6jMjsQ6h6Ly8EYyiKz6itTq4L5OqotZ0tUCX6xnVxtaD4LZb - - tQfZbcxPdd1C7NtTfImUsxHDp2CBIu/VDZWen/iafGaPeI83XVkC8Kk1QwhjBcRnlJEw2cK4TdBW - - 6Soy8CnNZMd92iqlqIZs7iZHu9FLyLiLCrkjaDnxM0UH0RP9CPTihiE47w874HVOQioacNX6U3Dz - - 3I0vxUAd/AF6ZrmBtOh3EekbxPtFNY7Gw3qPCbbJM6v5+XFjz//Lj1GFYzGK1DA7jhekzk5vtOWe - - k2vZcyqPYOIxFlqtm3OGt+f6V9G/xvYvRofE1EbO9qU1vqVRbW8Z7dqOR4AwomW2UlhH9G/ijLZZ - - EKqOWiCVONfMEe+Cndi/WH80R/nASx8hMJrTp0VOPtNjN+LWb/pPE/cSY9hbuA2EvqJB4gFQzmqz - - sFpQAqPVS8/+vesiKKVcnxUeMoRBx8g9CmdFTIvz5fU6M9lh7DjYoKcKx7eKtQhCAktyeI21o+Tn - - 2gyALzcxX29VCJy/8n/8qC26T9wLjN2URpO95yT2+k+Uv96R6Uj4zK4CD2c7yXm/0CmyrUWf3EPp - - VeaaWhy+KKR7T923TCEETiwSlwOynwb4lHLPmE17t8XBqYAqWGL2e8uDuLYhPAf+U5Bwt+LiXO5j - - hjg6szwpdSirDl1vpkqTDaOGdzVcTb+j8NfHDCdVOJbWu2I8sAHkjDRl+faagwxeMIGpTjoRi225 - - mj9rJdCbZxCSrwbE1r1koHrJZ+XN4AG4OrmTXdXMSLhpJuptyeNsRmvWxBe665tAxktRZ/kQUY3c - - W1zq03n3wtBkilL1wh/Fata4XrN5UZhpVSwT+7Z3gPacJVt5UjedkqpW8br+Pxw4efQExeDH2g==' -secrets_openhpc_mysql_root_password: XuY4ATpIzRje.PhkXI-t -secrets_openhpc_mysql_slurm_password: bS7GCWoTtsf4cjLo70S5 diff --git a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml b/environments/sausage-autoscale/inventory/group_vars/autoscale.yml deleted file mode 100644 index 88f8f57d6..000000000 --- a/environments/sausage-autoscale/inventory/group_vars/autoscale.yml +++ /dev/null @@ -1,6 +0,0 @@ -openhpc_autoscale_clouds: /home/centos/steveb-clouds.yaml -# TODO: change below to be defined somewhere else -openhpc_autoscale_image: ohpc-compute-210406-1108.qcow2 -openhpc_autoscale_network: stackhpc -openhpc_autoscale_flavor: chipolata -openhpc_autoscale_keypair: steveb-local diff --git a/environments/sausage-autoscale/inventory/group_vars/nfs.yml b/environments/sausage-autoscale/inventory/group_vars/nfs.yml deleted file mode 100644 index 68b31e8b6..000000000 --- a/environments/sausage-autoscale/inventory/group_vars/nfs.yml +++ /dev/null @@ -1 +0,0 @@ -nfs_server_default: "{{ groups['control'] | first }}" \ No newline at end of file diff --git a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml b/environments/sausage-autoscale/inventory/group_vars/openhpc.yml deleted file mode 100644 index 3ac9948f8..000000000 --- a/environments/sausage-autoscale/inventory/group_vars/openhpc.yml +++ /dev/null @@ -1,31 +0,0 @@ -openhpc_login_only_nodes: '' -openhpc_slurm_partitions: - - name: "compute" - cloud_nodes: 2 -openhpc_packages: - - slurm-libpmi-ohpc - - wget - - lmod-defaults-gnu9-openmpi4-ohpc - - imb-gnu9-openmpi4-ohpc -openhpc_slurm_conf: - SlurmctldDebug: debug5 - SlurmctldLogFile: /var/log/slurmctld.log - # SlurmdDebug: debug5 - DebugFlags: PowerSave - SlurmdLogFile: /var/log/slurmd.log - SuspendExcNodes: "{{ groups['compute'] | join(',') }}" # i.e. all tf-defined nodes in the partition - # BELOW FOR TESTING ONLY, NOT PRODUCTION VALUES! 
- SuspendTime: 120 - SuspendTimeout: 300 - SuspendProgram: /opt/slurm-tools/bin/suspend - ResumeProgram: /opt/slurm-tools/bin/resume - ResumeTimeout: 300 - SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns - CommunicationParameters: NoAddrCache - PrivateData: cloud - # FIXME: need to set TreeWidth to >= number of nodes (default: 50) - # NOTES: - # enable_configless: required (as set in template) - # cloud_dns: requires working DNS - # power_save_*interval: options are defaults but should enable changes - # reboot_from_controller: should be really useful but actually we're already setup for rebuild on computes, so use that diff --git a/environments/sausage-autoscale/inventory/group_vars/rebuild.yml b/environments/sausage-autoscale/inventory/group_vars/rebuild.yml deleted file mode 100644 index b2eba881a..000000000 --- a/environments/sausage-autoscale/inventory/group_vars/rebuild.yml +++ /dev/null @@ -1 +0,0 @@ -openhpc_rebuild_clouds: ~/steveb-openrc.sh diff --git a/environments/sausage-autoscale/inventory/groups b/environments/sausage-autoscale/inventory/groups deleted file mode 100644 index 4cec8ab8f..000000000 --- a/environments/sausage-autoscale/inventory/groups +++ /dev/null @@ -1,17 +0,0 @@ -[control:children] -login - -[nfs:children] -cluster - -[openhpc:children] -cluster - -[mysql:children] -control - -[rebuild:children] -compute - -[autoscale:children] -login # actually controller diff --git a/environments/sausage-autoscale/terraform/.terraform.lock.hcl b/environments/sausage-autoscale/terraform/.terraform.lock.hcl deleted file mode 100644 index 8f9e2298d..000000000 --- a/environments/sausage-autoscale/terraform/.terraform.lock.hcl +++ /dev/null @@ -1,39 +0,0 @@ -# This file is maintained automatically by "terraform init". -# Manual edits may be lost in future updates. 
- -provider "registry.terraform.io/hashicorp/local" { - version = "2.1.0" - hashes = [ - "h1:EYZdckuGU3n6APs97nS2LxZm3dDtGqyM4qaIvsmac8o=", - "zh:0f1ec65101fa35050978d483d6e8916664b7556800348456ff3d09454ac1eae2", - "zh:36e42ac19f5d68467aacf07e6adcf83c7486f2e5b5f4339e9671f68525fc87ab", - "zh:6db9db2a1819e77b1642ec3b5e95042b202aee8151a0256d289f2e141bf3ceb3", - "zh:719dfd97bb9ddce99f7d741260b8ece2682b363735c764cac83303f02386075a", - "zh:7598bb86e0378fd97eaa04638c1a4c75f960f62f69d3662e6d80ffa5a89847fe", - "zh:ad0a188b52517fec9eca393f1e2c9daea362b33ae2eb38a857b6b09949a727c1", - "zh:c46846c8df66a13fee6eff7dc5d528a7f868ae0dcf92d79deaac73cc297ed20c", - "zh:dc1a20a2eec12095d04bf6da5321f535351a594a636912361db20eb2a707ccc4", - "zh:e57ab4771a9d999401f6badd8b018558357d3cbdf3d33cc0c4f83e818ca8e94b", - "zh:ebdcde208072b4b0f8d305ebf2bfdc62c926e0717599dcf8ec2fd8c5845031c3", - "zh:ef34c52b68933bedd0868a13ccfd59ff1c820f299760b3c02e008dc95e2ece91", - ] -} - -provider "registry.terraform.io/terraform-provider-openstack/openstack" { - version = "1.40.0" - hashes = [ - "h1:gBrsytNqUG1ZQPKys8KAvZkjesjimXb7vcrTmyFUTM0=", - "zh:278a878a256ec5447e1e64b5d9a691e3a1f7d5c247e536500c97c5b996bc2531", - "zh:5c7ae8cfe0831557c8c1988581f3fd0bdf182d15bcefbe645bb91564027e67d4", - "zh:944d75fc1e3d54df4c47e5d34007927abf4fa79e2107b05d14f11b52970a6164", - "zh:a50922d05185598a9264a25eff6f01ce7671c70a562a3ef93e9bb7a449e358b0", - "zh:adb87ad3782f1f7a5eaeedbcffa0e5559d2372502f9af91781aa13c11cf4b47b", - "zh:c0e4218259a37f16c10b4779009f0b0b5d467e4d347fc2aa3a212f1ee3a71d63", - "zh:c2eb4f40cbd78238500a3a84ba995060bfc50f770bd13732ae50b73687f3dce6", - "zh:ca8a38fe932972d0d7fdc51f84ae775648b7aff3c96b8ead085007e880ee987f", - "zh:ce4f703719d646507d6006085dc1114954c75710226df43078169b2b01993537", - "zh:e29542a492bbf55613d20b5f68ed4357cbc8bb09d61a1752d2976e5e1608879d", - "zh:e68d47b85b9da089f8f7102c23545331c15a9e6ea99875926d2ebf6e38bf2073", - "zh:fdb10cb345250d7c47e342def106bd10ef75493ef6edf15809e10e6367a0d9f6", - ] -} diff --git a/environments/sausage-autoscale/terraform/inventory.tpl b/environments/sausage-autoscale/terraform/inventory.tpl deleted file mode 100644 index 965f1f330..000000000 --- a/environments/sausage-autoscale/terraform/inventory.tpl +++ /dev/null @@ -1,27 +0,0 @@ -[all:vars] -ansible_user=centos -openhpc_cluster_name=${cluster_name} - -[${cluster_name}_login] -${login.name} ansible_host=${login.network[0].fixed_ip_v4} server_networks='${jsonencode({for net in login.network: net.name => [ net.fixed_ip_v4 ] })}' - -[${cluster_name}_compute] -%{ for compute in computes ~} -${compute.name} ansible_host=${compute.network[0].fixed_ip_v4} server_networks='${jsonencode({for net in compute.network: net.name => [ net.fixed_ip_v4 ] })}' -%{ endfor ~} - -[cluster_login:children] -${cluster_name}_login - -[cluster_compute:children] -${cluster_name}_compute - -[login:children] -cluster_login - -[compute:children] -cluster_compute - -[cluster:children] -login -compute \ No newline at end of file diff --git a/environments/sausage-autoscale/terraform/main.tf b/environments/sausage-autoscale/terraform/main.tf deleted file mode 100644 index fb9931dec..000000000 --- a/environments/sausage-autoscale/terraform/main.tf +++ /dev/null @@ -1,82 +0,0 @@ -terraform { - required_version = ">= 0.14" - required_providers { - openstack = { - source = "terraform-provider-openstack/openstack" - } - } -} - -variable "environment_root" { - type = string -} - -variable "compute_names" { - default = ["compute-0", "compute-1"] -} - -variable "cluster_name" { - 
default = "testohpc" -} - -variable "key_pair" { - type = string -} - -variable "network" { - type = string -} - -variable "login_flavor" { - type = string -} - -variable "login_image" { - type = string -} - -variable "compute_flavor" { - type = string -} - -variable "compute_image" { - type = string -} - -resource "openstack_compute_instance_v2" "login" { - - name = "${var.cluster_name}-login-0" - image_name = var.login_image - flavor_name = var.login_flavor - key_pair = var.key_pair - network { - name = var.network - } -} - - -resource "openstack_compute_instance_v2" "compute" { - - for_each = toset(var.compute_names) - - name = "${var.cluster_name}-${each.value}" - image_name = var.compute_image - flavor_name = var.compute_flavor - #flavor_name = "compute-A" - key_pair = var.key_pair - network { - name = var.network - } -} - -# TODO: needs fixing for case where creation partially fails resulting in "compute.network is empty list of object" -resource "local_file" "hosts" { - content = templatefile("${path.module}/inventory.tpl", - { - "cluster_name": var.cluster_name - "login": openstack_compute_instance_v2.login, - "computes": openstack_compute_instance_v2.compute, - }, - ) - filename = "${var.environment_root}/inventory/hosts" -} \ No newline at end of file diff --git a/environments/sausage-autoscale/terraform/terraform.tfvars b/environments/sausage-autoscale/terraform/terraform.tfvars deleted file mode 100644 index 04bfb7ade..000000000 --- a/environments/sausage-autoscale/terraform/terraform.tfvars +++ /dev/null @@ -1,10 +0,0 @@ -compute_names = ["compute-0", "compute-1"] -cluster_name = "sbscale" -key_pair = "steveb-local" -network = "stackhpc" - -login_image = "CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64" -login_flavor = "chipolata" - -compute_image = "CentOS-8-GenericCloud-8.2.2004-20200611.2.x86_64" -compute_flavor = "chipolata" From 000a4e773a0ffbdc0bf9bc6d1405c2b0bb4d6e2d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 8 Sep 2021 13:55:49 +0000 Subject: [PATCH 025/133] move autoscale into slurm --- ansible/autoscale.yml | 33 ------------------------ ansible/slurm.yml | 59 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 50 insertions(+), 42 deletions(-) delete mode 100644 ansible/autoscale.yml diff --git a/ansible/autoscale.yml b/ansible/autoscale.yml deleted file mode 100644 index 2d5fba68a..000000000 --- a/ansible/autoscale.yml +++ /dev/null @@ -1,33 +0,0 @@ -- name: Ensure /etc/openstack/ exists and is readable by slurm # TODO: think this clashes with rebuild? 
- file: - path: /etc/openstack/ - state: directory - owner: slurm - group: slurm - mode: u=rx -- name: Copy out clouds.yaml - copy: - src: "{{ openhpc_autoscale_clouds }}" - dest: /etc/openstack/clouds.yaml - owner: slurm - group: slurm - mode: '0400' -- name: Setup slurm tools - include_role: - name: stackhpc.slurm_openstack_tools.pytools -- name: Create SuspendProgram - template: - src: suspend.j2 - dest: /opt/slurm-tools/bin/suspend - owner: slurm - group: slurm - mode: u=rwx,go= - tags: suspend -- name: Create ResumeProgram - template: - src: resume.j2 - dest: /opt/slurm-tools/bin/resume - owner: slurm - group: slurm - mode: u=rwx,go= - tags: resume diff --git a/ansible/slurm.yml b/ansible/slurm.yml index f94145f81..6fa5c8a50 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -15,14 +15,14 @@ tags: - openhpc tasks: - - name: Add CentOS 8.3 Vault repo for OpenHPC hwloc dependency - # NB: REMOVE THIS once OpenHPC works on CentOS 8.4 - yum_repository: - name: vault - file: CentOS-Linux-Vault8.3 - description: CentOS 8.3 packages from Vault - baseurl: https://vault.centos.org/8.3.2011/BaseOS/$basearch/os/ - gpgkey: file:///etc/pki/rpm-gpg/RPM-GPG-KEY-centosofficial + # - name: Add CentOS 8.3 Vault repo for OpenHPC hwloc dependency + # # NB: REMOVE THIS once OpenHPC works on CentOS 8.4 + # yum_repository: + # name: vault + # file: CentOS-Linux-Vault8.3 + # description: CentOS 8.3 packages from Vault + # baseurl: https://vault.centos.org/8.3.2011/BaseOS/$basearch/os/ + # gpgkey: file:///etc/pki/rpm-gpg/RPM-GPG-KEY-centosofficial - import_role: name: stackhpc.openhpc @@ -41,7 +41,48 @@ tags: - autoscale tasks: - - import_tasks: autoscale.yml +- name: Configure autoscale + hosts: autoscale + become: yes + tags: + - autoscale + tasks: + - name: Create /etc/openstack + file: + path: /etc/openstack + state: directory + owner: root + group: root + mode: '0400' + - name: Copy out clouds.yaml + copy: + src: "{{ autoscale_clouds }}" + dest: /etc/openstack/clouds.yaml + owner: root + group: root + mode: '0400' + - name: Setup slurm tools (to get venv) + include_role: + name: stackhpc.slurm_openstack_tools.pytools + - name: Create SuspendProgram + template: + src: suspend.j2 + dest: /opt/slurm-tools/bin/suspend + owner: slurm + group: slurm + mode: u=rwx,go= + tags: suspend + - name: Create ResumeProgram + template: + src: resume.j2 + dest: /opt/slurm-tools/bin/resume + owner: slurm + group: slurm + mode: u=rwx,go= + tags: resume + - name: Reconfigure slurm + command: + cmd: scontrol reconfigure - name: Set locked memory limits on user-facing nodes hosts: From f6514e6f8940707486297e4146ea025c8ffa2f0f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 8 Sep 2021 13:57:08 +0000 Subject: [PATCH 026/133] allow for overriding slurm config in appliance --- environments/common/inventory/group_vars/all/openhpc.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index b6d8abacf..9c6ad2925 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -23,4 +23,8 @@ openhpc_extra_packages: [] openhpc_packages: "{{ openhpc_default_packages + openhpc_extra_packages }}" openhpc_munge_key: "{{ secrets_openhpc_mungekey | b64decode }}" openhpc_slurm_configless: true -openhpc_login_only_nodes: login \ No newline at end of file +openhpc_login_only_nodes: login + +openhpc_extra_config_overrides: {} 
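# NB: combine() gives later arguments precedence, so any keys set in
# openhpc_extra_config_overrides win over the appliance-supplied autoscale settings merged below.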
+appliance_openhpc_extra_config: "{{ autoscale_openhpc_extra_config if groups['autoscale'] else {} }}" +openhpc_extra_config: "{{ appliance_openhpc_extra_config | combine(openhpc_extra_config_overrides) }}" From b285620e3d123aca983dc34bf6980470addd2732 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 8 Sep 2021 13:57:41 +0000 Subject: [PATCH 027/133] add autoscale group/group_vars --- ansible/templates/resume.j2 | 8 +++---- .../inventory/group_vars/all/autoscale.yml | 23 +++++++++++++++++++ environments/common/inventory/groups | 3 +++ 3 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 environments/common/inventory/group_vars/all/autoscale.yml diff --git a/ansible/templates/resume.j2 b/ansible/templates/resume.j2 index bf79c6c04..55a03d1ca 100644 --- a/ansible/templates/resume.j2 +++ b/ansible/templates/resume.j2 @@ -6,10 +6,10 @@ import openstack import pprint # all take a name or ID: -IMAGE = "{{ openhpc_autoscale_image }}" -NETWORK = "{{ openhpc_autoscale_network }}" -FLAVOR = "{{ openhpc_autoscale_flavor }}" -KEYPAIR = "{{ openhpc_autoscale_keypair }}" +IMAGE = "{{ autoscale_image }}" +NETWORK = "{{ autoscale_network }}" +FLAVOR = "{{ autoscale_flavor }}" +KEYPAIR = "{{ autoscale_keypair }}" # configure logging to syslog - by default only "info" and above categories appear logger = logging.getLogger("syslogger") diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml new file mode 100644 index 000000000..c5580d173 --- /dev/null +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -0,0 +1,23 @@ +autoscale_clouds: ~/.config/openstack/clouds.yaml +# TODO: change below to be defined somewhere else, poss as part of slurm config for partition?? +autoscale_image: ohpc-compute-210406-1108.qcow2 +autoscale_network: stackhpc +autoscale_flavor: chipolata +autoscale_keypair: steveb-local + +autoscale_openhpc_extra_config: + # required parameters: + SuspendExcNodes: "{{ groups['compute'] | join(',') }}" # i.e. all tf-defined nodes in the partition # TODO: fixme: hardcoded "compute" + SuspendProgram: /opt/slurm-tools/bin/suspend # TODO: fixme: hijacking slurm-tools + ResumeProgram: /opt/slurm-tools/bin/resume # TODO: fixme: hijacking slurm-tools + SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns + CommunicationParameters: NoAddrCache + # recommended: + PrivateData: cloud # shows cloud node state + # TODO: for testing only, not production: + DebugFlags: PowerSave + SuspendTime: 120 + SuspendTimeout: 300 + ResumeTimeout: 300 + # FIXME: maybe need to set TreeWidth to >= number of nodes (default: 50) + # power_save_*interval: options are defaults but should enable changes diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 4d86f8e7d..140fd509f 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -74,3 +74,6 @@ cluster [update] # All hosts to (optionally) run yum update on. 
+ +[autoscale] +# Add control to enable autoscaling From 7ae504218720abd9ebfa9ff44493633db5a61c93 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 8 Sep 2021 13:57:58 +0000 Subject: [PATCH 028/133] use autoscale branch of openhpc role --- requirements.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index 90f280107..afa1ab90e 100644 --- a/requirements.yml +++ b/requirements.yml @@ -1,8 +1,9 @@ --- roles: - src: stackhpc.nfs - - src: stackhpc.openhpc + - src: https://github.com/stackhpc/ansible-role-openhpc version: feature/autoscale + name: stackhpc.openhpc - src: cloudalchemy.node_exporter - src: cloudalchemy.blackbox-exporter - src: cloudalchemy.prometheus From ea5c3bcd4c87619232bd630fb18dc9627770cd35 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Sep 2021 11:27:39 +0000 Subject: [PATCH 029/133] Add podman_cidr to allow changing podman network range --- ansible/roles/filebeat/templates/filebeat.service.j2 | 2 +- ansible/roles/kibana/templates/kibana.service.j2 | 2 +- ansible/roles/opendistro/templates/opendistro.service.j2 | 2 +- ansible/roles/podman/tasks/validate.yml | 5 +++++ environments/common/inventory/group_vars/all/podman.yml | 1 + requirements.txt | 1 + 6 files changed, 10 insertions(+), 3 deletions(-) diff --git a/ansible/roles/filebeat/templates/filebeat.service.j2 b/ansible/roles/filebeat/templates/filebeat.service.j2 index 9553784a2..454ed2339 100644 --- a/ansible/roles/filebeat/templates/filebeat.service.j2 +++ b/ansible/roles/filebeat/templates/filebeat.service.j2 @@ -12,7 +12,7 @@ After=network-online.target [Service] Environment=PODMAN_SYSTEMD_UNIT=%n Restart=always -ExecStart=/usr/bin/podman run --sdnotify=conmon --cgroups=no-conmon --replace --name filebeat --user root --restart=always --security-opt label=disable --volume /var/log/:/logs:ro --volume /etc/filebeat/filebeat.yml:/usr/share/filebeat/filebeat.yml:ro --detach=True docker.elastic.co/beats/filebeat-oss:7.9.3 -e -strict.perms=false -d "*" +ExecStart=/usr/bin/podman run --network slirp4netns:cidr={{ podman_cidr }} --sdnotify=conmon --cgroups=no-conmon --replace --name filebeat --user root --restart=always --security-opt label=disable --volume /var/log/:/logs:ro --volume /etc/filebeat/filebeat.yml:/usr/share/filebeat/filebeat.yml:ro --detach=True docker.elastic.co/beats/filebeat-oss:7.9.3 -e -strict.perms=false -d "*" ExecStop=/usr/bin/podman stop --ignore filebeat -t 10 ExecStopPost=/usr/bin/podman rm --ignore -f filebeat KillMode=none diff --git a/ansible/roles/kibana/templates/kibana.service.j2 b/ansible/roles/kibana/templates/kibana.service.j2 index 91011344a..4658e4cb3 100644 --- a/ansible/roles/kibana/templates/kibana.service.j2 +++ b/ansible/roles/kibana/templates/kibana.service.j2 @@ -9,7 +9,7 @@ After=network-online.target [Service] Environment=PODMAN_SYSTEMD_UNIT=%n Restart=always -ExecStart=/usr/bin/podman run --sdnotify=conmon --cgroups=no-conmon -d --replace --name kibana --restart=no --env ELASTICSEARCH_URL=https://{{ elasticsearch_address }}:9200 --env ELASTICSEARCH_HOSTS=https://{{ elasticsearch_address}}:9200 --env ELASTICSEARCH_USERNAME=admin --env ELASTICSEARCH_PASSWORD="{{ vault_elasticsearch_admin_password }}" --publish 5601:5601 --detach=True amazon/opendistro-for-elasticsearch-kibana:1.12.0 +ExecStart=/usr/bin/podman run --network slirp4netns:cidr={{ podman_cidr }} --sdnotify=conmon --cgroups=no-conmon -d --replace --name kibana --restart=no --env ELASTICSEARCH_URL=https://{{ elasticsearch_address }}:9200 --env 
ELASTICSEARCH_HOSTS=https://{{ elasticsearch_address}}:9200 --env ELASTICSEARCH_USERNAME=admin --env ELASTICSEARCH_PASSWORD="{{ vault_elasticsearch_admin_password }}" --publish 5601:5601 --detach=True amazon/opendistro-for-elasticsearch-kibana:1.12.0 ExecStop=/usr/bin/podman stop --ignore kibana -t 10 ExecStopPost=/usr/bin/podman rm --ignore -f kibana KillMode=none diff --git a/ansible/roles/opendistro/templates/opendistro.service.j2 b/ansible/roles/opendistro/templates/opendistro.service.j2 index 1b2095795..ddf99aea6 100644 --- a/ansible/roles/opendistro/templates/opendistro.service.j2 +++ b/ansible/roles/opendistro/templates/opendistro.service.j2 @@ -9,7 +9,7 @@ After=network-online.target [Service] Environment=PODMAN_SYSTEMD_UNIT=%n Restart=always -ExecStart=/usr/bin/podman run --sdnotify=conmon --cgroups=no-conmon -d --replace --name opendistro --restart=no --user elasticsearch --ulimit memlock=-1:-1 --ulimit nofile=65536:65536 --volume opendistro:/usr/share/elasticsearch/data --volume /etc/elastic/internal_users.yml:/usr/share/elasticsearch/plugins/opendistro_security/securityconfig/internal_users.yml:ro --env node.name=opendistro --env discovery.type=single-node --env bootstrap.memory_lock=true --env "ES_JAVA_OPTS=-Xms512m -Xmx512m" --publish 9200:9200 amazon/opendistro-for-elasticsearch:1.12.0 +ExecStart=/usr/bin/podman run --network slirp4netns:cidr={{ podman_cidr }} --sdnotify=conmon --cgroups=no-conmon -d --replace --name opendistro --restart=no --user elasticsearch --ulimit memlock=-1:-1 --ulimit nofile=65536:65536 --volume opendistro:/usr/share/elasticsearch/data --volume /etc/elastic/internal_users.yml:/usr/share/elasticsearch/plugins/opendistro_security/securityconfig/internal_users.yml:ro --env node.name=opendistro --env discovery.type=single-node --env bootstrap.memory_lock=true --env "ES_JAVA_OPTS=-Xms512m -Xmx512m" --publish 9200:9200 amazon/opendistro-for-elasticsearch:1.12.0 ExecStop=/usr/bin/podman stop --ignore opendistro -t 10 # note for some reason this returns status=143 which makes systemd show the unit as failed, not stopped ExecStopPost=/usr/bin/podman rm --ignore -f opendistro diff --git a/ansible/roles/podman/tasks/validate.yml b/ansible/roles/podman/tasks/validate.yml index 14b13d11f..7edd84ee9 100644 --- a/ansible/roles/podman/tasks/validate.yml +++ b/ansible/roles/podman/tasks/validate.yml @@ -7,3 +7,8 @@ assert: that: podman_tmp_fstype.stdout == 'tmpfs' fail_msg: "{{ podman_tmp_fstype }} (variable podman_tmp_fstype) must be on tmpfs" + +- name: Check host IP is not within podman network CIDR + assert: + that: ( podman_cidr | ansible.netcommon.network_in_network(ansible_default_ipv4.address)) == false + fail_msg: "Default ipv4 address {{ ansible_default_ipv4.address }} for {{ inventory_hostname }} is in podman network range {{ podman_cidr }} - set `podman_cidr` to avoid host network address range" \ No newline at end of file diff --git a/environments/common/inventory/group_vars/all/podman.yml b/environments/common/inventory/group_vars/all/podman.yml index 10ece8cff..866b81090 100644 --- a/environments/common/inventory/group_vars/all/podman.yml +++ b/environments/common/inventory/group_vars/all/podman.yml @@ -1 +1,2 @@ podman_users: "{{ appliances_local_users_podman }}" +podman_cidr: 10.0.2.0/24 # see slirp4netns:cidr= at https://docs.podman.io/en/latest/markdown/podman-run.1.html diff --git a/requirements.txt b/requirements.txt index 6895fa3b8..57cca7e83 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ passlib[bcrypt] cookiecutter 
vagranttoansible selinux # this is a shim to avoid having to use --system-site-packages, you still need sudo yum install libselinux-python3 +netaddr From 237b0698c13e0ec9ebe6fb9fa0f25896d2fd9c39 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 9 Sep 2021 13:32:25 +0000 Subject: [PATCH 030/133] fix order of slurm.conf changes and {Resume,Suspend}Program creation (workaround) --- ansible/slurm.yml | 10 ++++++++++ .../common/inventory/group_vars/all/autoscale.yml | 2 -- .../common/inventory/group_vars/all/openhpc.yml | 1 - 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 6fa5c8a50..444de81eb 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -36,6 +36,7 @@ name: stackhpc.slurm_openstack_tools.rebuild - name: Setup autoscaling suspend/resume programs + # has to happen *after* slurm user has been created hosts: autoscale # this is the *controller* become: yes tags: @@ -80,6 +81,15 @@ group: slurm mode: u=rwx,go= tags: resume + - name: Add Resume/SuspendProgram parameters + community.general.ini_file: + path: /etc/slurm/slurm.conf + option: "{{ item.key }}" + section: null + value: "{{ item.value }}" + no_extra_spaces: true + create: no + loop: "{{ {'SuspendProgram':'/opt/slurm-tools/bin/suspend', 'ResumeProgram':'/opt/slurm-tools/bin/resume'} | dict2items }}" # TODO: fixme: hijacking slurm-tools - name: Reconfigure slurm command: cmd: scontrol reconfigure diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index c5580d173..a16d93659 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -8,8 +8,6 @@ autoscale_keypair: steveb-local autoscale_openhpc_extra_config: # required parameters: SuspendExcNodes: "{{ groups['compute'] | join(',') }}" # i.e. 
all tf-defined nodes in the partition # TODO: fixme: hardcoded "compute" - SuspendProgram: /opt/slurm-tools/bin/suspend # TODO: fixme: hijacking slurm-tools - ResumeProgram: /opt/slurm-tools/bin/resume # TODO: fixme: hijacking slurm-tools SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns CommunicationParameters: NoAddrCache # recommended: diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 165a86ee0..70d9289c9 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -15,7 +15,6 @@ openhpc_slurmdbd_mysql_username: slurm openhpc_slurm_control_host: "{{ groups['control'] | first }}" openhpc_slurm_partitions: - name: "compute" - # cloud_nodes: 2 openhpc_default_packages: - slurm-libpmi-ohpc # to allow intel mpi to work properly - ohpc-gnu9-openmpi4-perf-tools # for hpctests From 82e4fac29c28f99c43fb7916453f494f6abeee77 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Sep 2021 08:25:59 +0000 Subject: [PATCH 031/133] turn up slurmctld logging --- environments/common/inventory/group_vars/all/autoscale.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index a16d93659..649007e15 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -14,6 +14,7 @@ autoscale_openhpc_extra_config: PrivateData: cloud # shows cloud node state # TODO: for testing only, not production: DebugFlags: PowerSave + SlurmctldSyslogDebug: info SuspendTime: 120 SuspendTimeout: 300 ResumeTimeout: 300 From 1353f86a9a24ae8c3d6230f8d68070caec279654 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Sep 2021 08:40:34 +0000 Subject: [PATCH 032/133] add extension to templates --- ansible/templates/{resume.j2 => resume.py.j2} | 0 ansible/templates/{suspend.j2 => suspend.py.j2} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename ansible/templates/{resume.j2 => resume.py.j2} (100%) rename ansible/templates/{suspend.j2 => suspend.py.j2} (100%) diff --git a/ansible/templates/resume.j2 b/ansible/templates/resume.py.j2 similarity index 100% rename from ansible/templates/resume.j2 rename to ansible/templates/resume.py.j2 diff --git a/ansible/templates/suspend.j2 b/ansible/templates/suspend.py.j2 similarity index 100% rename from ansible/templates/suspend.j2 rename to ansible/templates/suspend.py.j2 From 60473139d22659ec57a1314d2b069e3913d94738 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Sep 2021 08:45:24 +0000 Subject: [PATCH 033/133] log exception tracebacks from resume/suspend programs --- ansible/slurm.yml | 4 ++-- ansible/templates/resume.py.j2 | 6 +++++- ansible/templates/suspend.py.j2 | 6 +++++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 444de81eb..a8a324624 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -67,7 +67,7 @@ name: stackhpc.slurm_openstack_tools.pytools - name: Create SuspendProgram template: - src: suspend.j2 + src: suspend.py.j2 dest: /opt/slurm-tools/bin/suspend owner: slurm group: slurm @@ -75,7 +75,7 @@ tags: suspend - name: Create ResumeProgram template: - src: resume.j2 + src: resume.py.j2 dest: /opt/slurm-tools/bin/resume owner: slurm group: slurm diff --git a/ansible/templates/resume.py.j2 b/ansible/templates/resume.py.j2 
index 55a03d1ca..4f116d8e2 100644 --- a/ansible/templates/resume.py.j2 +++ b/ansible/templates/resume.py.j2 @@ -49,4 +49,8 @@ def resume(): logger.info(f"server: {server}") if __name__ == "__main__": - sys.exit(resume()) + try: + sys.exit(resume()) + except: + logger.exception('Exception in main:') + raise diff --git a/ansible/templates/suspend.py.j2 b/ansible/templates/suspend.py.j2 index 02d09bc0d..c003ad3ae 100644 --- a/ansible/templates/suspend.py.j2 +++ b/ansible/templates/suspend.py.j2 @@ -32,4 +32,8 @@ def suspend(): delete_server(conn, node) if __name__ == "__main__": - sys.exit(suspend()) + try: + sys.exit(suspend()) + except: + logger.exception('Exception in main:') + raise From 919ff5031c996e148abe9ffca5f0fcb3e42b64f0 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Sep 2021 13:25:54 +0000 Subject: [PATCH 034/133] chhange appcred owner --- ansible/slurm.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index a8a324624..aa5515b52 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -52,15 +52,15 @@ file: path: /etc/openstack state: directory - owner: root - group: root - mode: '0400' + owner: slurm + group: slurm + mode: '0500' - name: Copy out clouds.yaml copy: src: "{{ autoscale_clouds }}" dest: /etc/openstack/clouds.yaml - owner: root - group: root + owner: slurm + group: slurm mode: '0400' - name: Setup slurm tools (to get venv) include_role: From 02377b110569b6dfe7c9b98d684ac2837e368605 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Sep 2021 13:26:14 +0000 Subject: [PATCH 035/133] fix try/except in resume/suspend --- ansible/templates/resume.py.j2 | 2 +- ansible/templates/suspend.py.j2 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/templates/resume.py.j2 b/ansible/templates/resume.py.j2 index 4f116d8e2..2722780d6 100644 --- a/ansible/templates/resume.py.j2 +++ b/ansible/templates/resume.py.j2 @@ -53,4 +53,4 @@ if __name__ == "__main__": sys.exit(resume()) except: logger.exception('Exception in main:') - raise + raise diff --git a/ansible/templates/suspend.py.j2 b/ansible/templates/suspend.py.j2 index c003ad3ae..7a4c70f9c 100644 --- a/ansible/templates/suspend.py.j2 +++ b/ansible/templates/suspend.py.j2 @@ -36,4 +36,4 @@ if __name__ == "__main__": sys.exit(suspend()) except: logger.exception('Exception in main:') - raise + raise From b0622d96a6ea7b0b3666d867a1b9a85b5b0c34ef Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Sep 2021 13:45:11 +0000 Subject: [PATCH 036/133] handle incorrect resume config --- ansible/templates/resume.py.j2 | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/ansible/templates/resume.py.j2 b/ansible/templates/resume.py.j2 index 2722780d6..b0f869bd7 100644 --- a/ansible/templates/resume.py.j2 +++ b/ansible/templates/resume.py.j2 @@ -6,10 +6,12 @@ import openstack import pprint # all take a name or ID: -IMAGE = "{{ autoscale_image }}" -NETWORK = "{{ autoscale_network }}" -FLAVOR = "{{ autoscale_flavor }}" -KEYPAIR = "{{ autoscale_keypair }}" +config = { + 'image': "{{ autoscale_image }}", + 'network': "{{ autoscale_network }}", + 'flavor': "{{ autoscale_flavor }}", + 'keypair': "{{ autoscale_keypair }}", +} # configure logging to syslog - by default only "info" and above categories appear logger = logging.getLogger("syslogger") @@ -23,10 +25,14 @@ def expand_nodes(hostlist_expr): def create_server(conn, name): - image = conn.compute.find_image(IMAGE) - flavor = 
conn.compute.find_flavor(FLAVOR) - network = conn.network.find_network(NETWORK) - keypair = conn.compute.find_keypair(KEYPAIR) + image = conn.compute.find_image(config['image']) + flavor = conn.compute.find_flavor(config['flavor']) + network = conn.network.find_network(config['network']) + keypair = conn.compute.find_keypair(config['keypair']) + + for ix, item in enumerate((image, flavor, network, keypair)): + if item is None: + raise ValueError(f'Specified {list(config)[ix]} {config[list(config)[ix]]} was not found') server = conn.compute.create_server( name=name, image_id=image.id, flavor_id=flavor.id, @@ -45,7 +51,7 @@ def resume(): for node in new_nodes: logger.info(f"creating node {node}") - server = create_server(conn, node) + server = create_server(conn, node) # TODO: save id to disk so can use it instead of name on deletion (to cope with multiple instances with same name) logger.info(f"server: {server}") if __name__ == "__main__": From d1ba38e378951f50b3f25caacd1c634c3d07e632 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Sep 2021 13:47:11 +0000 Subject: [PATCH 037/133] fix autoscale config for smslabs --- .../common/inventory/group_vars/all/autoscale.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index 649007e15..cdf4a5eb6 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -1,9 +1,9 @@ autoscale_clouds: ~/.config/openstack/clouds.yaml # TODO: change below to be defined somewhere else, poss as part of slurm config for partition?? -autoscale_image: ohpc-compute-210406-1108.qcow2 -autoscale_network: stackhpc -autoscale_flavor: chipolata -autoscale_keypair: steveb-local +autoscale_image: ohpc-compute-210909-1316.qcow2 +autoscale_network: stackhpc-ipv4-geneve +autoscale_flavor: general.v1.small +autoscale_keypair: centos-at-steveb-ansible autoscale_openhpc_extra_config: # required parameters: From 8e2a8270c091e54054374f4572b8995a2c43df34 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 10 Sep 2021 14:26:05 +0000 Subject: [PATCH 038/133] avoid suspend/resume exceptions on successful run --- ansible/templates/resume.py.j2 | 3 ++- ansible/templates/suspend.py.j2 | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ansible/templates/resume.py.j2 b/ansible/templates/resume.py.j2 index b0f869bd7..d1fb85bcd 100644 --- a/ansible/templates/resume.py.j2 +++ b/ansible/templates/resume.py.j2 @@ -53,10 +53,11 @@ def resume(): logger.info(f"creating node {node}") server = create_server(conn, node) # TODO: save id to disk so can use it instead of name on deletion (to cope with multiple instances with same name) logger.info(f"server: {server}") + # Don't need scontrol update nodename={node} nodeaddr={server_ip} as using SlurmctldParameters=cloud_dns if __name__ == "__main__": try: - sys.exit(resume()) + resume() except: logger.exception('Exception in main:') raise diff --git a/ansible/templates/suspend.py.j2 b/ansible/templates/suspend.py.j2 index 7a4c70f9c..dadfc2e4a 100644 --- a/ansible/templates/suspend.py.j2 +++ b/ansible/templates/suspend.py.j2 @@ -33,7 +33,7 @@ def suspend(): if __name__ == "__main__": try: - sys.exit(suspend()) + suspend() except: logger.exception('Exception in main:') raise From 37055b50abe7cae6ac4fee8afdd0fb765b86e465 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 23 Sep 2021 11:33:31 +0000 
Subject: [PATCH 039/133] basic (messy) working autoscale --- ansible/slurm.yml | 99 +++++++------------ .../inventory/group_vars/all/autoscale.yml | 8 +- .../inventory/group_vars/all/openhpc.yml | 3 +- .../inventory/group_vars/all/rebuild.yml | 3 + 4 files changed, 46 insertions(+), 67 deletions(-) create mode 100644 environments/common/inventory/group_vars/all/rebuild.yml diff --git a/ansible/slurm.yml b/ansible/slurm.yml index a724c4854..3574c97d2 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -9,91 +9,62 @@ - include_role: name: geerlingguy.mysql -- name: Setup slurm - hosts: openhpc - become: yes - tags: - - openhpc - tasks: - # - name: Add CentOS 8.3 Vault repo for OpenHPC hwloc dependency - # # NB: REMOVE THIS once OpenHPC works on CentOS 8.4 - # yum_repository: - # name: vault - # file: CentOS-Linux-Vault8.3 - # description: CentOS 8.3 packages from Vault - # baseurl: https://vault.centos.org/8.3.2011/BaseOS/$basearch/os/ - # gpgkey: file:///etc/pki/rpm-gpg/RPM-GPG-KEY-centosofficial - - import_role: - name: stackhpc.openhpc - -- name: Setup slurm-driven reimage - hosts: rebuild - become: yes - tags: +- name: Enable Slurm-controlled instance changes + hosts: - rebuild - - openhpc - tasks: - - import_role: - name: stackhpc.slurm_openstack_tools.rebuild - -- name: Setup autoscaling suspend/resume programs - # has to happen *after* slurm user has been created - hosts: autoscale # this is the *controller* - become: yes - tags: - autoscale - tasks: -- name: Configure autoscale - hosts: autoscale - become: yes + become: true tags: + - rebuild - autoscale + - openhpc tasks: - name: Create /etc/openstack file: path: /etc/openstack state: directory - owner: slurm - group: slurm - mode: '0500' + owner: root # This will be changed later + group: root + mode: u=r - name: Copy out clouds.yaml copy: - src: "{{ autoscale_clouds }}" + src: "{{ openhpc_rebuild_clouds }}" # TODO: name is wrong really! dest: /etc/openstack/clouds.yaml - owner: slurm - group: slurm - mode: '0400' - - name: Setup slurm tools (to get venv) + mode: u=rx + - name: Setup slurm tools # this adds reboot script only at present include_role: - name: stackhpc.slurm_openstack_tools.pytools - - name: Create SuspendProgram + name: stackhpc.slurm_openstack_tools.pytools # TODO: could just move this into the repo? 
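      # NB: slurmctld invokes SuspendProgram/ResumeProgram as the SlurmUser, passing a single
      # Slurm hostlist expression (e.g. "sbscale-compute-[0-1]") as the only argument; the
      # templated scripts below therefore expand it with `scontrol show hostnames` before
      # acting on each node.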
+ - name: Create SuspendProgram # TODO: FIXME: add to slurm-tools template: src: suspend.py.j2 dest: /opt/slurm-tools/bin/suspend - owner: slurm - group: slurm - mode: u=rwx,go= + mode: u=rwx,g=rx,o=rx # is OK as clouds.yaml is protected tags: suspend - - name: Create ResumeProgram + when: "'autoscale' in group_names" + - name: Create ResumeProgram # TODO: FIXME: add to slurm-tools template: src: resume.py.j2 dest: /opt/slurm-tools/bin/resume - owner: slurm - group: slurm - mode: u=rwx,go= + mode: u=rwx,g=rx,o=rx # is OK as clouds.yaml is protected tags: resume - - name: Add Resume/SuspendProgram parameters - community.general.ini_file: - path: /etc/slurm/slurm.conf - option: "{{ item.key }}" - section: null - value: "{{ item.value }}" - no_extra_spaces: true - create: no - loop: "{{ {'SuspendProgram':'/opt/slurm-tools/bin/suspend', 'ResumeProgram':'/opt/slurm-tools/bin/resume'} | dict2items }}" # TODO: fixme: hijacking slurm-tools - - name: Reconfigure slurm - command: - cmd: scontrol reconfigure + when: "'autoscale' in group_names" + +- name: Setup slurm + hosts: openhpc + become: yes + tags: + - openhpc + tasks: + # - name: Add CentOS 8.3 Vault repo for OpenHPC hwloc dependency + # # NB: REMOVE THIS once OpenHPC works on CentOS 8.4 + # yum_repository: + # name: vault + # file: CentOS-Linux-Vault8.3 + # description: CentOS 8.3 packages from Vault + # baseurl: https://vault.centos.org/8.3.2011/BaseOS/$basearch/os/ + # gpgkey: file:///etc/pki/rpm-gpg/RPM-GPG-KEY-centosofficial + - import_role: + name: stackhpc.openhpc - name: Set locked memory limits on user-facing nodes hosts: diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index cdf4a5eb6..ab8516ca7 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -10,13 +10,19 @@ autoscale_openhpc_extra_config: SuspendExcNodes: "{{ groups['compute'] | join(',') }}" # i.e. 
all tf-defined nodes in the partition # TODO: fixme: hardcoded "compute" SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns CommunicationParameters: NoAddrCache + SuspendProgram: /opt/slurm-tools/bin/suspend + ResumeProgram: /opt/slurm-tools/bin/resume # recommended: PrivateData: cloud # shows cloud node state # TODO: for testing only, not production: DebugFlags: PowerSave SlurmctldSyslogDebug: info SuspendTime: 120 - SuspendTimeout: 300 + SuspendTimeout: 30 ResumeTimeout: 300 # FIXME: maybe need to set TreeWidth to >= number of nodes (default: 50) # power_save_*interval: options are defaults but should enable changes +openhpc_slurm_dirs: + - /etc/openstack + - /opt/slurm-tools +openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml \ No newline at end of file diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 70d9289c9..8bf38293f 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -26,5 +26,4 @@ openhpc_slurm_configless: true openhpc_login_only_nodes: login openhpc_extra_config_overrides: {} -appliance_openhpc_extra_config: "{{ autoscale_openhpc_extra_config if groups['autoscale'] else {} }}" -openhpc_extra_config: "{{ appliance_openhpc_extra_config | combine(openhpc_extra_config_overrides) }}" +openhpc_extra_config: "{{ {} | combine(rebuild_openhpc_extra_config, autoscale_openhpc_extra_config, openhpc_extra_config_overrides) }}" # TODO: handle case where groups aren't defined! diff --git a/environments/common/inventory/group_vars/all/rebuild.yml b/environments/common/inventory/group_vars/all/rebuild.yml new file mode 100644 index 000000000..4026a0e21 --- /dev/null +++ b/environments/common/inventory/group_vars/all/rebuild.yml @@ -0,0 +1,3 @@ +rebuild_openhpc_extra_config: + RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild +openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml \ No newline at end of file From 6a37f50975a836df352e7d39b67f3a425d5f45c5 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 24 Sep 2021 13:20:17 +0000 Subject: [PATCH 040/133] make clouds.yaml idemponent (TODO: fix for rebuild nodes) --- ansible/slurm.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 3574c97d2..cb2cfa974 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -19,21 +19,31 @@ - autoscale - openhpc tasks: + - name: Check if slurm user exists + command: + cmd: "id slurm" + register: id_slurm + failed_when: false + changed_when: false - name: Create /etc/openstack file: path: /etc/openstack state: directory - owner: root # This will be changed later - group: root + owner: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task + group: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task mode: u=r - name: Copy out clouds.yaml copy: src: "{{ openhpc_rebuild_clouds }}" # TODO: name is wrong really! dest: /etc/openstack/clouds.yaml mode: u=rx + owner: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task + group: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task - name: Setup slurm tools # this adds reboot script only at present include_role: name: stackhpc.slurm_openstack_tools.pytools # TODO: could just move this into the repo? 
+ vars: # TODO: debug + pytools_editable: true # git repo in /opt/slurm-tools/src/slurm-openstack-tools - name: Create SuspendProgram # TODO: FIXME: add to slurm-tools template: src: suspend.py.j2 From 49a76cc67592b9982795ae67f435c6e362368413 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 24 Sep 2021 15:14:29 +0000 Subject: [PATCH 041/133] fix /etc/openstack permissions for autoscale --- ansible/slurm.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index cb2cfa974..69cf11d96 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -31,12 +31,12 @@ state: directory owner: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task group: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task - mode: u=r + mode: u=rX,go= - name: Copy out clouds.yaml copy: src: "{{ openhpc_rebuild_clouds }}" # TODO: name is wrong really! dest: /etc/openstack/clouds.yaml - mode: u=rx + mode: u=r,go= owner: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task group: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task - name: Setup slurm tools # this adds reboot script only at present From 9c9a69e116face2742d0c24dc1ba0fc23badb0b8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 24 Sep 2021 15:15:06 +0000 Subject: [PATCH 042/133] use openhpc_suspend_exc_nodes to prevent login nodes autoscaling --- environments/common/inventory/group_vars/all/autoscale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index ab8516ca7..fb5ac3cbd 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -5,9 +5,9 @@ autoscale_network: stackhpc-ipv4-geneve autoscale_flavor: general.v1.small autoscale_keypair: centos-at-steveb-ansible +openhpc_suspend_exc_nodes: "{{ (groups['compute'] + groups.get('login', [])) }}" # i.e. all tf-defined nodes in the partition # TODO: fixme: hardcoded "compute" autoscale_openhpc_extra_config: # required parameters: - SuspendExcNodes: "{{ groups['compute'] | join(',') }}" # i.e. 
all tf-defined nodes in the partition # TODO: fixme: hardcoded "compute" SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns CommunicationParameters: NoAddrCache SuspendProgram: /opt/slurm-tools/bin/suspend From 10a20363636c23d54c7d616e9cf8ea499ae64f87 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 28 Sep 2021 08:11:17 +0000 Subject: [PATCH 043/133] install slurm user before adding slurm tools --- ansible/slurm.yml | 22 +++++++++---------- .../inventory/group_vars/all/autoscale.yml | 3 --- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 69cf11d96..73fe05216 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -9,7 +9,7 @@ - include_role: name: geerlingguy.mysql -- name: Enable Slurm-controlled instance changes +- name: Enable Slurm/OpenStack integrations hosts: - rebuild - autoscale @@ -19,31 +19,31 @@ - autoscale - openhpc tasks: - - name: Check if slurm user exists - command: - cmd: "id slurm" - register: id_slurm - failed_when: false - changed_when: false + - name: Install slurm packages to create slurm user + import_role: + name: stackhpc.openhpc + tasks_from: install.yml - name: Create /etc/openstack file: path: /etc/openstack state: directory - owner: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task - group: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task + owner: slurm # TODO: check if this works for rebuild too? + group: slurm mode: u=rX,go= - name: Copy out clouds.yaml copy: src: "{{ openhpc_rebuild_clouds }}" # TODO: name is wrong really! dest: /etc/openstack/clouds.yaml mode: u=r,go= - owner: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task - group: "{{ 'root' if id_slurm.rc else 'slurm' }}" # TODO: FIXME: for rebuild task + owner: slurm # TODO: check if this works for rebuild too? + group: slurm - name: Setup slurm tools # this adds reboot script only at present include_role: name: stackhpc.slurm_openstack_tools.pytools # TODO: could just move this into the repo? vars: # TODO: debug pytools_editable: true # git repo in /opt/slurm-tools/src/slurm-openstack-tools + become_user: slurm # TODO: check if this works for rebuild too? 
+ become_flags: '-s /bin/bash' - name: Create SuspendProgram # TODO: FIXME: add to slurm-tools template: src: suspend.py.j2 diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index fb5ac3cbd..9c0eefc9d 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -22,7 +22,4 @@ autoscale_openhpc_extra_config: ResumeTimeout: 300 # FIXME: maybe need to set TreeWidth to >= number of nodes (default: 50) # power_save_*interval: options are defaults but should enable changes -openhpc_slurm_dirs: - - /etc/openstack - - /opt/slurm-tools openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml \ No newline at end of file From 7de823fbb22c1499de1ee758aab2a290e00396c9 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 28 Sep 2021 13:16:56 +0000 Subject: [PATCH 044/133] read node Features to get openstack instance information --- ansible/templates/resume.py.j2 | 101 +++++++++++++++++++++++++-------- 1 file changed, 78 insertions(+), 23 deletions(-) diff --git a/ansible/templates/resume.py.j2 b/ansible/templates/resume.py.j2 index d1fb85bcd..e36d044f6 100644 --- a/ansible/templates/resume.py.j2 +++ b/ansible/templates/resume.py.j2 @@ -1,47 +1,75 @@ #!/opt/slurm-tools/bin/python3 -""" Create OpenStack instances """ +""" A Slurm ResumeProgram to create OpenStack instances. + + Usage: + + resume HOSTLIST_EXPRESSION [debug] + + where: + HOSTLIST_EXPRESSION: Name(s) of node(s) to create, using Slurm's hostlist expression, as per [1]. + debug: Any 2nd argument puts this in debug mode which is more verbose but does not actually create nodes. + + Output and exceptions are written to the syslog. + + The flavor, image, network and keypair to be used must be defined as node Features [2] in the format "parameter=value". + + OpenStack credentials must be available to this script (e.g. via an application credential in /etc/openstack/clouds.yaml readable by the slurm user) + + [1]: https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeProgram + [2]: https://slurm.schedmd.com/slurm.conf.html#OPT_Features +""" import sys, subprocess, logging.handlers import openstack import pprint -# all take a name or ID: -config = { - 'image': "{{ autoscale_image }}", - 'network': "{{ autoscale_network }}", - 'flavor': "{{ autoscale_flavor }}", - 'keypair': "{{ autoscale_keypair }}", -} +REQUIRED_PARAMS = ('image', 'flavor', 'keypair', 'network') # configure logging to syslog - by default only "info" and above categories appear logger = logging.getLogger("syslogger") logger.setLevel(logging.DEBUG) handler = logging.handlers.SysLogHandler("/dev/log") +handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) logger.addHandler(handler) def expand_nodes(hostlist_expr): scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) return scontrol.stdout.strip().split('\n') -def create_server(conn, name): - - image = conn.compute.find_image(config['image']) - flavor = conn.compute.find_flavor(config['flavor']) - network = conn.network.find_network(config['network']) - keypair = conn.compute.find_keypair(config['keypair']) +def get_features(nodenames): + """ Retrieve the features specified for given node(s). + + Returns a dict with a key/value pair for each node. Keys are node names, values are lists of strings, one string per feature. 
+ """ + + scontrol = subprocess.run(['scontrol', 'show', 'node', nodenames], stdout=subprocess.PIPE, universal_newlines=True) + features = {} + for line in scontrol.stdout.splitlines(): + line = line.strip() + if line.startswith('NodeName'): # NodeName=dev-small-cloud-1 CoresPerSocket=1 + node = line.split()[0].split('=')[1] + if line.startswith('AvailableFeatures'): + feature_args = line.split('=', 1)[1] + features[node] = feature_args.split(',') + break + + return features - for ix, item in enumerate((image, flavor, network, keypair)): - if item is None: - raise ValueError(f'Specified {list(config)[ix]} {config[list(config)[ix]]} was not found') +def create_server(conn, name, image, flavor, network, keypair): server = conn.compute.create_server( name=name, image_id=image.id, flavor_id=flavor.id, - networks=[{"uuid": network.id}], key_name=keypair.name) + networks=[{"uuid": network.id}], key_name=keypair.name, + ) + #server = conn.compute.wait_for_server(...) - #server = conn.compute.wait_for_server(server) return server def resume(): + debug = False + if len(sys.argv) > 2: + logger.info(f"Running in debug mode - won't actually create nodes") + debug = True hostlist_expr = sys.argv[1] logger.info(f"Slurmctld invoked resume {hostlist_expr}") new_nodes = expand_nodes(hostlist_expr) @@ -49,11 +77,38 @@ def resume(): conn = openstack.connection.from_config() logger.info(f"Got openstack connection {conn}") + features = get_features(hostlist_expr) + logger.info(f"Read feature information from slurm") + logger.info(f"Features: {features}") + for node in new_nodes: - logger.info(f"creating node {node}") - server = create_server(conn, node) # TODO: save id to disk so can use it instead of name on deletion (to cope with multiple instances with same name) - logger.info(f"server: {server}") - # Don't need scontrol update nodename={node} nodeaddr={server_ip} as using SlurmctldParameters=cloud_dns + # extract the openstack parameters from node features: + if node not in features: + logger.error(f"No Feature definitions found for node {node}: {features}") + os_parameters = dict(feature.split('=') for feature in features[node]) + if debug: + logger.info(f"os_parameters for {node}: {os_parameters}") + missing = set(REQUIRED_PARAMS).difference(os_parameters.keys()) + if missing: + logger.error(f"Missing {','.join(missing)} from feature definition for node {node}: {os_parameters}") + + # get openstack objects: + os_objects = { + 'image': conn.compute.find_image(os_parameters['image']), + 'flavor': conn.compute.find_flavor(os_parameters['flavor']), + 'network': conn.network.find_network(os_parameters['network']), + 'keypair': conn.compute.find_keypair(os_parameters['keypair']), + } + not_found = dict((k, v) for (k, v) in os_objects.items() if v is None) + if not_found: + raise ValueError('Could not find openstack objects for: %s' % ', '.join(not_found)) + if debug: + logger.info(f"os_objects for {node} : {os_objects}") + if not debug: + logger.info(f"creating node {node}") + server = create_server(conn, node, **os_objects) # TODO: save id to disk so can use it instead of name on deletion (to cope with multiple instances with same name) + logger.info(f"server: {server}") + # Don't need scontrol update nodename={node} nodeaddr={server_ip} as using SlurmctldParameters=cloud_dns if __name__ == "__main__": try: From d7bfa7547931d337ead4166cb420f1e3078adf53 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 28 Sep 2021 14:43:54 +0000 Subject: [PATCH 045/133] move autoscale node info to 
openhpc_slurm_partitions --- .../common/inventory/group_vars/all/autoscale.yml | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index 9c0eefc9d..f9fafa715 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -1,11 +1,5 @@ autoscale_clouds: ~/.config/openstack/clouds.yaml -# TODO: change below to be defined somewhere else, poss as part of slurm config for partition?? -autoscale_image: ohpc-compute-210909-1316.qcow2 -autoscale_network: stackhpc-ipv4-geneve -autoscale_flavor: general.v1.small -autoscale_keypair: centos-at-steveb-ansible - -openhpc_suspend_exc_nodes: "{{ (groups['compute'] + groups.get('login', [])) }}" # i.e. all tf-defined nodes in the partition # TODO: fixme: hardcoded "compute" +autoscale_openhpc_suspend_exc_nodes: "{{ (groups['compute'] + groups.get('login', [])) }}" # i.e. all tf-defined nodes in the partition # TODO: fixme: hardcoded "compute" autoscale_openhpc_extra_config: # required parameters: SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns @@ -22,4 +16,4 @@ autoscale_openhpc_extra_config: ResumeTimeout: 300 # FIXME: maybe need to set TreeWidth to >= number of nodes (default: 50) # power_save_*interval: options are defaults but should enable changes -openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml \ No newline at end of file +openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: fix name here? From 544b1abcc2b28154f7cb562e1cbac0d7ccfdd21d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 08:21:03 +0000 Subject: [PATCH 046/133] rename openhpc vars --- .../inventory/group_vars/all/openhpc.yml | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 8bf38293f..7ce64defd 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -15,15 +15,24 @@ openhpc_slurmdbd_mysql_username: slurm openhpc_slurm_control_host: "{{ groups['control'] | first }}" openhpc_slurm_partitions: - name: "compute" -openhpc_default_packages: + +# TODO: WIP PR to change/deprecate name here: +openhpc_packages_default: - slurm-libpmi-ohpc # to allow intel mpi to work properly - ohpc-gnu9-openmpi4-perf-tools # for hpctests - openblas-gnu9-ohpc # for hpctests (HPL) -openhpc_extra_packages: [] -openhpc_packages: "{{ openhpc_default_packages + openhpc_extra_packages }}" +openhpc_packages_extra: [] +openhpc_packages: "{{ openhpc_packages_default + openhpc_packages_extra }}" + openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}" openhpc_slurm_configless: true openhpc_login_only_nodes: login -openhpc_extra_config_overrides: {} -openhpc_extra_config: "{{ {} | combine(rebuild_openhpc_extra_config, autoscale_openhpc_extra_config, openhpc_extra_config_overrides) }}" # TODO: handle case where groups aren't defined! +openhpc_config_default: "{{ rebuild_openhpc_extra_config | combine(autoscale_openhpc_extra_config) }}" # TODO: handle case where groups aren't defined! 
+openhpc_config_extra: {} +# TODO: WIP PR for openhpc_extra_config -> openhpc_config +openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra) }}" + +openhpc_suspend_exc_nodes_default: "{{ autoscale_openhpc_suspend_exc_nodes }}" # TODO: handle cases where groups aren't defined +openhpc_env_suspend_exc_nodes_extra: [] +openhpc_suspend_exc_nodes: "{{ openhpc_suspend_exc_nodes_default + openhpc_env_suspend_exc_nodes_extra }}" From 31d8e848b6e475ef3de859d580c019679a006b8b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 08:28:57 +0000 Subject: [PATCH 047/133] add vars from smslabs environment as demo --- environments/smslabs/activate | 23 ++++++++++ environments/smslabs/hooks/post.yml | 19 ++++++++ .../smslabs/inventory/group_vars/all/.gitkeep | 0 .../inventory/group_vars/all/users.yml | 3 ++ .../group_vars/openhpc/overrides.yml | 9 ++++ .../group_vars/openhpc/partitions.yml | 19 ++++++++ .../inventory/group_vars/podman/overrides.yml | 1 + environments/smslabs/inventory/groups | 43 +++++++++++++++++++ environments/smslabs/inventory/hosts | 18 ++++++++ 9 files changed, 135 insertions(+) create mode 100644 environments/smslabs/activate create mode 100644 environments/smslabs/hooks/post.yml create mode 100644 environments/smslabs/inventory/group_vars/all/.gitkeep create mode 100644 environments/smslabs/inventory/group_vars/all/users.yml create mode 100644 environments/smslabs/inventory/group_vars/openhpc/overrides.yml create mode 100755 environments/smslabs/inventory/group_vars/openhpc/partitions.yml create mode 100644 environments/smslabs/inventory/group_vars/podman/overrides.yml create mode 100644 environments/smslabs/inventory/groups create mode 100755 environments/smslabs/inventory/hosts diff --git a/environments/smslabs/activate b/environments/smslabs/activate new file mode 100644 index 000000000..e74031095 --- /dev/null +++ b/environments/smslabs/activate @@ -0,0 +1,23 @@ +export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) +echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" + +APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) +export PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" + +export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") +echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" + +export TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") +echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root" + +export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") +echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root" + +export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT") +echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" + +if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then + export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg +fi + + diff --git a/environments/smslabs/hooks/post.yml b/environments/smslabs/hooks/post.yml new file mode 100644 index 000000000..87e637f8c --- /dev/null +++ b/environments/smslabs/hooks/post.yml @@ -0,0 +1,19 @@ +- hosts: control + become: true + tasks: + - name: Prevent ansible_user's processes being killed on compute nodes at job completion + replace: + path: /etc/slurm/slurm.epilog.clean + regexp: 'if \[ \$SLURM_UID -lt 100 \] ; then' + replace: "if [[ $SLURM_UID -lt 100 || $SLURM_JOB_USER -eq {{ ansible_user }} ]] ; then" + - name: Make a /home/test directory for centos + file: + path: /home/test + state: directory + owner: 
centos + group: centos + - name: Install ewatch + git: + repo: https://github.com/sjpb/ewatch.git + dest: /home/test/ewatch + force: yes diff --git a/environments/smslabs/inventory/group_vars/all/.gitkeep b/environments/smslabs/inventory/group_vars/all/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/environments/smslabs/inventory/group_vars/all/users.yml b/environments/smslabs/inventory/group_vars/all/users.yml new file mode 100644 index 000000000..3de23fee4 --- /dev/null +++ b/environments/smslabs/inventory/group_vars/all/users.yml @@ -0,0 +1,3 @@ +users: + - name: stig + pubkey: ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDTXW9Y0r0cTW/ay6FEIlIejuRPZZ+ObzR08XFzp4x8ecCW//WSZAjo1fD/u/CQGoV552QCjWj+tP9Cy9UcsI3WLAx+n4i48oHqvpRLO1CLgJazNpQ8Bc7GveF78xhD5EoL/IpcAFKIad3CU7gb8HLRJIQpER1OsY96T9ViKe9lDWy8mk2WjoYoU1niMtmbs549Gqwl+fGNdBVUsGS5k7Xy4D/0T8TitthN3W6UbMHXVCUzdd3v9TNl7hgyeq6dCvRS6g8Vmlp2Ia0NLkrWF+bqP2RhRuqWOj71PD3auPAq0hF4yqdW9awMuZY8vBesnjE3iC2h34jvFkYaolGTfDZUa48s7yBTpjWoINUSbg105KJoPg55lWwXj58MMhvyX6hyYl3oJMiG3eq48jAAA4n80EKK4IBXrg/yjpuoDiNGqVe9hDAoT94j3+s8Smz5rohsKQVS+l266eyjo2VLUVR2NaOnw5fW86MEUyTicvHjSN4xOCGjSK2j1k6hXT7EiuM= stig@nrel-jumphost.novalocal \ No newline at end of file diff --git a/environments/smslabs/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs/inventory/group_vars/openhpc/overrides.yml new file mode 100644 index 000000000..4bed1823f --- /dev/null +++ b/environments/smslabs/inventory/group_vars/openhpc/overrides.yml @@ -0,0 +1,9 @@ +openhpc_extra_packages: + - git + - python3 +openhpc_extra_config_overrides: + SlurmctldDebug: debug + SlurmdDebug: debug + +#example_list: "{{ example_list + [7] }}" # FAILS - recursive +#example_dict: "{{ example_dict | combine({c: 4} ) }}" # FAILS - recursive diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml new file mode 100755 index 000000000..e7df7b946 --- /dev/null +++ b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml @@ -0,0 +1,19 @@ +cloud_spec: + image: ohpc-compute-210909-1316.qcow2 + flavor: general.v1.small + keypair: centos-at-steveb-ansible + network: stackhpc-ipv4-geneve + +openhpc_slurm_partitions: +- name: small + cloud_nodes: 2 + features: "{{ cloud_spec.items() | map('join', '=') }}" + default: yes + # TODO: consider adding suspend_exc: true here?? + +- name: cloud_only + cloud_nodes: 3 + ram_mb: 9996 + cpus: 2 + features: "{{ cloud_spec.items() | map('join', '=') }}" + default: no diff --git a/environments/smslabs/inventory/group_vars/podman/overrides.yml b/environments/smslabs/inventory/group_vars/podman/overrides.yml new file mode 100644 index 000000000..18e712665 --- /dev/null +++ b/environments/smslabs/inventory/group_vars/podman/overrides.yml @@ -0,0 +1 @@ +podman_cidr: 192.168.1.0/24 diff --git a/environments/smslabs/inventory/groups b/environments/smslabs/inventory/groups new file mode 100644 index 000000000..acf3ca6bc --- /dev/null +++ b/environments/smslabs/inventory/groups @@ -0,0 +1,43 @@ +[nfs:children] +openhpc + +[hpctests:children] +# Login node to use for running mpi-based testing. 
+login + +[mysql:children] +control + +[prometheus:children] +control + +[grafana:children] +control + +[alertmanager:children] +control + +[node_exporter:children] +# disabled node_exporter on control to avoid noise in syslog +login +compute + +[opendistro:children] +control + +[kibana:children] +control + +[slurm_stats:children] +control + +[filebeat:children] +slurm_stats + +# NB: [rebuild] not defined here as this template is used in CI, which does not run in openstack + +[update:children] +cluster + +[autoscale:children] +control diff --git a/environments/smslabs/inventory/hosts b/environments/smslabs/inventory/hosts new file mode 100755 index 000000000..5ab90d3b8 --- /dev/null +++ b/environments/smslabs/inventory/hosts @@ -0,0 +1,18 @@ +[all:vars] +ansible_user=centos +openhpc_cluster_name=dev + +[control] +dev-control ansible_host=10.0.3.182 server_networks='{"stackhpc-ipv4-geneve":["10.0.3.182"]}' + +[login] +dev-login-1 ansible_host=10.0.1.54 server_networks='{"stackhpc-ipv4-geneve":["10.0.1.54"]}' + +[compute] +dev-small-0 ansible_host=10.0.1.217 server_networks='{"stackhpc-ipv4-geneve":["10.0.1.217"]}' +dev-small-1 ansible_host=10.0.3.253 server_networks='{"stackhpc-ipv4-geneve":["10.0.3.253"]}' + +# Define groups for slurm parititions: +[dev_small] +dev-small-0 +dev-small-1 From 3257a8566b23a15efecde19be92d7878d2240b65 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 09:00:29 +0000 Subject: [PATCH 048/133] cope with no non-cloud nodes in suspend_exc defaults --- environments/common/inventory/group_vars/all/autoscale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index f9fafa715..06ed90034 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -1,5 +1,5 @@ autoscale_clouds: ~/.config/openstack/clouds.yaml -autoscale_openhpc_suspend_exc_nodes: "{{ (groups['compute'] + groups.get('login', [])) }}" # i.e. all tf-defined nodes in the partition # TODO: fixme: hardcoded "compute" +autoscale_openhpc_suspend_exc_nodes: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. all nodes in inventory, i.e. not in State=CLOUD initially autoscale_openhpc_extra_config: # required parameters: SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns From 75a00693539317430567d355dab3015c93e27f74 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 09:00:50 +0000 Subject: [PATCH 049/133] smslabs: more complex partition example --- .../group_vars/openhpc/partitions.yml | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml index e7df7b946..e750bef08 100755 --- a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml +++ b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml @@ -6,14 +6,21 @@ cloud_spec: openhpc_slurm_partitions: - name: small - cloud_nodes: 2 - features: "{{ cloud_spec.items() | map('join', '=') }}" default: yes - # TODO: consider adding suspend_exc: true here?? 
- -- name: cloud_only - cloud_nodes: 3 - ram_mb: 9996 - cpus: 2 + cloud_nodes: '-[2-3]' features: "{{ cloud_spec.items() | map('join', '=') }}" + +- name: burst default: no + groups: + - name: smallmem + cloud_nodes: '[0-3]' + ram_mb: 9996 + cpus: 2 + features: "{{ cloud_spec.items() | map('join', '=') }}" + - name: bigmem + cloud_nodes: '[4-6]' + ram_mb: 9992 + cpus: 2 + features: "{{ cloud_spec.items() | map('join', '=') }}" + From 4a61c5dadb07064de96f204350735ea6d91b37ae Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 09:27:14 +0000 Subject: [PATCH 050/133] use cloud_features support --- .../smslabs/inventory/group_vars/openhpc/partitions.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml index e750bef08..4d632a001 100755 --- a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml +++ b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml @@ -8,19 +8,19 @@ openhpc_slurm_partitions: - name: small default: yes cloud_nodes: '-[2-3]' - features: "{{ cloud_spec.items() | map('join', '=') }}" + cloud_features: "{{ cloud_spec }}" - name: burst default: no groups: - name: smallmem cloud_nodes: '[0-3]' + cloud_features: "{{ cloud_spec }}" ram_mb: 9996 cpus: 2 - features: "{{ cloud_spec.items() | map('join', '=') }}" - name: bigmem cloud_nodes: '[4-6]' + cloud_features: "{{ cloud_spec }}" ram_mb: 9992 cpus: 2 - features: "{{ cloud_spec.items() | map('join', '=') }}" From 74404c2bc7e729da8f20a8b03adfa76352075998 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 10:20:16 +0000 Subject: [PATCH 051/133] fix feature extraction for multiple nodes --- ansible/templates/resume.py.j2 | 2 -- 1 file changed, 2 deletions(-) diff --git a/ansible/templates/resume.py.j2 b/ansible/templates/resume.py.j2 index e36d044f6..559beca93 100644 --- a/ansible/templates/resume.py.j2 +++ b/ansible/templates/resume.py.j2 @@ -51,7 +51,6 @@ def get_features(nodenames): if line.startswith('AvailableFeatures'): feature_args = line.split('=', 1)[1] features[node] = feature_args.split(',') - break return features @@ -79,7 +78,6 @@ def resume(): features = get_features(hostlist_expr) logger.info(f"Read feature information from slurm") - logger.info(f"Features: {features}") for node in new_nodes: # extract the openstack parameters from node features: From 7d13831f82b2f4b29e307ddb63230f355c0a7930 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 10:20:37 +0000 Subject: [PATCH 052/133] smslabs: testable (default) burst partition --- .../group_vars/openhpc/partitions.yml | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml index 4d632a001..d2ed22fa3 100755 --- a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml +++ b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml @@ -1,26 +1,27 @@ -cloud_spec: +general_v1_small: image: ohpc-compute-210909-1316.qcow2 flavor: general.v1.small keypair: centos-at-steveb-ansible network: stackhpc-ipv4-geneve +general_v1_medium: + image: ohpc-compute-210909-1316.qcow2 + flavor: general.v1.medium + keypair: centos-at-steveb-ansible + network: stackhpc-ipv4-geneve + openhpc_slurm_partitions: - name: small - default: yes + default: no cloud_nodes: '-[2-3]' - cloud_features: "{{ 
cloud_spec }}" + cloud_features: "{{ general_v1_small }}" - name: burst - default: no + default: yes groups: - - name: smallmem - cloud_nodes: '[0-3]' - cloud_features: "{{ cloud_spec }}" - ram_mb: 9996 - cpus: 2 - - name: bigmem + - name: medium cloud_nodes: '[4-6]' - cloud_features: "{{ cloud_spec }}" - ram_mb: 9992 - cpus: 2 + cloud_features: "{{ general_v1_medium }}" + ram_mb: "{{ (15258 * 0.95) | int }}" + sockets_per_board: 4 From 8d627f4fedfbe7e870cecff65f577208b01961e0 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 13:43:36 +0000 Subject: [PATCH 053/133] write instance ID to StateSaveLocation on creation --- ansible/templates/resume.py.j2 | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/ansible/templates/resume.py.j2 b/ansible/templates/resume.py.j2 index 559beca93..c17890608 100644 --- a/ansible/templates/resume.py.j2 +++ b/ansible/templates/resume.py.j2 @@ -19,7 +19,7 @@ [2]: https://slurm.schedmd.com/slurm.conf.html#OPT_Features """ -import sys, subprocess, logging.handlers +import sys, os, subprocess, logging.handlers import openstack import pprint @@ -32,6 +32,13 @@ handler = logging.handlers.SysLogHandler("/dev/log") handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) logger.addHandler(handler) +def get_statesavelocation(): + """ Return the path for Slurm's StateSaveLocation """ + scontrol = subprocess.run(['scontrol', 'show', 'config'], stdout=subprocess.PIPE, universal_newlines=True) + for line in scontrol.stdout.splitlines(): + if line.startswith('StateSaveLocation'): # StateSaveLocation = /var/spool/slurm + return line.split()[-1] + def expand_nodes(hostlist_expr): scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) return scontrol.stdout.strip().split('\n') @@ -78,6 +85,8 @@ def resume(): features = get_features(hostlist_expr) logger.info(f"Read feature information from slurm") + + statedir = get_statesavelocation() for node in new_nodes: # extract the openstack parameters from node features: @@ -106,6 +115,8 @@ def resume(): logger.info(f"creating node {node}") server = create_server(conn, node, **os_objects) # TODO: save id to disk so can use it instead of name on deletion (to cope with multiple instances with same name) logger.info(f"server: {server}") + with open(os.path.join(statedir, node), 'w') as f: + f.write(server.id) # Don't need scontrol update nodename={node} nodeaddr={server_ip} as using SlurmctldParameters=cloud_dns if __name__ == "__main__": From 8b3118968e8fca2209c439a327e83ab582ccb693 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 29 Sep 2021 13:59:36 +0000 Subject: [PATCH 054/133] use instance id on deletion --- ansible/templates/suspend.py.j2 | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/ansible/templates/suspend.py.j2 b/ansible/templates/suspend.py.j2 index dadfc2e4a..e296604ab 100644 --- a/ansible/templates/suspend.py.j2 +++ b/ansible/templates/suspend.py.j2 @@ -1,7 +1,7 @@ #!/opt/slurm-tools/bin/python3 """ Delete openstack instances """ -import sys, subprocess, logging, logging.handlers +import sys, os, subprocess, logging, logging.handlers import openstack import pprint @@ -9,8 +9,16 @@ import pprint logger = logging.getLogger("syslogger") logger.setLevel(logging.DEBUG) handler = logging.handlers.SysLogHandler("/dev/log") +handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) logger.addHandler(handler) +def get_statesavelocation(): + 
""" Return the path for Slurm's StateSaveLocation """ + scontrol = subprocess.run(['scontrol', 'show', 'config'], stdout=subprocess.PIPE, universal_newlines=True) + for line in scontrol.stdout.splitlines(): + if line.startswith('StateSaveLocation'): # StateSaveLocation = /var/spool/slurm + return line.split()[-1] + def expand_nodes(hostlist_expr): scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) return scontrol.stdout.strip().split('\n') @@ -28,8 +36,17 @@ def suspend(): logger.info(f"Got openstack connection {conn}") for node in remove_nodes: - logger.info(f"deleting node {node}") - delete_server(conn, node) + instance_id = False + statedir = get_statesavelocation() + instance_file = os.path.join(statedir, node) + try: + with open(instance_file) as f: + instance_id = f.read() + except FileNotFoundError: + logger.info(f"no instance file found in {statedir} for node {node}") + + logger.info(f"deleting node {instance_id or node}") + delete_server(conn, (instance_id or node)) if __name__ == "__main__": try: From a1ba9ead1f1b7ef4d900acb82514d5da1ac85ce9 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 30 Sep 2021 08:37:42 +0000 Subject: [PATCH 055/133] fixup rebuild/autoscale variable names --- .../common/inventory/group_vars/all/autoscale.yml | 3 +-- environments/common/inventory/group_vars/all/openhpc.yml | 9 ++++----- environments/common/inventory/group_vars/all/rebuild.yml | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index 06ed90034..26edefea1 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -1,6 +1,5 @@ -autoscale_clouds: ~/.config/openstack/clouds.yaml autoscale_openhpc_suspend_exc_nodes: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. all nodes in inventory, i.e. not in State=CLOUD initially -autoscale_openhpc_extra_config: +autoscale_openhpc_config: # required parameters: SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns CommunicationParameters: NoAddrCache diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 7ce64defd..fc540fd4e 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -28,11 +28,10 @@ openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}" openhpc_slurm_configless: true openhpc_login_only_nodes: login -openhpc_config_default: "{{ rebuild_openhpc_extra_config | combine(autoscale_openhpc_extra_config) }}" # TODO: handle case where groups aren't defined! +openhpc_config_default: "{{ rebuild_openhpc_config | combine(autoscale_openhpc_config) }}" # TODO: handle case where groups aren't defined! 
openhpc_config_extra: {} -# TODO: WIP PR for openhpc_extra_config -> openhpc_config openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra) }}" -openhpc_suspend_exc_nodes_default: "{{ autoscale_openhpc_suspend_exc_nodes }}" # TODO: handle cases where groups aren't defined -openhpc_env_suspend_exc_nodes_extra: [] -openhpc_suspend_exc_nodes: "{{ openhpc_suspend_exc_nodes_default + openhpc_env_suspend_exc_nodes_extra }}" +openhpc_suspend_exc_nodes_default: "{{ autoscale_openhpc_suspend_exc_nodes }}" +openhpc_suspend_exc_nodes_extra: [] +openhpc_suspend_exc_nodes: "{{ openhpc_suspend_exc_nodes_default + openhpc_suspend_exc_nodes_extra }}" diff --git a/environments/common/inventory/group_vars/all/rebuild.yml b/environments/common/inventory/group_vars/all/rebuild.yml index 4026a0e21..b1162ffc3 100644 --- a/environments/common/inventory/group_vars/all/rebuild.yml +++ b/environments/common/inventory/group_vars/all/rebuild.yml @@ -1,3 +1,3 @@ -rebuild_openhpc_extra_config: +rebuild_openhpc_config: RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml \ No newline at end of file From ebf3dd9265b721c3739e096eb9b8212a5fbee5f8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 30 Sep 2021 12:49:50 +0000 Subject: [PATCH 056/133] create autoscale role with auto-modification of openhpc_slurm_partitions --- ansible/.gitignore | 2 + ansible/roles/autoscale/.travis.yml | 29 +++++++ ansible/roles/autoscale/README.md | 74 ++++++++++++++++++ ansible/roles/autoscale/defaults/main.yml | 2 + .../openhpc_partitions.cpython-36.pyc | Bin 0 -> 1462 bytes .../filter_plugins/openhpc_partitions.py | 48 ++++++++++++ ansible/roles/autoscale/meta.empty/main.yml | 52 ++++++++++++ ansible/roles/autoscale/tasks/main.yml | 19 +++++ ansible/roles/autoscale/tasks/validate.yml | 5 ++ .../autoscale}/templates/resume.py.j2 | 0 .../autoscale}/templates/suspend.py.j2 | 0 ansible/slurm.yml | 28 +++---- ansible/validate.yml | 9 +++ .../group_vars/openhpc/partitions.yml | 15 ++-- 14 files changed, 256 insertions(+), 27 deletions(-) create mode 100644 ansible/roles/autoscale/.travis.yml create mode 100644 ansible/roles/autoscale/README.md create mode 100644 ansible/roles/autoscale/defaults/main.yml create mode 100644 ansible/roles/autoscale/filter_plugins/__pycache__/openhpc_partitions.cpython-36.pyc create mode 100644 ansible/roles/autoscale/filter_plugins/openhpc_partitions.py create mode 100644 ansible/roles/autoscale/meta.empty/main.yml create mode 100644 ansible/roles/autoscale/tasks/main.yml create mode 100644 ansible/roles/autoscale/tasks/validate.yml rename ansible/{ => roles/autoscale}/templates/resume.py.j2 (100%) rename ansible/{ => roles/autoscale}/templates/suspend.py.j2 (100%) diff --git a/ansible/.gitignore b/ansible/.gitignore index bf07028ab..fd78abade 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -22,3 +22,5 @@ roles/* !roles/block_devices/** !roles/basic_users/ !roles/basic_users/** +!roles/autoscale/ +!roles/autoscale/** \ No newline at end of file diff --git a/ansible/roles/autoscale/.travis.yml b/ansible/roles/autoscale/.travis.yml new file mode 100644 index 000000000..36bbf6208 --- /dev/null +++ b/ansible/roles/autoscale/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # 
Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/ansible/roles/autoscale/README.md b/ansible/roles/autoscale/README.md new file mode 100644 index 000000000..63c6c13ab --- /dev/null +++ b/ansible/roles/autoscale/README.md @@ -0,0 +1,74 @@ +# autoscale + +Support autoscaling nodes on OpenStack clouds, i.e. creating nodes when necessary to service the queue and deleting them when they are no longer needed. + +This is implemented using Slurm's ["elastic computing"](https://slurm.schedmd.com/elastic_computing.html) features which are based on Slurm's [power saving](https://slurm.schedmd.com/power_save.html) features. + + +NOTES TODO: +- Won't get monitoring for autoscaling nodes +- Describe autoscale vs `State=CLOUD` and powersaving enablement. +- Describe groups. +- Describe cpu/memory info requirements (inc. for mixed partitions) +- Describe what happens on failure. + + +## Requirements + +- Role `stackhpc.slurm_openstack_tools.pytools`. Installs [slurm-openstack-tools](github.com/stackhpc/slurm-openstack-tools) which provides a venv with the `openstacksdk`. +- Role `stackhpc.openhpc` to create a Slurm cluster. +- This role should be run on the Slurm controller only, i.e. add the `control` group to the `autoscale` group to activate this functionality. + +## Role Variables + +- `openhpc_slurm_partitions`: This role modifies what the partitions/groups defined [openhpc_slurm_partitions](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) in the by `stackhpc.openhpc` role accept: + - `cloud_nodes`: Optional. As per the `stackhpc.openhpc` docs this defines nodes in a ["CLOUD" state](https://slurm.schedmd.com/slurm.conf.html#OPT_CLOUD), i.e. treated as powered down/not existing when the Slurm control daemon starts. The value is a suffix for the group/partition's node names in Slurm's hostlist expression format (e.g. `-[11-25]`) and therefore defines the number of CLOUD-state nodes. + - `cloud_instances`: Required if `cloud_nodes` is defined. A dict defining the `flavor`, `image`, `keypair` and `network` to use for CLOUD-state instances in this partition/group. Values for these parameters may be either names (if unique in the cloud) or IDs. + +Some examples are given below. + +### Processor/memory information +Non-CLOUD-state nodes in a group/partition are defined by the hosts in an inventory group named `_` as per `stackhpc.openhpc` [docs](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) and processor/memory information is automatically retrieved from them. + +- If a group/partition contains both CLOUD and non-CLOUD nodes the processor/memory information for the CLOUD nodes is assumed to match that retrieved for the non-CLOUD nodes. +- If a group/partition only contains CLOUD-state nodes (i.e. no matching inventory group or it is empty) then processor/memory information must be specified using the `ram_mb`, `sockets`, `cores_per_socket` and `threads_per_core` options. 
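+
+For example, a minimal sketch of a CLOUD-only group (the image, keypair and network names below are placeholders, not values provided by this role):
+
+```yaml
+openhpc_slurm_partitions:
+  - name: burst
+    cloud_nodes: '-[1-4]'          # suffix => defines 4 CLOUD-state nodes
+    cloud_instances:
+      flavor: general.v1.medium    # name or ID
+      image: ohpc-compute-example  # name or ID
+      keypair: my-keypair
+      network: my-network
+    ram_mb: 14495                  # required here: no non-CLOUD nodes to inspect
+    sockets: 1
+    cores_per_socket: 4
+    threads_per_core: 1
+```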
+ + + + + + ```yaml + cloud_instances: + flavor: general.v1.medium + image: ohpc-compute-210909-1316.qcow2 + keypair: centos-at-steveb-ansible + network: "{{ autoscale_network }}" + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. + + + + +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/ansible/roles/autoscale/defaults/main.yml b/ansible/roles/autoscale/defaults/main.yml new file mode 100644 index 000000000..e85b2db21 --- /dev/null +++ b/ansible/roles/autoscale/defaults/main.yml @@ -0,0 +1,2 @@ +--- +# defaults file for autoscale diff --git a/ansible/roles/autoscale/filter_plugins/__pycache__/openhpc_partitions.cpython-36.pyc b/ansible/roles/autoscale/filter_plugins/__pycache__/openhpc_partitions.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..100545137c74508e603d7c79d3de87c9f5cc88b6 GIT binary patch literal 1462 zcmaJ>PjBNy6rUM8w(F#@D%G-E5E!IYh?XV@ZcC`P+7`4DWJTLmC5x5U^-QvM9owB5 zmo{=FMEU{P3m3F^xFAlP_#|`X#8>EvH%`+=fEa7uyz#uhzxQT7SzUEkUOo+fvl03e zEe#X+hcHDS2t^d9XpDV~HBVCFlX^9LgEoDW+SIu~e&Y*7SEze|sLQOG;oFq#qfYBD zuz)&PS&Z|Xi-EF+X?U7*)riO8h$;J+ofl!um7Ov9ZO)G)tTkXXU^*}#!4!W0Is3gr zJLuW3Q+)Pwg%b#?!XrE*@bAYdsR;Zb)R>W}LCuOBTYQ5yV72*|5+{UOmGKjz_A6tC zzsJ-`3~9>73@27aXyY!L;TfKqmC3(bl3uAy&~aaZ9OyJ_Yd?$-Xf*pZy^9i?w&?1G z-hq|GnL&)&7DguvQL=^%cUYp$V4xa2hdEn7J60QeBStFj8g-~V|j@hKyFEh%9 zamMJD-;C0{q(PQbCj8gun>Z6P%pxWx??2BAmc1yVK%^xf2SvzbEaN;A&%N5F*PeXZ z_C&#=co;L`wU^Aby`xfkvGB&R5OFqI9A2AmcQ5gWllyHimTdg*(`8jHAs%=Nca<|_ zp)5HQ*9A17X1a*`x`1y30pQTH^>4@&PoaYaa1 zQutWmQ{Do-np__CUgTrei&!Re(W?{R42xouveRCeiTEgGJ)Wmb^uiJ(q7e4OIF*bC zMOuy^z1{+~8@#&3`K8-WVYzf!G3Oq1#ze z#)wI-)z#YeJfhF*0CsinD>KRCOc{bn_2%C0*H0hs?d}AR2m1%xgGal;_QAp4zH-6k zkZ}fOhAVR>Sw`WeXp}n*<=(`tOwFrCU2WyoOQ74xU%e`85K76#4SxKVj_a-eN$QSH zeggENqHnS!Omf2Ly3iz_xkwXPW=ll5g!-7A3s zgx~};C5YspPuL zxdo)-DEq1x=Y3!$ah8N1E5aGSt)(=8%F2%t7RlPse|c0+e;(u38i)%QNL>wL66cM- E0Z#me!T') + if 'cloud_nodes' in group: + if 'cloud_instances' not in group: + raise errors.AnsibleFilterError(f"`openhpc_slurm_partitions` group '{group_name}' specifies 'cloud_nodes' but is missing 'cloud_instances'.") + missing_attrs = ','.join(set(REQUIRED_INSTANCE_ATTRS).difference(group['cloud_instances'])) + if missing_attrs: + raise errors.AnsibleFilterError(f"`openhpc_slurm_partitions` group '{group_name}' item 'cloud_instances' is missing items: {missing_attrs}.") + if 'features' not in group: + group['features'] = [] + group['features'].extend(['%s=%s' % (k, v) for (k, v) in group['cloud_instances'].items()]) + + return partitions + +class FilterModule(object): + + def filters(self): + return { + 'modify_autoscale_partitions': modify_autoscale_partitions, + } diff --git 
a/ansible/roles/autoscale/meta.empty/main.yml b/ansible/roles/autoscale/meta.empty/main.yml new file mode 100644 index 000000000..c572acc9f --- /dev/null +++ b/ansible/roles/autoscale/meta.empty/main.yml @@ -0,0 +1,52 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.1 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. + # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. 
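For illustration, the `modify_autoscale_partitions` filter added above copies the `cloud_instances` parameters into node Features so the resume program can read them back from Slurm. A sketch with placeholder values (these names are not defaults):

```yaml
# input group:
- name: small
  cloud_nodes: '-[2-3]'
  cloud_instances:
    image: ohpc-compute-example
    flavor: general.v1.small
    keypair: my-keypair
    network: my-network

# after the filter, the same group also carries the parameters as Features:
- name: small
  cloud_nodes: '-[2-3]'
  cloud_instances: {image: ohpc-compute-example, flavor: general.v1.small, keypair: my-keypair, network: my-network}
  features:
    - image=ohpc-compute-example
    - flavor=general.v1.small
    - keypair=my-keypair
    - network=my-network
```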
diff --git a/ansible/roles/autoscale/tasks/main.yml b/ansible/roles/autoscale/tasks/main.yml new file mode 100644 index 000000000..86ff4c438 --- /dev/null +++ b/ansible/roles/autoscale/tasks/main.yml @@ -0,0 +1,19 @@ +--- + +- name: Create SuspendProgram + template: + src: suspend.py.j2 + dest: /opt/slurm-tools/bin/suspend + mode: u=rx,go= + tags: suspend + when: "'autoscale' in group_names" +- name: Create ResumeProgram # TODO: FIXME: add to slurm-tools + template: + src: resume.py.j2 + dest: /opt/slurm-tools/bin/resume + mode: u=rx,go= + #was: mode: u=rwx,g=rx,o=rx # is OK as clouds.yaml is protected + tags: resume +- name: Modify openhpc_slurm_partitions + set_fact: + openhpc_slurm_partitions: "{{ openhpc_slurm_partitions | modify_autoscale_partitions }}" diff --git a/ansible/roles/autoscale/tasks/validate.yml b/ansible/roles/autoscale/tasks/validate.yml new file mode 100644 index 000000000..5a56fa019 --- /dev/null +++ b/ansible/roles/autoscale/tasks/validate.yml @@ -0,0 +1,5 @@ +--- + +- name: Check openhpc_slurm_partitions information + debug: + msg: "{{ openhpc_slurm_partitions | modify_autoscale_partitions | to_nice_yaml }}" diff --git a/ansible/templates/resume.py.j2 b/ansible/roles/autoscale/templates/resume.py.j2 similarity index 100% rename from ansible/templates/resume.py.j2 rename to ansible/roles/autoscale/templates/resume.py.j2 diff --git a/ansible/templates/suspend.py.j2 b/ansible/roles/autoscale/templates/suspend.py.j2 similarity index 100% rename from ansible/templates/suspend.py.j2 rename to ansible/roles/autoscale/templates/suspend.py.j2 diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 73fe05216..174a3cf55 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -27,7 +27,7 @@ file: path: /etc/openstack state: directory - owner: slurm # TODO: check if this works for rebuild too? + owner: slurm # TODO: check this works for rebuild too group: slurm mode: u=rX,go= - name: Copy out clouds.yaml @@ -35,29 +35,19 @@ src: "{{ openhpc_rebuild_clouds }}" # TODO: name is wrong really! dest: /etc/openstack/clouds.yaml mode: u=r,go= - owner: slurm # TODO: check if this works for rebuild too? + owner: slurm # TODO: check this works for rebuild too group: slurm - - name: Setup slurm tools # this adds reboot script only at present + - name: Setup slurm tools include_role: name: stackhpc.slurm_openstack_tools.pytools # TODO: could just move this into the repo? vars: # TODO: debug - pytools_editable: true # git repo in /opt/slurm-tools/src/slurm-openstack-tools - become_user: slurm # TODO: check if this works for rebuild too? 
- become_flags: '-s /bin/bash' - - name: Create SuspendProgram # TODO: FIXME: add to slurm-tools - template: - src: suspend.py.j2 - dest: /opt/slurm-tools/bin/suspend - mode: u=rwx,g=rx,o=rx # is OK as clouds.yaml is protected - tags: suspend - when: "'autoscale' in group_names" - - name: Create ResumeProgram # TODO: FIXME: add to slurm-tools - template: - src: resume.py.j2 - dest: /opt/slurm-tools/bin/resume - mode: u=rwx,g=rx,o=rx # is OK as clouds.yaml is protected - tags: resume + become_user: slurm # TODO: check this works for rebuild too + become_flags: '-s /bin/bash' # as has shell specified as /sbin/nologin + - name: Configure autoscale programs and parameters + include_role: + name: autoscale when: "'autoscale' in group_names" + # TODO: rebuild - name: Setup slurm hosts: openhpc diff --git a/ansible/validate.yml b/ansible/validate.yml index 0c0ba8f38..805f66164 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -2,6 +2,15 @@ # Fail early if configuration is invalid +- name: Validate autoscale configuration + hosts: autoscale + tags: autoscale + tasks: + - import_role: + name: autoscale + tasks_from: validate.yml + tags: validate + - name: Validate podman configuration hosts: podman tags: podman diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml index d2ed22fa3..1180bf9e5 100755 --- a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml +++ b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml @@ -14,14 +14,13 @@ openhpc_slurm_partitions: - name: small default: no cloud_nodes: '-[2-3]' - cloud_features: "{{ general_v1_small }}" + cloud_instances: "{{ general_v1_small }}" - name: burst default: yes - groups: - - name: medium - cloud_nodes: '[4-6]' - cloud_features: "{{ general_v1_medium }}" - ram_mb: "{{ (15258 * 0.95) | int }}" - sockets_per_board: 4 - + cloud_nodes: '-[1-4]' + cloud_instances: "{{ general_v1_medium }}" + ram_mb: "{{ (15258 * 0.95) | int }}" + sockets: 1 + cores_per_socket: 4 + threads_per_core: 1 From 0bde5fcb16377b5250587309f584922b62a41f74 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 30 Sep 2021 14:57:51 +0000 Subject: [PATCH 057/133] set autoscale defaults with merged options --- .../inventory/group_vars/all/autoscale.yml | 52 +++++++++++++------ .../inventory/group_vars/all/openhpc.yml | 9 ++-- .../inventory/group_vars/all/rebuild.yml | 5 +- 3 files changed, 44 insertions(+), 22 deletions(-) diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index 26edefea1..328938a97 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -1,18 +1,38 @@ -autoscale_openhpc_suspend_exc_nodes: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. all nodes in inventory, i.e. not in State=CLOUD initially -autoscale_openhpc_config: - # required parameters: - SlurmctldParameters: enable_configless,idle_on_node_suspend,cloud_dns - CommunicationParameters: NoAddrCache +# recommended: +autoscale_suspend_exc_nodes: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. 
all non-CLOUD nodes, and prevent login-only slurmd nodes getting powered down +autoscale_private_data: # PrivateData + - cloud # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud + +# for debugging, may want to amend in production: +autoscale_debug_flags: + - PowerSave # https://slurm.schedmd.com/slurm.conf.html#OPT_Power +autoscale_slurmctld_syslog_debug: info # https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldSyslogDebug + +# likely to need tuning: +autoscale_suspend_time: 120 # https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime +autoscale_suspend_timeout: 30 # https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout +autoscale_resume_timeout: 300 # https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout +autoscale_power_save_interval: 10 # https://slurm.schedmd.com/slurm.conf.html#OPT_power_save_interval +autoscale_power_save_min_interval: 0 # https://slurm.schedmd.com/slurm.conf.html#OPT_power_save_min_intervals + +openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: fix name here? + +_autoscale_openhpc_config: SuspendProgram: /opt/slurm-tools/bin/suspend ResumeProgram: /opt/slurm-tools/bin/resume - # recommended: - PrivateData: cloud # shows cloud node state - # TODO: for testing only, not production: - DebugFlags: PowerSave - SlurmctldSyslogDebug: info - SuspendTime: 120 - SuspendTimeout: 30 - ResumeTimeout: 300 - # FIXME: maybe need to set TreeWidth to >= number of nodes (default: 50) - # power_save_*interval: options are defaults but should enable changes -openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: fix name here? + SlurmctldParameters: + - idle_on_node_suspend # https://slurm.schedmd.com/slurm.conf.html#OPT_idle_on_node_suspend + - cloud_dns # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud_dns + # - "power_save_interval={{ autoscale_power_save_interval}}" # seems to break if you set this + # - "power_save_min_interval={{ autoscale_power_save_min_interval }}" + CommunicationParameters: + - NoAddrCache # https://slurm.schedmd.com/slurm.conf.html#OPT_NoAddrCache + PrivateData: "{{ autoscale_private_data }}" + DebugFlags: "{{ autoscale_debug_flags }}" + SlurmctldSyslogDebug: "{{ autoscale_slurmctld_syslog_debug }}" + SuspendTime: "{{ autoscale_suspend_time }}" + SuspendTimeout: "{{ autoscale_suspend_timeout }}" + ResumeTimeout: "{{ autoscale_resume_timeout }}" +# See also TreeWidth but shouldn't needs setting with cloud_dns + +autoscale_openhpc_config: "{{ _autoscale_openhpc_config if groups.get('autoscale', []) else {} }}" diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index fc540fd4e..18b819808 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -28,10 +28,11 @@ openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}" openhpc_slurm_configless: true openhpc_login_only_nodes: login -openhpc_config_default: "{{ rebuild_openhpc_config | combine(autoscale_openhpc_config) }}" # TODO: handle case where groups aren't defined! 
+openhpc_config_default: + SlurmctldParameters: + - enable_configless # required as we might override SlurmctldParameters elsewhere openhpc_config_extra: {} -openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra) }}" +openhpc_config: "{{ openhpc_config_default | combine(rebuild_openhpc_config, autoscale_openhpc_config, openhpc_config_extra, list_merge='append') }}" -openhpc_suspend_exc_nodes_default: "{{ autoscale_openhpc_suspend_exc_nodes }}" openhpc_suspend_exc_nodes_extra: [] -openhpc_suspend_exc_nodes: "{{ openhpc_suspend_exc_nodes_default + openhpc_suspend_exc_nodes_extra }}" +openhpc_suspend_exc_nodes: "{{ autoscale_suspend_exc_nodes + openhpc_suspend_exc_nodes_extra }}" diff --git a/environments/common/inventory/group_vars/all/rebuild.yml b/environments/common/inventory/group_vars/all/rebuild.yml index b1162ffc3..ba631a07c 100644 --- a/environments/common/inventory/group_vars/all/rebuild.yml +++ b/environments/common/inventory/group_vars/all/rebuild.yml @@ -1,3 +1,4 @@ -rebuild_openhpc_config: +_rebuild_openhpc_config: RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild -openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml \ No newline at end of file +rebuild_openhpc_config: "{{ _rebuild_openhpc_config if groups.get('rebuild', []) else {} }}" +openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: rename? From 37a1070b693ce69585f82ca65f0245ec420353e2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 30 Sep 2021 16:15:42 +0000 Subject: [PATCH 058/133] enable rebuild from controller --- ansible/.gitignore | 4 +- ansible/roles/rebuild/README.md | 38 +++++ ansible/roles/rebuild/defaults/main.yml | 2 + ansible/roles/rebuild/tasks/main.yml | 9 ++ ansible/roles/rebuild/templates/rebuild.py.j2 | 147 ++++++++++++++++++ ansible/slurm.yml | 13 +- .../inventory/group_vars/all/rebuild.yml | 4 +- environments/smslabs/inventory/groups | 3 +- 8 files changed, 212 insertions(+), 8 deletions(-) create mode 100644 ansible/roles/rebuild/README.md create mode 100644 ansible/roles/rebuild/defaults/main.yml create mode 100644 ansible/roles/rebuild/tasks/main.yml create mode 100644 ansible/roles/rebuild/templates/rebuild.py.j2 diff --git a/ansible/.gitignore b/ansible/.gitignore index fd78abade..0ccc6a74f 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -23,4 +23,6 @@ roles/* !roles/basic_users/ !roles/basic_users/** !roles/autoscale/ -!roles/autoscale/** \ No newline at end of file +!roles/autoscale/** +!roles/rebuild/ +!roles/rebuild/** diff --git a/ansible/roles/rebuild/README.md b/ansible/roles/rebuild/README.md new file mode 100644 index 000000000..225dd44b9 --- /dev/null +++ b/ansible/roles/rebuild/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. + +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. 
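+
+As a sketch of the intended workflow (the image and node names below are placeholders, and exact `scontrol` options vary with Slurm version): the RebootProgram installed by this role rebuilds the node's OpenStack instance when the reboot reason starts with `rebuild`, using the image given as an `image:<name-or-uuid>` token or the instance's current image if none is given, and issues a soft reboot otherwise. For example, from the control node:
+
+    scontrol reboot ASAP reason="rebuild image:ohpc-compute-example" dev-small-0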
+ +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/ansible/roles/rebuild/defaults/main.yml b/ansible/roles/rebuild/defaults/main.yml new file mode 100644 index 000000000..0a0383df4 --- /dev/null +++ b/ansible/roles/rebuild/defaults/main.yml @@ -0,0 +1,2 @@ +--- +# defaults file for rebuild diff --git a/ansible/roles/rebuild/tasks/main.yml b/ansible/roles/rebuild/tasks/main.yml new file mode 100644 index 000000000..68acdd606 --- /dev/null +++ b/ansible/roles/rebuild/tasks/main.yml @@ -0,0 +1,9 @@ +--- +- name: Create RebootProgram # TODO: FIXME: add to slurm-tools + template: + src: rebuild.py.j2 + dest: /opt/slurm-tools/bin/rebuild + mode: u=rx,go= + owner: slurm + group: slurm + tags: resume diff --git a/ansible/roles/rebuild/templates/rebuild.py.j2 b/ansible/roles/rebuild/templates/rebuild.py.j2 new file mode 100644 index 000000000..e080d763e --- /dev/null +++ b/ansible/roles/rebuild/templates/rebuild.py.j2 @@ -0,0 +1,147 @@ +#!/opt/slurm-tools/bin/python3 +# -*- coding: utf-8 -*- + +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import logging.handlers +import os +from os import path +import sys +import socket +import subprocess + +import openstack +import pbr.version + +__version__ = pbr.version.VersionInfo("slurm-openstack-tools").version_string() + +MAX_REASON_LENGTH = 1000 + +# configure logging to syslog - by default only "info" +# and above categories appear +logger = logging.getLogger("syslogger") +logger.setLevel(logging.DEBUG) +handler = logging.handlers.SysLogHandler("/dev/log") +handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) +logger.addHandler(handler) + +def get_statesavelocation(): + """ Return the path for Slurm's StateSaveLocation """ + scontrol = subprocess.run(['scontrol', 'show', 'config'], stdout=subprocess.PIPE, universal_newlines=True) + for line in scontrol.stdout.splitlines(): + if line.startswith('StateSaveLocation'): # StateSaveLocation = /var/spool/slurm + return line.split()[-1] + +def get_openstack_server_id(node): + + statedir = get_statesavelocation() + instance_file = os.path.join(statedir, node) + try: + with open(instance_file) as f: + instance_id = f.readline().strip() + return instance_id + except FileNotFoundError: + logger.error(f"no instance file found in {statedir} for node {node}") + return None + +def get_sinfo_path(): + # TODO(johngarbutt): get this from environment or config file? 
+ sinfo_alt_path = "/usr/local/software/slurm/current/bin/sinfo" + if path.exists(sinfo_alt_path): + return sinfo_alt_path + return "sinfo" + + +def get_reboot_reason(node): + sinfo_path = get_sinfo_path() + # see why we're being rebooted: + sinfo = subprocess.run( + [ + sinfo_path, + "--noheader", + "--nodes=%s" % node, + "-O", + "Reason:%i" % MAX_REASON_LENGTH, + ], + stdout=subprocess.PIPE, + universal_newlines=True, + ) + return sinfo.stdout.strip() + + +def get_image_from_reason(reason): + tokens = reason.split() + image = None + if len(tokens) > 1: + image_tokens = tokens[1].split(":") + if len(image_tokens) == 2 and image_tokens[0] == "image": + if image_tokens[1]: + image = image_tokens[1] + logger.info(f"user requested image: {image}") + return image + + +def rebuild_openstack_server(server_id, reason): + # Validate server_id + conn = openstack.connection.from_config() + try: + server = conn.get_server(server_id) + except openstack.exceptions.ResourceNotFound: + logger.error(f"server id {server_id} is not valid") + return None + + image_name_or_uuid = get_image_from_reason(reason) + if not image_name_or_uuid: + image_name_or_uuid = server.image.id + logger.info(f"couldn't parse image from reason '{reason}', falling back to existing image: {image_name_or_uuid}") + + image = conn.image.find_image(image_name_or_uuid) # doesn't throw exception + if image is None: + logger.error(f"image {image_name_or_uuid} either not found or not unique") + return None + + # Note that OpenStack will power down the server as part of the rebuild + logger.info(f"rebuilding server {server_id} with image {image.id}") + conn.rebuild_server(server_id, image.id) + +def reboot_openstack_server(server_id): + conn = openstack.connection.from_config() + server = conn.get_server(server_id) + logger.info(f"rebooting server %{server_id} with image %{image_uuid}") + conn.reboot_server(server_id, 'SOFT') + +def expand_nodes(hostlist_expr): + scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) + return scontrol.stdout.strip().split('\n') + +def rebuild_or_reboot(): + """ Rebuild or reboot an OpenStack node from the controller. """ + + hostlist_expr = sys.argv[1] + logger.info(f"Slurmctld invoked RebootProgram {hostlist_expr}") + for node in expand_nodes(hostlist_expr): + server_uuid = get_openstack_server_id(node) + if not server_uuid: + continue # can just try next one (but really should now exit > 0 even if others succeed) + reason = get_reboot_reason(node) + if not reason.startswith("rebuild"): + reboot_openstack_server(server_uuid) # TODO: support selecting soft or hard reboot via reason? + else: + rebuild_openstack_server(server_uuid, reason) + +if __name__ == "__main__": + try: + rebuild_or_reboot() + except: + logger.exception('Exception in main:') + raise \ No newline at end of file diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 174a3cf55..8d825a64b 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -35,19 +35,22 @@ src: "{{ openhpc_rebuild_clouds }}" # TODO: name is wrong really! dest: /etc/openstack/clouds.yaml mode: u=r,go= - owner: slurm # TODO: check this works for rebuild too + owner: slurm group: slurm - name: Setup slurm tools include_role: - name: stackhpc.slurm_openstack_tools.pytools # TODO: could just move this into the repo? 
- vars: # TODO: debug - become_user: slurm # TODO: check this works for rebuild too + name: stackhpc.slurm_openstack_tools.pytools + vars: + become_user: slurm become_flags: '-s /bin/bash' # as has shell specified as /sbin/nologin - name: Configure autoscale programs and parameters include_role: name: autoscale when: "'autoscale' in group_names" - # TODO: rebuild + - name: Configure rebuild programs and parameters + include_role: + name: rebuild + when: "'rebuild' in group_names" - name: Setup slurm hosts: openhpc diff --git a/environments/common/inventory/group_vars/all/rebuild.yml b/environments/common/inventory/group_vars/all/rebuild.yml index ba631a07c..57e30ad5f 100644 --- a/environments/common/inventory/group_vars/all/rebuild.yml +++ b/environments/common/inventory/group_vars/all/rebuild.yml @@ -1,4 +1,6 @@ _rebuild_openhpc_config: - RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild + RebootProgram: /opt/slurm-tools/bin/rebuild + SlurmctldParameters: + - reboot_from_controller rebuild_openhpc_config: "{{ _rebuild_openhpc_config if groups.get('rebuild', []) else {} }}" openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: rename? diff --git a/environments/smslabs/inventory/groups b/environments/smslabs/inventory/groups index acf3ca6bc..1f4e97615 100644 --- a/environments/smslabs/inventory/groups +++ b/environments/smslabs/inventory/groups @@ -34,7 +34,8 @@ control [filebeat:children] slurm_stats -# NB: [rebuild] not defined here as this template is used in CI, which does not run in openstack +[rebuild:children] +control [update:children] cluster From 138de0a52d6e2d51c23313ad800fa18d076ce4db Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 1 Oct 2021 08:47:52 +0000 Subject: [PATCH 059/133] make suspend less picky about instance ID file format --- ansible/roles/autoscale/templates/suspend.py.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/autoscale/templates/suspend.py.j2 b/ansible/roles/autoscale/templates/suspend.py.j2 index e296604ab..d52bd4d03 100644 --- a/ansible/roles/autoscale/templates/suspend.py.j2 +++ b/ansible/roles/autoscale/templates/suspend.py.j2 @@ -41,7 +41,7 @@ def suspend(): instance_file = os.path.join(statedir, node) try: with open(instance_file) as f: - instance_id = f.read() + instance_id = f.readline().strip() except FileNotFoundError: logger.info(f"no instance file found in {statedir} for node {node}") From dee0807ce15255b30c3f9e6cf801750f5b2ffd5e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 1 Oct 2021 14:16:12 +0000 Subject: [PATCH 060/133] use existing compute-based rebuild --- ansible/slurm.yml | 6 +----- environments/common/inventory/group_vars/all/rebuild.yml | 4 +--- environments/smslabs/inventory/groups | 1 + 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 8d825a64b..2a73c3a84 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -37,7 +37,7 @@ mode: u=r,go= owner: slurm group: slurm - - name: Setup slurm tools + - name: Setup slurm tools # this installs RebootProgram for rebuild too include_role: name: stackhpc.slurm_openstack_tools.pytools vars: @@ -47,10 +47,6 @@ include_role: name: autoscale when: "'autoscale' in group_names" - - name: Configure rebuild programs and parameters - include_role: - name: rebuild - when: "'rebuild' in group_names" - name: Setup slurm hosts: openhpc diff --git a/environments/common/inventory/group_vars/all/rebuild.yml b/environments/common/inventory/group_vars/all/rebuild.yml index 
57e30ad5f..ba631a07c 100644 --- a/environments/common/inventory/group_vars/all/rebuild.yml +++ b/environments/common/inventory/group_vars/all/rebuild.yml @@ -1,6 +1,4 @@ _rebuild_openhpc_config: - RebootProgram: /opt/slurm-tools/bin/rebuild - SlurmctldParameters: - - reboot_from_controller + RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild rebuild_openhpc_config: "{{ _rebuild_openhpc_config if groups.get('rebuild', []) else {} }}" openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: rename? diff --git a/environments/smslabs/inventory/groups b/environments/smslabs/inventory/groups index 1f4e97615..6fde43dfa 100644 --- a/environments/smslabs/inventory/groups +++ b/environments/smslabs/inventory/groups @@ -36,6 +36,7 @@ slurm_stats [rebuild:children] control +compute [update:children] cluster From 993d413a35cb46bcc5e8082267de35633c36eb5d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 1 Oct 2021 19:53:22 +0000 Subject: [PATCH 061/133] move suspend/resume program into slurm_openstack_tools --- .../roles/autoscale/templates/resume.py.j2 | 127 ------------------ .../roles/autoscale/templates/suspend.py.j2 | 56 -------- .../inventory/group_vars/all/autoscale.yml | 4 +- .../inventory/group_vars/rebuild/override.yml | 1 + 4 files changed, 3 insertions(+), 185 deletions(-) delete mode 100644 ansible/roles/autoscale/templates/resume.py.j2 delete mode 100644 ansible/roles/autoscale/templates/suspend.py.j2 create mode 100644 environments/smslabs/inventory/group_vars/rebuild/override.yml diff --git a/ansible/roles/autoscale/templates/resume.py.j2 b/ansible/roles/autoscale/templates/resume.py.j2 deleted file mode 100644 index c17890608..000000000 --- a/ansible/roles/autoscale/templates/resume.py.j2 +++ /dev/null @@ -1,127 +0,0 @@ -#!/opt/slurm-tools/bin/python3 -""" A Slurm ResumeProgram to create OpenStack instances. - - Usage: - - resume HOSTLIST_EXPRESSION [debug] - - where: - HOSTLIST_EXPRESSION: Name(s) of node(s) to create, using Slurm's hostlist expression, as per [1]. - debug: Any 2nd argument puts this in debug mode which is more verbose but does not actually create nodes. - - Output and exceptions are written to the syslog. - - The flavor, image, network and keypair to be used must be defined as node Features [2] in the format "parameter=value". - - OpenStack credentials must be available to this script (e.g. 
via an application credential in /etc/openstack/clouds.yaml readable by the slurm user) - - [1]: https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeProgram - [2]: https://slurm.schedmd.com/slurm.conf.html#OPT_Features -""" - -import sys, os, subprocess, logging.handlers -import openstack -import pprint - -REQUIRED_PARAMS = ('image', 'flavor', 'keypair', 'network') - -# configure logging to syslog - by default only "info" and above categories appear -logger = logging.getLogger("syslogger") -logger.setLevel(logging.DEBUG) -handler = logging.handlers.SysLogHandler("/dev/log") -handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) -logger.addHandler(handler) - -def get_statesavelocation(): - """ Return the path for Slurm's StateSaveLocation """ - scontrol = subprocess.run(['scontrol', 'show', 'config'], stdout=subprocess.PIPE, universal_newlines=True) - for line in scontrol.stdout.splitlines(): - if line.startswith('StateSaveLocation'): # StateSaveLocation = /var/spool/slurm - return line.split()[-1] - -def expand_nodes(hostlist_expr): - scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) - return scontrol.stdout.strip().split('\n') - -def get_features(nodenames): - """ Retrieve the features specified for given node(s). - - Returns a dict with a key/value pair for each node. Keys are node names, values are lists of strings, one string per feature. - """ - - scontrol = subprocess.run(['scontrol', 'show', 'node', nodenames], stdout=subprocess.PIPE, universal_newlines=True) - features = {} - for line in scontrol.stdout.splitlines(): - line = line.strip() - if line.startswith('NodeName'): # NodeName=dev-small-cloud-1 CoresPerSocket=1 - node = line.split()[0].split('=')[1] - if line.startswith('AvailableFeatures'): - feature_args = line.split('=', 1)[1] - features[node] = feature_args.split(',') - - return features - -def create_server(conn, name, image, flavor, network, keypair): - - server = conn.compute.create_server( - name=name, image_id=image.id, flavor_id=flavor.id, - networks=[{"uuid": network.id}], key_name=keypair.name, - ) - #server = conn.compute.wait_for_server(...) 
- - return server - -def resume(): - debug = False - if len(sys.argv) > 2: - logger.info(f"Running in debug mode - won't actually create nodes") - debug = True - hostlist_expr = sys.argv[1] - logger.info(f"Slurmctld invoked resume {hostlist_expr}") - new_nodes = expand_nodes(hostlist_expr) - - conn = openstack.connection.from_config() - logger.info(f"Got openstack connection {conn}") - - features = get_features(hostlist_expr) - logger.info(f"Read feature information from slurm") - - statedir = get_statesavelocation() - - for node in new_nodes: - # extract the openstack parameters from node features: - if node not in features: - logger.error(f"No Feature definitions found for node {node}: {features}") - os_parameters = dict(feature.split('=') for feature in features[node]) - if debug: - logger.info(f"os_parameters for {node}: {os_parameters}") - missing = set(REQUIRED_PARAMS).difference(os_parameters.keys()) - if missing: - logger.error(f"Missing {','.join(missing)} from feature definition for node {node}: {os_parameters}") - - # get openstack objects: - os_objects = { - 'image': conn.compute.find_image(os_parameters['image']), - 'flavor': conn.compute.find_flavor(os_parameters['flavor']), - 'network': conn.network.find_network(os_parameters['network']), - 'keypair': conn.compute.find_keypair(os_parameters['keypair']), - } - not_found = dict((k, v) for (k, v) in os_objects.items() if v is None) - if not_found: - raise ValueError('Could not find openstack objects for: %s' % ', '.join(not_found)) - if debug: - logger.info(f"os_objects for {node} : {os_objects}") - if not debug: - logger.info(f"creating node {node}") - server = create_server(conn, node, **os_objects) # TODO: save id to disk so can use it instead of name on deletion (to cope with multiple instances with same name) - logger.info(f"server: {server}") - with open(os.path.join(statedir, node), 'w') as f: - f.write(server.id) - # Don't need scontrol update nodename={node} nodeaddr={server_ip} as using SlurmctldParameters=cloud_dns - -if __name__ == "__main__": - try: - resume() - except: - logger.exception('Exception in main:') - raise diff --git a/ansible/roles/autoscale/templates/suspend.py.j2 b/ansible/roles/autoscale/templates/suspend.py.j2 deleted file mode 100644 index d52bd4d03..000000000 --- a/ansible/roles/autoscale/templates/suspend.py.j2 +++ /dev/null @@ -1,56 +0,0 @@ -#!/opt/slurm-tools/bin/python3 -""" Delete openstack instances """ - -import sys, os, subprocess, logging, logging.handlers -import openstack -import pprint - -# configure logging to syslog - by default only "info" and above categories appear -logger = logging.getLogger("syslogger") -logger.setLevel(logging.DEBUG) -handler = logging.handlers.SysLogHandler("/dev/log") -handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) -logger.addHandler(handler) - -def get_statesavelocation(): - """ Return the path for Slurm's StateSaveLocation """ - scontrol = subprocess.run(['scontrol', 'show', 'config'], stdout=subprocess.PIPE, universal_newlines=True) - for line in scontrol.stdout.splitlines(): - if line.startswith('StateSaveLocation'): # StateSaveLocation = /var/spool/slurm - return line.split()[-1] - -def expand_nodes(hostlist_expr): - scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) - return scontrol.stdout.strip().split('\n') - -def delete_server(conn, name): - server = conn.compute.find_server(name) - conn.compute.delete_server(server) - -def suspend(): - hostlist_expr = 
sys.argv[1] - logger.info(f"Slurmctld invoked suspend {hostlist_expr}") - remove_nodes = expand_nodes(hostlist_expr) - - conn = openstack.connection.from_config() - logger.info(f"Got openstack connection {conn}") - - for node in remove_nodes: - instance_id = False - statedir = get_statesavelocation() - instance_file = os.path.join(statedir, node) - try: - with open(instance_file) as f: - instance_id = f.readline().strip() - except FileNotFoundError: - logger.info(f"no instance file found in {statedir} for node {node}") - - logger.info(f"deleting node {instance_id or node}") - delete_server(conn, (instance_id or node)) - -if __name__ == "__main__": - try: - suspend() - except: - logger.exception('Exception in main:') - raise diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index 328938a97..cc27f9b26 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -18,8 +18,8 @@ autoscale_power_save_min_interval: 0 # https://slurm.schedmd.com/slurm.conf.html openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: fix name here? _autoscale_openhpc_config: - SuspendProgram: /opt/slurm-tools/bin/suspend - ResumeProgram: /opt/slurm-tools/bin/resume + SuspendProgram: /opt/slurm-tools/bin/slurm-openstack-suspend + ResumeProgram: /opt/slurm-tools/bin/slurm-openstack-resume SlurmctldParameters: - idle_on_node_suspend # https://slurm.schedmd.com/slurm.conf.html#OPT_idle_on_node_suspend - cloud_dns # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud_dns diff --git a/environments/smslabs/inventory/group_vars/rebuild/override.yml b/environments/smslabs/inventory/group_vars/rebuild/override.yml new file mode 100644 index 000000000..178ab7848 --- /dev/null +++ b/environments/smslabs/inventory/group_vars/rebuild/override.yml @@ -0,0 +1 @@ +pytools_gitref: feature/autoscale From 53e27fdb955b7c144381e4be79ef0023065020a8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 1 Oct 2021 20:11:39 +0000 Subject: [PATCH 062/133] use autoscale defaults in role via set_fact --- ansible/roles/autoscale/defaults/main.yml | 38 +++++++++++++++++- ansible/roles/autoscale/tasks/main.yml | 17 ++------ .../inventory/group_vars/all/autoscale.yml | 40 ++----------------- .../inventory/group_vars/all/openhpc.yml | 6 +-- 4 files changed, 44 insertions(+), 57 deletions(-) diff --git a/ansible/roles/autoscale/defaults/main.yml b/ansible/roles/autoscale/defaults/main.yml index e85b2db21..73cd8792f 100644 --- a/ansible/roles/autoscale/defaults/main.yml +++ b/ansible/roles/autoscale/defaults/main.yml @@ -1,2 +1,36 @@ ---- -# defaults file for autoscale +# recommended: +autoscale_private_data: # PrivateData + - cloud # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud + +# useful for debugging, may want to amend in production: +autoscale_debug_flags: + - PowerSave # https://slurm.schedmd.com/slurm.conf.html#OPT_Power +autoscale_slurmctld_syslog_debug: info # https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldSyslogDebug + +# likely to need tuning: +autoscale_suspend_time: 120 # https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime +autoscale_suspend_timeout: 30 # https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout +autoscale_resume_timeout: 300 # https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout +# autoscale_power_save_interval: 10 # https://slurm.schedmd.com/slurm.conf.html#OPT_power_save_interval +# autoscale_power_save_min_interval: 0 
# https://slurm.schedmd.com/slurm.conf.html#OPT_power_save_min_intervals + +# likely to need defining: +autoscale_suspend_exc_nodes: [] + +autoscale_openhpc_config: + SuspendProgram: /opt/slurm-tools/bin/slurm-openstack-suspend + ResumeProgram: /opt/slurm-tools/bin/slurm-openstack-resume + SlurmctldParameters: + - idle_on_node_suspend # https://slurm.schedmd.com/slurm.conf.html#OPT_idle_on_node_suspend + - cloud_dns # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud_dns + # - "power_save_interval={{ autoscale_power_save_interval}}" # seems to break if you set this + # - "power_save_min_interval={{ autoscale_power_save_min_interval }}" + CommunicationParameters: + - NoAddrCache # https://slurm.schedmd.com/slurm.conf.html#OPT_NoAddrCache + PrivateData: "{{ autoscale_private_data }}" + DebugFlags: "{{ autoscale_debug_flags }}" + SlurmctldSyslogDebug: "{{ autoscale_slurmctld_syslog_debug }}" + SuspendTime: "{{ autoscale_suspend_time }}" + SuspendTimeout: "{{ autoscale_suspend_timeout }}" + ResumeTimeout: "{{ autoscale_resume_timeout }}" +# See also TreeWidth but shouldn't needs setting with cloud_dns diff --git a/ansible/roles/autoscale/tasks/main.yml b/ansible/roles/autoscale/tasks/main.yml index 86ff4c438..1ad1bbefc 100644 --- a/ansible/roles/autoscale/tasks/main.yml +++ b/ansible/roles/autoscale/tasks/main.yml @@ -1,19 +1,8 @@ --- -- name: Create SuspendProgram - template: - src: suspend.py.j2 - dest: /opt/slurm-tools/bin/suspend - mode: u=rx,go= - tags: suspend - when: "'autoscale' in group_names" -- name: Create ResumeProgram # TODO: FIXME: add to slurm-tools - template: - src: resume.py.j2 - dest: /opt/slurm-tools/bin/resume - mode: u=rx,go= - #was: mode: u=rwx,g=rx,o=rx # is OK as clouds.yaml is protected - tags: resume - name: Modify openhpc_slurm_partitions set_fact: openhpc_slurm_partitions: "{{ openhpc_slurm_partitions | modify_autoscale_partitions }}" +- name: Merge autoscale configuration + set_fact: + openhpc_config: "{{ autoscale_openhpc_config | combine(openhpc_config, list_merge='append') }}" diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index cc27f9b26..14c3ef38a 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -1,38 +1,4 @@ -# recommended: -autoscale_suspend_exc_nodes: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. all non-CLOUD nodes, and prevent login-only slurmd nodes getting powered down -autoscale_private_data: # PrivateData - - cloud # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud - -# for debugging, may want to amend in production: -autoscale_debug_flags: - - PowerSave # https://slurm.schedmd.com/slurm.conf.html#OPT_Power -autoscale_slurmctld_syslog_debug: info # https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldSyslogDebug - -# likely to need tuning: -autoscale_suspend_time: 120 # https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime -autoscale_suspend_timeout: 30 # https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout -autoscale_resume_timeout: 300 # https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout -autoscale_power_save_interval: 10 # https://slurm.schedmd.com/slurm.conf.html#OPT_power_save_interval -autoscale_power_save_min_interval: 0 # https://slurm.schedmd.com/slurm.conf.html#OPT_power_save_min_intervals +openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: rename? 
-openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: fix name here? - -_autoscale_openhpc_config: - SuspendProgram: /opt/slurm-tools/bin/slurm-openstack-suspend - ResumeProgram: /opt/slurm-tools/bin/slurm-openstack-resume - SlurmctldParameters: - - idle_on_node_suspend # https://slurm.schedmd.com/slurm.conf.html#OPT_idle_on_node_suspend - - cloud_dns # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud_dns - # - "power_save_interval={{ autoscale_power_save_interval}}" # seems to break if you set this - # - "power_save_min_interval={{ autoscale_power_save_min_interval }}" - CommunicationParameters: - - NoAddrCache # https://slurm.schedmd.com/slurm.conf.html#OPT_NoAddrCache - PrivateData: "{{ autoscale_private_data }}" - DebugFlags: "{{ autoscale_debug_flags }}" - SlurmctldSyslogDebug: "{{ autoscale_slurmctld_syslog_debug }}" - SuspendTime: "{{ autoscale_suspend_time }}" - SuspendTimeout: "{{ autoscale_suspend_timeout }}" - ResumeTimeout: "{{ autoscale_resume_timeout }}" -# See also TreeWidth but shouldn't needs setting with cloud_dns - -autoscale_openhpc_config: "{{ _autoscale_openhpc_config if groups.get('autoscale', []) else {} }}" +# TODO: should this get moved?? +autoscale_suspend_exc_nodes: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. all non-CLOUD nodes, and prevent login-only slurmd nodes getting powered down diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 18b819808..2139d140b 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -25,14 +25,12 @@ openhpc_packages_extra: [] openhpc_packages: "{{ openhpc_packages_default + openhpc_packages_extra }}" openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}" -openhpc_slurm_configless: true openhpc_login_only_nodes: login - openhpc_config_default: SlurmctldParameters: - - enable_configless # required as we might override SlurmctldParameters elsewhere + - enable_configless openhpc_config_extra: {} -openhpc_config: "{{ openhpc_config_default | combine(rebuild_openhpc_config, autoscale_openhpc_config, openhpc_config_extra, list_merge='append') }}" +openhpc_config: "{{ openhpc_config_default | combine(rebuild_openhpc_config, openhpc_config_extra, list_merge='append') }}" openhpc_suspend_exc_nodes_extra: [] openhpc_suspend_exc_nodes: "{{ autoscale_suspend_exc_nodes + openhpc_suspend_exc_nodes_extra }}" From 051649985ab6f0bf8b684749c559aacb828a7fbf Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 1 Oct 2021 20:36:00 +0000 Subject: [PATCH 063/133] improve autoscale vars/defaults/docs --- ansible/roles/autoscale/README.md | 15 +++++++++++++-- ansible/roles/autoscale/defaults/main.yml | 10 ++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/ansible/roles/autoscale/README.md b/ansible/roles/autoscale/README.md index 63c6c13ab..69e3ddeb3 100644 --- a/ansible/roles/autoscale/README.md +++ b/ansible/roles/autoscale/README.md @@ -11,7 +11,7 @@ NOTES TODO: - Describe groups. - Describe cpu/memory info requirements (inc. for mixed partitions) - Describe what happens on failure. - +- Note that DNS is REQUIRED for this. ## Requirements @@ -24,8 +24,19 @@ NOTES TODO: - `openhpc_slurm_partitions`: This role modifies what the partitions/groups defined [openhpc_slurm_partitions](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) in the by `stackhpc.openhpc` role accept: - `cloud_nodes`: Optional. 
As per the `stackhpc.openhpc` docs this defines nodes in a ["CLOUD" state](https://slurm.schedmd.com/slurm.conf.html#OPT_CLOUD), i.e. treated as powered down/not existing when the Slurm control daemon starts. The value is a suffix for the group/partition's node names in Slurm's hostlist expression format (e.g. `-[11-25]`) and therefore defines the number of CLOUD-state nodes. - `cloud_instances`: Required if `cloud_nodes` is defined. A dict defining the `flavor`, `image`, `keypair` and `network` to use for CLOUD-state instances in this partition/group. Values for these parameters may be either names (if unique in the cloud) or IDs. + + Some examples are given below. + +- `autoscale_show_suspended_nodes`: Optional, default `true`. Whether to show suspended/powered-down nodes in `sinfo` etc. See `slurm.conf` parameter [PrivateData - cloud](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_cloud). + +The following variables have defaults useful for debugging autoscaling, but may be altered for production: +- `autoscale_debug_powersaving`: Optional, default `true`. Log additional information for powersaving, see `slurm.conf` parameter [DebugFlags - PowerSave](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_PowerSave_2). +- `autoscale_slurmctld_syslog_debug`: Optional, default `info`. Syslog logging level. See `slurm.conf` parameter [SlurmctldSyslogDebug](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SlurmctldSyslogDebug). -Some examples are given below. +The following variables are likely to need tuning for the specific site/instances: +- `autoscale_suspend_time`: Optional, default 120s TODO https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime +- `autoscale_suspend_timeout`: Optional, default 30s TODO https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout +- `autoscale_resume_timeout`: Optional, default 300s TODO https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout ### Processor/memory information Non-CLOUD-state nodes in a group/partition are defined by the hosts in an inventory group named `_` as per `stackhpc.openhpc` [docs](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) and processor/memory information is automatically retrieved from them. 
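A minimal sketch of a partition entry using `cloud_nodes` and `cloud_instances` as described above; the flavor, image, keypair and network names are placeholders borrowed from the example environment later in this series, not a tested configuration:

```yaml
openhpc_slurm_partitions:
  - name: small
    cloud_nodes: '-[11-25]'   # suffix in hostlist-expression form, defining 15 additional CLOUD-state nodes
    cloud_instances:
      flavor: general.v1.small
      image: ohpc-compute-210909-1316.qcow2
      keypair: centos-at-steveb-ansible
      network: stackhpc-ipv4-geneve
```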
diff --git a/ansible/roles/autoscale/defaults/main.yml b/ansible/roles/autoscale/defaults/main.yml index 73cd8792f..d8a2b3bac 100644 --- a/ansible/roles/autoscale/defaults/main.yml +++ b/ansible/roles/autoscale/defaults/main.yml @@ -1,10 +1,8 @@ # recommended: -autoscale_private_data: # PrivateData - - cloud # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud +autoscale_show_suspended_nodes: true # useful for debugging, may want to amend in production: -autoscale_debug_flags: - - PowerSave # https://slurm.schedmd.com/slurm.conf.html#OPT_Power +autoscale_debug_powersaving: true autoscale_slurmctld_syslog_debug: info # https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldSyslogDebug # likely to need tuning: @@ -27,8 +25,8 @@ autoscale_openhpc_config: # - "power_save_min_interval={{ autoscale_power_save_min_interval }}" CommunicationParameters: - NoAddrCache # https://slurm.schedmd.com/slurm.conf.html#OPT_NoAddrCache - PrivateData: "{{ autoscale_private_data }}" - DebugFlags: "{{ autoscale_debug_flags }}" + PrivateData: "{{ ['cloud'] if autoscale_show_suspended_nodes else [] }}" + DebugFlags: "{{ ['PowerSave'] if autoscale_debug_powersaving else [] }}" # NB: Seems to have disappeared in latest Slurm SlurmctldSyslogDebug: "{{ autoscale_slurmctld_syslog_debug }}" SuspendTime: "{{ autoscale_suspend_time }}" SuspendTimeout: "{{ autoscale_suspend_timeout }}" From 04198d5966347bff2a60d7db4a2ba2e3aac4d071 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 5 Oct 2021 13:58:15 +0000 Subject: [PATCH 064/133] use set_fact merging on rebuild and fix venv deployment --- ansible/roles/rebuild/README.md | 33 ++-- ansible/roles/rebuild/defaults/main.yml | 4 +- ansible/roles/rebuild/tasks/main.yml | 12 +- ansible/roles/rebuild/templates/rebuild.py.j2 | 147 ------------------ ansible/slurm.yml | 8 +- .../inventory/group_vars/all/openhpc.yml | 5 +- .../inventory/group_vars/all/rebuild.yml | 3 - 7 files changed, 34 insertions(+), 178 deletions(-) delete mode 100644 ansible/roles/rebuild/templates/rebuild.py.j2 diff --git a/ansible/roles/rebuild/README.md b/ansible/roles/rebuild/README.md index 225dd44b9..f846bbeb3 100644 --- a/ansible/roles/rebuild/README.md +++ b/ansible/roles/rebuild/README.md @@ -1,38 +1,47 @@ -Role Name -========= +rebuild +======= -A brief description of the role goes here. +Enable the compute nodes to be reimaged from Slurm. To use this functionality add the `control` and `compute` groups to the `rebuild` group. + +Once `ansible/slurm.yml` has run, node(s) can be reimaged using: + + scontrol reboot [ASAP] [nextstate=] reason="rebuild image:" [] + +where: +- `` is the name (if unique) or ID of an image in OpenStack. +- `` is a Slurm hostlist expression defining the nodes to reimage. +- `ASAP` means the rebuild will happen as soon as existing jobs on the node(s) complete - no new jobs will be scheduled on it. +- If `nextstate=...` is not given nodes remain in DRAIN state after the rebuild. Requirements ------------ -Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. +- This role must be run before the `stackhpc.openhpc` role's `runtime.yml` playbook as it modifies the `openhpc_config` variable. +- OpenStack credentials on the compute nodes, e.g. at `/etc/openstack/clouds.yaml` which are readable by the root user. 
It is recommended these credentials are an [application credential](https://docs.openstack.org/keystone/latest/user/application_credentials.html). This can be created in Horizon via Identity > Application Credentials > +Create Application Credential. The usual role required is `member`. Using access rules has been found not to work at present. Note that the downloaded credential can be encrpyted using `ansible-vault` to allow commit to source control. It will automatically be decrypted when copied onto the compute nodes. +- An image which when booted adds that node to the Slurm cluster. E.g. see `packer/README.md`. Role Variables -------------- -A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. +None normally required. Dependencies ------------ -A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. +See above. Example Playbook ---------------- -Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: +See `ansible/slurm.yml` - - hosts: servers - roles: - - { role: username.rolename, x: 42 } License ------- -BSD +Apache v2 Author Information ------------------ -An optional section for the role authors to include contact information, or a website (HTML is not allowed). +StackHPC Ltd. diff --git a/ansible/roles/rebuild/defaults/main.yml b/ansible/roles/rebuild/defaults/main.yml index 0a0383df4..5e532ef24 100644 --- a/ansible/roles/rebuild/defaults/main.yml +++ b/ansible/roles/rebuild/defaults/main.yml @@ -1,2 +1,4 @@ --- -# defaults file for rebuild + +rebuild_openhpc_config: + RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild diff --git a/ansible/roles/rebuild/tasks/main.yml b/ansible/roles/rebuild/tasks/main.yml index 68acdd606..a191e807a 100644 --- a/ansible/roles/rebuild/tasks/main.yml +++ b/ansible/roles/rebuild/tasks/main.yml @@ -1,9 +1,5 @@ --- -- name: Create RebootProgram # TODO: FIXME: add to slurm-tools - template: - src: rebuild.py.j2 - dest: /opt/slurm-tools/bin/rebuild - mode: u=rx,go= - owner: slurm - group: slurm - tags: resume + +- name: Merge rebuild configuration + set_fact: + openhpc_config: "{{ rebuild_openhpc_config | combine(openhpc_config, list_merge='append') }}" diff --git a/ansible/roles/rebuild/templates/rebuild.py.j2 b/ansible/roles/rebuild/templates/rebuild.py.j2 deleted file mode 100644 index e080d763e..000000000 --- a/ansible/roles/rebuild/templates/rebuild.py.j2 +++ /dev/null @@ -1,147 +0,0 @@ -#!/opt/slurm-tools/bin/python3 -# -*- coding: utf-8 -*- - -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import logging.handlers -import os -from os import path -import sys -import socket -import subprocess - -import openstack -import pbr.version - -__version__ = pbr.version.VersionInfo("slurm-openstack-tools").version_string() - -MAX_REASON_LENGTH = 1000 - -# configure logging to syslog - by default only "info" -# and above categories appear -logger = logging.getLogger("syslogger") -logger.setLevel(logging.DEBUG) -handler = logging.handlers.SysLogHandler("/dev/log") -handler.setFormatter(logging.Formatter(sys.argv[0] + ': %(message)s')) -logger.addHandler(handler) - -def get_statesavelocation(): - """ Return the path for Slurm's StateSaveLocation """ - scontrol = subprocess.run(['scontrol', 'show', 'config'], stdout=subprocess.PIPE, universal_newlines=True) - for line in scontrol.stdout.splitlines(): - if line.startswith('StateSaveLocation'): # StateSaveLocation = /var/spool/slurm - return line.split()[-1] - -def get_openstack_server_id(node): - - statedir = get_statesavelocation() - instance_file = os.path.join(statedir, node) - try: - with open(instance_file) as f: - instance_id = f.readline().strip() - return instance_id - except FileNotFoundError: - logger.error(f"no instance file found in {statedir} for node {node}") - return None - -def get_sinfo_path(): - # TODO(johngarbutt): get this from environment or config file? - sinfo_alt_path = "/usr/local/software/slurm/current/bin/sinfo" - if path.exists(sinfo_alt_path): - return sinfo_alt_path - return "sinfo" - - -def get_reboot_reason(node): - sinfo_path = get_sinfo_path() - # see why we're being rebooted: - sinfo = subprocess.run( - [ - sinfo_path, - "--noheader", - "--nodes=%s" % node, - "-O", - "Reason:%i" % MAX_REASON_LENGTH, - ], - stdout=subprocess.PIPE, - universal_newlines=True, - ) - return sinfo.stdout.strip() - - -def get_image_from_reason(reason): - tokens = reason.split() - image = None - if len(tokens) > 1: - image_tokens = tokens[1].split(":") - if len(image_tokens) == 2 and image_tokens[0] == "image": - if image_tokens[1]: - image = image_tokens[1] - logger.info(f"user requested image: {image}") - return image - - -def rebuild_openstack_server(server_id, reason): - # Validate server_id - conn = openstack.connection.from_config() - try: - server = conn.get_server(server_id) - except openstack.exceptions.ResourceNotFound: - logger.error(f"server id {server_id} is not valid") - return None - - image_name_or_uuid = get_image_from_reason(reason) - if not image_name_or_uuid: - image_name_or_uuid = server.image.id - logger.info(f"couldn't parse image from reason '{reason}', falling back to existing image: {image_name_or_uuid}") - - image = conn.image.find_image(image_name_or_uuid) # doesn't throw exception - if image is None: - logger.error(f"image {image_name_or_uuid} either not found or not unique") - return None - - # Note that OpenStack will power down the server as part of the rebuild - logger.info(f"rebuilding server {server_id} with image {image.id}") - conn.rebuild_server(server_id, image.id) - -def reboot_openstack_server(server_id): - conn = openstack.connection.from_config() - server = conn.get_server(server_id) - logger.info(f"rebooting server %{server_id} with image %{image_uuid}") - conn.reboot_server(server_id, 'SOFT') - -def expand_nodes(hostlist_expr): - scontrol = subprocess.run(['scontrol', 'show', 'hostnames', hostlist_expr], stdout=subprocess.PIPE, universal_newlines=True) - return scontrol.stdout.strip().split('\n') - -def rebuild_or_reboot(): - """ Rebuild or reboot an OpenStack node from the controller. 
""" - - hostlist_expr = sys.argv[1] - logger.info(f"Slurmctld invoked RebootProgram {hostlist_expr}") - for node in expand_nodes(hostlist_expr): - server_uuid = get_openstack_server_id(node) - if not server_uuid: - continue # can just try next one (but really should now exit > 0 even if others succeed) - reason = get_reboot_reason(node) - if not reason.startswith("rebuild"): - reboot_openstack_server(server_uuid) # TODO: support selecting soft or hard reboot via reason? - else: - rebuild_openstack_server(server_uuid, reason) - -if __name__ == "__main__": - try: - rebuild_or_reboot() - except: - logger.exception('Exception in main:') - raise \ No newline at end of file diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 2a73c3a84..f7c60543f 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -37,16 +37,16 @@ mode: u=r,go= owner: slurm group: slurm - - name: Setup slurm tools # this installs RebootProgram for rebuild too + - name: Setup Python/Slurm tools include_role: name: stackhpc.slurm_openstack_tools.pytools - vars: - become_user: slurm - become_flags: '-s /bin/bash' # as has shell specified as /sbin/nologin - name: Configure autoscale programs and parameters include_role: name: autoscale when: "'autoscale' in group_names" + - name: Configure rebuild programs and parameters + include_role: + name: rebuild - name: Setup slurm hosts: openhpc diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 2139d140b..d9b9eca49 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -27,10 +27,9 @@ openhpc_packages: "{{ openhpc_packages_default + openhpc_packages_extra }}" openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}" openhpc_login_only_nodes: login openhpc_config_default: - SlurmctldParameters: - - enable_configless + SlurmctldParameters: enable_configless openhpc_config_extra: {} -openhpc_config: "{{ openhpc_config_default | combine(rebuild_openhpc_config, openhpc_config_extra, list_merge='append') }}" +openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}" openhpc_suspend_exc_nodes_extra: [] openhpc_suspend_exc_nodes: "{{ autoscale_suspend_exc_nodes + openhpc_suspend_exc_nodes_extra }}" diff --git a/environments/common/inventory/group_vars/all/rebuild.yml b/environments/common/inventory/group_vars/all/rebuild.yml index ba631a07c..0be916def 100644 --- a/environments/common/inventory/group_vars/all/rebuild.yml +++ b/environments/common/inventory/group_vars/all/rebuild.yml @@ -1,4 +1 @@ -_rebuild_openhpc_config: - RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild -rebuild_openhpc_config: "{{ _rebuild_openhpc_config if groups.get('rebuild', []) else {} }}" openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: rename? 
From 60e74a89de141d3c6fbffb9c7085ad5b9a0f57f3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 5 Oct 2021 15:39:26 +0000 Subject: [PATCH 065/133] use openhpc role's extra_nodes feature --- .../autoscale/filter_plugins/openhpc_partitions.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py b/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py index d424e419b..1e9a778d1 100644 --- a/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py +++ b/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py @@ -28,15 +28,21 @@ def modify_autoscale_partitions(partitions): for part in partitions: for group in part.get('groups', [part]): group_name = group.get('name', '') + extra_nodes = group.get('extra_nodes', []) + if 'cloud_nodes' in group: if 'cloud_instances' not in group: raise errors.AnsibleFilterError(f"`openhpc_slurm_partitions` group '{group_name}' specifies 'cloud_nodes' but is missing 'cloud_instances'.") missing_attrs = ','.join(set(REQUIRED_INSTANCE_ATTRS).difference(group['cloud_instances'])) if missing_attrs: raise errors.AnsibleFilterError(f"`openhpc_slurm_partitions` group '{group_name}' item 'cloud_instances' is missing items: {missing_attrs}.") - if 'features' not in group: - group['features'] = [] - group['features'].extend(['%s=%s' % (k, v) for (k, v) in group['cloud_instances'].items()]) + cloud_names = group['cloud_nodes'] + # TODO: check for cloud nodes overlapping real ones? + + features = ['%s=%s' % (k, v) for (k, v) in group['cloud_instances'].items()] + cloud_nodes = f'NodeName={cloud_names} State=CLOUD Features={features}' + + extra_nodes.append(cloud_nodes) return partitions From 1ee10e9931c9478a48922d09d6d5495a647515fc Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Oct 2021 13:09:06 +0000 Subject: [PATCH 066/133] fix actually generataing cloud_node info --- .../filter_plugins/openhpc_partitions.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py b/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py index 1e9a778d1..46d648177 100644 --- a/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py +++ b/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py @@ -21,28 +21,32 @@ REQUIRED_INSTANCE_ATTRS=('flavor', 'image', 'keypair', 'network') -def modify_autoscale_partitions(partitions): +def modify_autoscale_partitions(partitions, flavors): """ TODO: docs """ for part in partitions: for group in part.get('groups', [part]): group_name = group.get('name', '') - extra_nodes = group.get('extra_nodes', []) if 'cloud_nodes' in group: if 'cloud_instances' not in group: raise errors.AnsibleFilterError(f"`openhpc_slurm_partitions` group '{group_name}' specifies 'cloud_nodes' but is missing 'cloud_instances'.") - missing_attrs = ','.join(set(REQUIRED_INSTANCE_ATTRS).difference(group['cloud_instances'])) + missing_attrs = ', '.join(set(REQUIRED_INSTANCE_ATTRS).difference(group['cloud_instances'])) if missing_attrs: raise errors.AnsibleFilterError(f"`openhpc_slurm_partitions` group '{group_name}' item 'cloud_instances' is missing items: {missing_attrs}.") cloud_names = group['cloud_nodes'] # TODO: check for cloud nodes overlapping real ones? 
features = ['%s=%s' % (k, v) for (k, v) in group['cloud_instances'].items()] - cloud_nodes = f'NodeName={cloud_names} State=CLOUD Features={features}' - - extra_nodes.append(cloud_nodes) + cloud_nodes = { + 'NodeName': cloud_names, + 'State':'CLOUD', + 'Features': ','.join(features), + } + + group['extra_nodes'] = group.get('extra_nodes', []) + group['extra_nodes'].append(cloud_nodes) return partitions From 8a956677af8867dd9c60f814cfff71766eb25530 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Oct 2021 13:36:05 +0000 Subject: [PATCH 067/133] retrieve cloud_node instance cpu/mem from openstack --- .../openhpc_partitions.cpython-36.pyc | Bin 1462 -> 0 bytes .../filter_plugins/openhpc_partitions.py | 19 +++++++++++++++--- ansible/roles/autoscale/tasks/main.yml | 16 ++++++++++++++- 3 files changed, 31 insertions(+), 4 deletions(-) delete mode 100644 ansible/roles/autoscale/filter_plugins/__pycache__/openhpc_partitions.cpython-36.pyc diff --git a/ansible/roles/autoscale/filter_plugins/__pycache__/openhpc_partitions.cpython-36.pyc b/ansible/roles/autoscale/filter_plugins/__pycache__/openhpc_partitions.cpython-36.pyc deleted file mode 100644 index 100545137c74508e603d7c79d3de87c9f5cc88b6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1462 zcmaJ>PjBNy6rUM8w(F#@D%G-E5E!IYh?XV@ZcC`P+7`4DWJTLmC5x5U^-QvM9owB5 zmo{=FMEU{P3m3F^xFAlP_#|`X#8>EvH%`+=fEa7uyz#uhzxQT7SzUEkUOo+fvl03e zEe#X+hcHDS2t^d9XpDV~HBVCFlX^9LgEoDW+SIu~e&Y*7SEze|sLQOG;oFq#qfYBD zuz)&PS&Z|Xi-EF+X?U7*)riO8h$;J+ofl!um7Ov9ZO)G)tTkXXU^*}#!4!W0Is3gr zJLuW3Q+)Pwg%b#?!XrE*@bAYdsR;Zb)R>W}LCuOBTYQ5yV72*|5+{UOmGKjz_A6tC zzsJ-`3~9>73@27aXyY!L;TfKqmC3(bl3uAy&~aaZ9OyJ_Yd?$-Xf*pZy^9i?w&?1G z-hq|GnL&)&7DguvQL=^%cUYp$V4xa2hdEn7J60QeBStFj8g-~V|j@hKyFEh%9 zamMJD-;C0{q(PQbCj8gun>Z6P%pxWx??2BAmc1yVK%^xf2SvzbEaN;A&%N5F*PeXZ z_C&#=co;L`wU^Aby`xfkvGB&R5OFqI9A2AmcQ5gWllyHimTdg*(`8jHAs%=Nca<|_ zp)5HQ*9A17X1a*`x`1y30pQTH^>4@&PoaYaa1 zQutWmQ{Do-np__CUgTrei&!Re(W?{R42xouveRCeiTEgGJ)Wmb^uiJ(q7e4OIF*bC zMOuy^z1{+~8@#&3`K8-WVYzf!G3Oq1#ze z#)wI-)z#YeJfhF*0CsinD>KRCOc{bn_2%C0*H0hs?d}AR2m1%xgGal;_QAp4zH-6k zkZ}fOhAVR>Sw`WeXp}n*<=(`tOwFrCU2WyoOQ74xU%e`85K76#4SxKVj_a-eN$QSH zeggENqHnS!Omf2Ly3iz_xkwXPW=ll5g!-7A3s zgx~};C5YspPuL zxdo)-DEq1x=Y3!$ah8N1E5aGSt)(=8%F2%t7RlPse|c0+e;(u38i)%QNL>wL66cM- E0Z#me!T') @@ -38,17 +43,25 @@ def modify_autoscale_partitions(partitions, flavors): cloud_names = group['cloud_nodes'] # TODO: check for cloud nodes overlapping real ones? 
+ flavor = [f for f in flavors if f['name'] == group['cloud_instances']['flavor']] + if len(flavor) != 1: + raise errors.AnsibleFilterError(f'expected one flavor matching {group["cloud_instances"]["flavor"]}, found {len(flavor)}: {flavor}') + flavor = flavor[0] + ram_mb = int(flavor['ram'] * group.get('ram_multiplier', openhpc_ram_multiplier)) # ram in flavor in MB, so no units conversion needed + features = ['%s=%s' % (k, v) for (k, v) in group['cloud_instances'].items()] cloud_nodes = { 'NodeName': cloud_names, 'State':'CLOUD', 'Features': ','.join(features), + 'CPUs': flavor['vcpus'], + 'RealMemory': group.get('ram_mb', ram_mb) } group['extra_nodes'] = group.get('extra_nodes', []) group['extra_nodes'].append(cloud_nodes) - return partitions + return openhpc_slurm_partitions class FilterModule(object): diff --git a/ansible/roles/autoscale/tasks/main.yml b/ansible/roles/autoscale/tasks/main.yml index 1ad1bbefc..cbaca6b13 100644 --- a/ansible/roles/autoscale/tasks/main.yml +++ b/ansible/roles/autoscale/tasks/main.yml @@ -1,8 +1,22 @@ --- +- name: Get cloud_node specs + shell: + cmd: "openstack flavor show --format json {{ item.cloud_instances.flavor }}" + delegate_to: localhost + run_once: true + loop: "{{ openhpc_slurm_partitions }}" + when: "'cloud_instances' in item" + register: _os_flavors + become: no +- name: Manipulate flavor information + set_fact: + flavor_info: "{{ _os_flavors.results | map(attribute='stdout') | map('from_json') }}" # list of json info +- debug: + var: flavor_info - name: Modify openhpc_slurm_partitions set_fact: - openhpc_slurm_partitions: "{{ openhpc_slurm_partitions | modify_autoscale_partitions }}" + openhpc_slurm_partitions: "{{ openhpc_slurm_partitions | modify_autoscale_partitions(flavor_info, openhpc_ram_multiplier) }}" - name: Merge autoscale configuration set_fact: openhpc_config: "{{ autoscale_openhpc_config | combine(openhpc_config, list_merge='append') }}" From a96e68c9de4e87dc10bd60aa64716cd9ff85adbe Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Oct 2021 14:04:51 +0000 Subject: [PATCH 068/133] WIP autoscale README --- ansible/roles/autoscale/README.md | 94 ++++++++++++++++--------------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/ansible/roles/autoscale/README.md b/ansible/roles/autoscale/README.md index 69e3ddeb3..aa4eed164 100644 --- a/ansible/roles/autoscale/README.md +++ b/ansible/roles/autoscale/README.md @@ -4,82 +4,86 @@ Support autoscaling nodes on OpenStack clouds, i.e. creating nodes when necessar This is implemented using Slurm's ["elastic computing"](https://slurm.schedmd.com/elastic_computing.html) features which are based on Slurm's [power saving](https://slurm.schedmd.com/power_save.html) features. - -NOTES TODO: -- Won't get monitoring for autoscaling nodes -- Describe autoscale vs `State=CLOUD` and powersaving enablement. -- Describe groups. -- Describe cpu/memory info requirements (inc. for mixed partitions) -- Describe what happens on failure. -- Note that DNS is REQUIRED for this. +Add the `control` group to the `autoscale` group to activate this functionality in the `ansible/slurm.yml` playbook. Note some role variables are likely to need configuring. By default, node creation and deletion will be logged in the control node's syslog. ## Requirements -- Role `stackhpc.slurm_openstack_tools.pytools`. Installs [slurm-openstack-tools](github.com/stackhpc/slurm-openstack-tools) which provides a venv with the `openstacksdk`. +- Working DNS. 
+- Active OpenStack credentials on localhost (e.g a sourced `openrc.sh` in the shell running ansible). +- Role `stackhpc.slurm_openstack_tools.pytools`. Installs [slurm-openstack-tools](github.com/stackhpc/slurm-openstack-tools) which provides a venv with the `openstacksdk` and the resume/suspend scripts. - Role `stackhpc.openhpc` to create a Slurm cluster. -- This role should be run on the Slurm controller only, i.e. add the `control` group to the `autoscale` group to activate this functionality. +- This role should be run on the Slurm controller only. ## Role Variables -- `openhpc_slurm_partitions`: This role modifies what the partitions/groups defined [openhpc_slurm_partitions](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) in the by `stackhpc.openhpc` role accept: - - `cloud_nodes`: Optional. As per the `stackhpc.openhpc` docs this defines nodes in a ["CLOUD" state](https://slurm.schedmd.com/slurm.conf.html#OPT_CLOUD), i.e. treated as powered down/not existing when the Slurm control daemon starts. The value is a suffix for the group/partition's node names in Slurm's hostlist expression format (e.g. `-[11-25]`) and therefore defines the number of CLOUD-state nodes. - - `cloud_instances`: Required if `cloud_nodes` is defined. A dict defining the `flavor`, `image`, `keypair` and `network` to use for CLOUD-state instances in this partition/group. Values for these parameters may be either names (if unique in the cloud) or IDs. - - Some examples are given below. - -- `autoscale_show_suspended_nodes`: Optional, default `true`. Whether to show suspended/powered-down nodes in `sinfo` etc. See `slurm.conf` parameter [PrivateData - cloud](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_cloud). - -The following variables have defaults useful for debugging autoscaling, but may be altered for production: -- `autoscale_debug_powersaving`: Optional, default `true`. Log additional information for powersaving, see `slurm.conf` parameter [DebugFlags - PowerSave](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_PowerSave_2). -- `autoscale_slurmctld_syslog_debug`: Optional, default `info`. Syslog logging level. See `slurm.conf` parameter [SlurmctldSyslogDebug](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SlurmctldSyslogDebug). +### openhpc_slurm_partitions +This role modifies what the [openhpc_slurm_partitions variable](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) in the `stackhpc.openhpc` role accepts. Partition/group definitions may additionally include: +- `cloud_nodes`: Optional. Slurm hostlist expression (e.g. `'small-[8,10-16]'`) defining names of nodes to be defined in a ["CLOUD" state](https://slurm.schedmd.com/slurm.conf.html#OPT_CLOUD), i.e. not operational when the Slurm control daemon starts. +- `cloud_instances`: Required if `cloud_nodes` is defined. A mapping with keys `flavor`, `image`, `keypair` and `network` defining the OpenStack ID or names of properties for the CLOUD-state instances. 
-The following variables are likely to need tuning for the specific site/instances: -- `autoscale_suspend_time`: Optional, default 120s TODO https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime -- `autoscale_suspend_timeout`: Optional, default 30s TODO https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout -- `autoscale_resume_timeout`: Optional, default 300s TODO https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout +Partitions/groups defining `cloud_nodes` may or may not also contain non-CLOUD state nodes (i.e. nodes in a matching inventory group). For CLOUD-state nodes, memory and CPU information is retrieved from OpenStack for the specified flavors. The `stackhpc.openhpc` group/partition options `ram_mb` and `ram_multiplier` and role variable `openhpc_ram_multiplier` are handled exactly as for non-CLOUD state nodes. This implies that if CLOUD and non-CLOUD state nodes are mixed in a single group all nodes must be homogenous in terms of processors/memory. -### Processor/memory information -Non-CLOUD-state nodes in a group/partition are defined by the hosts in an inventory group named `_` as per `stackhpc.openhpc` [docs](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) and processor/memory information is automatically retrieved from them. +Some examples are given below. Note that currently monitoring is not enabled for CLOUD-state nodes. -- If a group/partition contains both CLOUD and non-CLOUD nodes the processor/memory information for the CLOUD nodes is assumed to match that retrieved for the non-CLOUD nodes. -- If a group/partition only contains CLOUD-state nodes (i.e. no matching inventory group or it is empty) then processor/memory information must be specified using the `ram_mb`, `sockets`, `cores_per_socket` and `threads_per_core` options. +### Other variables +TODO: what about suspend_excl +The following variables are likely to need tuning for the specific site/instances: +- `autoscale_suspend_time`: Optional, default 120s. See `slurm.conf` parameter [SuspendTime](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SuspendTime). +- `autoscale_suspend_timeout`: Optional, default 30s. See `slurm.conf` parameter [SuspendTimeout](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SuspendTimeout). +- `autoscale_resume_timeout`: Optional, default 300s See `slurm.conf` parameter [ResumeTimeout](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_ResumeTimeout). +The following variables have defaults useful for debugging autoscaling, but may be altered for production: +- `autoscale_show_suspended_nodes`: Optional, default `true`. Whether to show suspended/powered-down nodes in `sinfo` etc. See `slurm.conf` parameter [PrivateData - cloud](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_cloud). +- `autoscale_debug_powersaving`: Optional, default `true`. Log additional information for powersaving, see `slurm.conf` parameter [DebugFlags - PowerSave](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_PowerSave_2). +- `autoscale_slurmctld_syslog_debug`: Optional, default `info`. Syslog logging level. See `slurm.conf` parameter [SlurmctldSyslogDebug](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SlurmctldSyslogDebug). +### Examples - - ```yaml - cloud_instances: - flavor: general.v1.medium - image: ohpc-compute-210909-1316.qcow2 - keypair: centos-at-steveb-ansible - network: "{{ autoscale_network }}" +Below is an example of partition definition, e.g. 
in `environments//inventory/group_vars/openhpc/overrides.yml`. Not shown here the inventory group `dev_small` contains 2 (non-CLOUD state) nodes. The "small" partition is the default and contains 2 non-CLOUD and 2 CLOUD nodes. The "burst" partition contains only CLOUD-state nodes. -A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. +```yaml +openhpc_cluster_name: dev +general_v1_small: + image: ohpc-compute-210909-1316.qcow2 + flavor: general.v1.small + keypair: centos-at-steveb-ansible + network: stackhpc-ipv4-geneve +general_v1_medium: + image: ohpc-compute-210909-1316.qcow2 + flavor: general.v1.medium + keypair: centos-at-steveb-ansible + network: stackhpc-ipv4-geneve +openhpc_slurm_partitions: +- name: small + default: yes + cloud_nodes: dev-small-[2-3] + cloud_instances: "{{ general_v1_small }}" +- name: burst + default: no + cloud_nodes: 'burst-[0-3]' + cloud_instances: "{{ general_v1_medium }}" +``` Dependencies ------------ -A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. +TODO: A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. Example Playbook ---------------- -Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: - - - hosts: servers - roles: - - { role: username.rolename, x: 42 } +See ansible/slurm.yml License ------- -BSD +Apache v2 Author Information ------------------ -An optional section for the role authors to include contact information, or a website (HTML is not allowed). +StackHPC Ltd. 
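
> Editor's note: since the actual resume/suspend programs live in slurm-openstack-tools, it may help to see the general shape of what a Slurm `ResumeProgram` has to do. The sketch below is *not* the installed `slurm-openstack-resume` script; it only illustrates the flow under stated assumptions. Slurm invokes the program with a hostlist expression; the cloud name `openstack` and the hard-coded instance properties (taken from the example above) are assumptions — real tooling would look these up per node, e.g. from the `flavor=...,image=...` Features this role adds to the CLOUD node definitions.

```python
# Illustrative ResumeProgram sketch - not the slurm-openstack-tools script
# installed as /opt/slurm-tools/bin/slurm-openstack-resume.
import subprocess
import sys

import openstack  # openstacksdk, available in the slurm-tools venv


def expand_hostlist(expr):
    """Expand a Slurm hostlist expression such as 'dev-small-[2-3]' into names."""
    out = subprocess.run(
        ['scontrol', 'show', 'hostnames', expr],
        stdout=subprocess.PIPE, universal_newlines=True, check=True)
    return out.stdout.split()


def main():
    hostlist_expr = sys.argv[1]  # Slurm passes the nodes to power up
    conn = openstack.connect(cloud='openstack')  # assumed clouds.yaml entry name
    for node in expand_hostlist(hostlist_expr):
        # Instance properties are hard-coded from the README example for brevity;
        # a real resume program would derive them per node.
        conn.create_server(
            name=node,
            image='ohpc-compute-210909-1316.qcow2',
            flavor='general.v1.small',
            key_name='centos-at-steveb-ansible',
            network='stackhpc-ipv4-geneve',
            wait=False,
        )


if __name__ == '__main__':
    main()
```
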
From 20d98fcb1bb22eebc7c494ed0ecfe84d61cecde3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Oct 2021 14:05:20 +0000 Subject: [PATCH 069/133] smslabs: update demo partition --- .../inventory/group_vars/openhpc/partitions.yml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml index 1180bf9e5..95fa1d839 100755 --- a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml +++ b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml @@ -12,15 +12,12 @@ general_v1_medium: openhpc_slurm_partitions: - name: small - default: no - cloud_nodes: '-[2-3]' + default: yes + cloud_nodes: dev-small-[2-3] cloud_instances: "{{ general_v1_small }}" - name: burst - default: yes - cloud_nodes: '-[1-4]' + default: no + cloud_nodes: 'burst-[0-3]' cloud_instances: "{{ general_v1_medium }}" - ram_mb: "{{ (15258 * 0.95) | int }}" - sockets: 1 - cores_per_socket: 4 - threads_per_core: 1 + From 173fe3e34d25f1c6a0a23d4ccb878322c5cde55c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Oct 2021 14:06:36 +0000 Subject: [PATCH 070/133] add install tag to first run of stackhpc.openhpc:install.yml --- ansible/slurm.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index f7c60543f..6a76aaf85 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -23,6 +23,7 @@ import_role: name: stackhpc.openhpc tasks_from: install.yml + tags: install - name: Create /etc/openstack file: path: /etc/openstack From 474c8384ccc0752d9843998a1dc69e2b89be9390 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 6 Oct 2021 14:15:20 +0000 Subject: [PATCH 071/133] fix changed_when --- ansible/roles/autoscale/tasks/main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/roles/autoscale/tasks/main.yml b/ansible/roles/autoscale/tasks/main.yml index cbaca6b13..6dbc37325 100644 --- a/ansible/roles/autoscale/tasks/main.yml +++ b/ansible/roles/autoscale/tasks/main.yml @@ -9,6 +9,7 @@ when: "'cloud_instances' in item" register: _os_flavors become: no + changed_when: false - name: Manipulate flavor information set_fact: flavor_info: "{{ _os_flavors.results | map(attribute='stdout') | map('from_json') }}" # list of json info From 8054f7793c06263683920129873e740aac2636d6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Oct 2021 08:31:44 +0000 Subject: [PATCH 072/133] add autoscale_clouds --- ansible/roles/autoscale/README.md | 2 ++ ansible/roles/autoscale/defaults/main.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/ansible/roles/autoscale/README.md b/ansible/roles/autoscale/README.md index aa4eed164..eda01b774 100644 --- a/ansible/roles/autoscale/README.md +++ b/ansible/roles/autoscale/README.md @@ -27,6 +27,8 @@ Some examples are given below. Note that currently monitoring is not enabled for ### Other variables +- `autoscale_clouds`: Optional, path to a `clouds.yaml` file containing a single cloud. Defaults to `~/.config/openstack/clouds.yaml`. It is recommended this is an [application credential](https://docs.openstack.org/keystone/latest/user/application_credentials.html). This can be created in Horizon via Identity > Application Credentials > +Create Application Credential. The usual role required is `member`. Using access rules has been found not to work at present. 
Note that the downloaded credential can be encrpyted using `ansible-vault` to allow it to be committed to source control. It will automatically be decrypted when copied onto the compute nodes. + TODO: what about suspend_excl The following variables are likely to need tuning for the specific site/instances: - `autoscale_suspend_time`: Optional, default 120s. See `slurm.conf` parameter [SuspendTime](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SuspendTime). diff --git a/ansible/roles/autoscale/defaults/main.yml b/ansible/roles/autoscale/defaults/main.yml index d8a2b3bac..c1f76c899 100644 --- a/ansible/roles/autoscale/defaults/main.yml +++ b/ansible/roles/autoscale/defaults/main.yml @@ -1,3 +1,5 @@ +autoscale_clouds: ~/.config/openstack/clouds.yaml + # recommended: autoscale_show_suspended_nodes: true From 8c1b4be7a4ed8a23178582defb765ccc9d21e0f4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Oct 2021 08:48:07 +0000 Subject: [PATCH 073/133] move suspend_excl_nodes definition from openhpc role to here --- ansible/roles/autoscale/README.md | 40 ++++++++----------- ansible/roles/autoscale/defaults/main.yml | 1 + .../inventory/group_vars/all/autoscale.yml | 8 ++-- .../inventory/group_vars/all/openhpc.yml | 3 -- 4 files changed, 22 insertions(+), 30 deletions(-) diff --git a/ansible/roles/autoscale/README.md b/ansible/roles/autoscale/README.md index eda01b774..4e1ac8336 100644 --- a/ansible/roles/autoscale/README.md +++ b/ansible/roles/autoscale/README.md @@ -10,35 +10,33 @@ Add the `control` group to the `autoscale` group to activate this functionality - Working DNS. - Active OpenStack credentials on localhost (e.g a sourced `openrc.sh` in the shell running ansible). -- Role `stackhpc.slurm_openstack_tools.pytools`. Installs [slurm-openstack-tools](github.com/stackhpc/slurm-openstack-tools) which provides a venv with the `openstacksdk` and the resume/suspend scripts. +- Role `stackhpc.slurm_openstack_tools.pytools`. Installs [slurm-openstack-tools](github.com/stackhpc/slurm-openstack-tools) which provides a venv with the `openstacksdk` and the required resume/suspend scripts. - Role `stackhpc.openhpc` to create a Slurm cluster. - This role should be run on the Slurm controller only. ## Role Variables -### openhpc_slurm_partitions -This role modifies what the [openhpc_slurm_partitions variable](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) in the `stackhpc.openhpc` role accepts. Partition/group definitions may additionally include: -- `cloud_nodes`: Optional. Slurm hostlist expression (e.g. `'small-[8,10-16]'`) defining names of nodes to be defined in a ["CLOUD" state](https://slurm.schedmd.com/slurm.conf.html#OPT_CLOUD), i.e. not operational when the Slurm control daemon starts. -- `cloud_instances`: Required if `cloud_nodes` is defined. A mapping with keys `flavor`, `image`, `keypair` and `network` defining the OpenStack ID or names of properties for the CLOUD-state instances. - -Partitions/groups defining `cloud_nodes` may or may not also contain non-CLOUD state nodes (i.e. nodes in a matching inventory group). For CLOUD-state nodes, memory and CPU information is retrieved from OpenStack for the specified flavors. The `stackhpc.openhpc` group/partition options `ram_mb` and `ram_multiplier` and role variable `openhpc_ram_multiplier` are handled exactly as for non-CLOUD state nodes. This implies that if CLOUD and non-CLOUD state nodes are mixed in a single group all nodes must be homogenous in terms of processors/memory. 
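
> Editor's note: before wiring an application credential into `autoscale_clouds`, it is worth confirming it works. A read-only sanity check (a sketch, assuming the `clouds.yaml` entry is named `openstack` — adjust to whatever key your file defines) can be run from any environment with `openstacksdk`; listing flavors also shows the `vcpus`/`ram` values the role later uses for CLOUD-state node definitions.

```python
# Sanity-check sketch: verify a clouds.yaml application credential works.
# openstacksdk searches ./clouds.yaml, ~/.config/openstack/ and /etc/openstack/.
import openstack

conn = openstack.connect(cloud='openstack')  # assumed cloud name in clouds.yaml
for flavor in conn.compute.flavors():
    # vcpus and ram (MiB) are what the autoscale role needs for CPUs/RealMemory
    print(flavor.name, flavor.vcpus, flavor.ram)
```
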
- -Some examples are given below. Note that currently monitoring is not enabled for CLOUD-state nodes. - -### Other variables - - `autoscale_clouds`: Optional, path to a `clouds.yaml` file containing a single cloud. Defaults to `~/.config/openstack/clouds.yaml`. It is recommended this is an [application credential](https://docs.openstack.org/keystone/latest/user/application_credentials.html). This can be created in Horizon via Identity > Application Credentials > +Create Application Credential. The usual role required is `member`. Using access rules has been found not to work at present. Note that the downloaded credential can be encrpyted using `ansible-vault` to allow it to be committed to source control. It will automatically be decrypted when copied onto the compute nodes. -TODO: what about suspend_excl The following variables are likely to need tuning for the specific site/instances: - `autoscale_suspend_time`: Optional, default 120s. See `slurm.conf` parameter [SuspendTime](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SuspendTime). - `autoscale_suspend_timeout`: Optional, default 30s. See `slurm.conf` parameter [SuspendTimeout](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SuspendTimeout). - `autoscale_resume_timeout`: Optional, default 300s See `slurm.conf` parameter [ResumeTimeout](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_ResumeTimeout). -The following variables have defaults useful for debugging autoscaling, but may be altered for production: +The following variables may need altering for production: - `autoscale_show_suspended_nodes`: Optional, default `true`. Whether to show suspended/powered-down nodes in `sinfo` etc. See `slurm.conf` parameter [PrivateData - cloud](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_cloud). - `autoscale_debug_powersaving`: Optional, default `true`. Log additional information for powersaving, see `slurm.conf` parameter [DebugFlags - PowerSave](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_PowerSave_2). - `autoscale_slurmctld_syslog_debug`: Optional, default `info`. Syslog logging level. See `slurm.conf` parameter [SlurmctldSyslogDebug](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SlurmctldSyslogDebug). +- `autoscale_suspend_exc_nodes`: Optional. List of nodenames (or Slurm hostlist expressions) to exclude from "power saving", i.e. they will not be autoscaled away. + +## stackhpc.openhpc role variables +This role modifies what the [openhpc_slurm_partitions variable](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) in the `stackhpc.openhpc` role accepts. Partition/group definitions may additionally include: +- `cloud_nodes`: Optional. Slurm hostlist expression (e.g. `'small-[8,10-16]'`) defining names of nodes to be defined in a ["CLOUD" state](https://slurm.schedmd.com/slurm.conf.html#OPT_CLOUD), i.e. not operational when the Slurm control daemon starts. +- `cloud_instances`: Required if `cloud_nodes` is defined. A mapping with keys `flavor`, `image`, `keypair` and `network` defining the OpenStack ID or names of properties for the CLOUD-state instances. + +Partitions/groups defining `cloud_nodes` may or may not also contain non-CLOUD state nodes (i.e. nodes in a matching inventory group). For CLOUD-state nodes, memory and CPU information is retrieved from OpenStack for the specified flavors. 
The `stackhpc.openhpc` group/partition options `ram_mb` and `ram_multiplier` and role variable `openhpc_ram_multiplier` are handled exactly as for non-CLOUD state nodes. This implies that if CLOUD and non-CLOUD state nodes are mixed in a single group all nodes must be homogenous in terms of processors/memory. + +Some examples are given below. Note that currently monitoring is not enabled for CLOUD-state nodes. ### Examples @@ -70,22 +68,18 @@ openhpc_slurm_partitions: cloud_instances: "{{ general_v1_medium }}" ``` -Dependencies ------------- +# Dependencies -TODO: A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. +`stackhpc.openhpc` role as described above. -Example Playbook ----------------- +# Example Playbook See ansible/slurm.yml -License -------- +# License Apache v2 -Author Information ------------------- +# Author Information StackHPC Ltd. diff --git a/ansible/roles/autoscale/defaults/main.yml b/ansible/roles/autoscale/defaults/main.yml index c1f76c899..70916ddc2 100644 --- a/ansible/roles/autoscale/defaults/main.yml +++ b/ansible/roles/autoscale/defaults/main.yml @@ -33,4 +33,5 @@ autoscale_openhpc_config: SuspendTime: "{{ autoscale_suspend_time }}" SuspendTimeout: "{{ autoscale_suspend_timeout }}" ResumeTimeout: "{{ autoscale_resume_timeout }}" + SuspendExcNodes: "{{ autoscale_suspend_exc_nodes | join(',') }}" # See also TreeWidth but shouldn't needs setting with cloud_dns diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml index 14c3ef38a..b4816f571 100644 --- a/environments/common/inventory/group_vars/all/autoscale.yml +++ b/environments/common/inventory/group_vars/all/autoscale.yml @@ -1,4 +1,4 @@ -openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: rename? - -# TODO: should this get moved?? -autoscale_suspend_exc_nodes: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. all non-CLOUD nodes, and prevent login-only slurmd nodes getting powered down +autoscale_rebuild_clouds: ~/.config/openstack/clouds.yaml +autoscale_suspend_exc_nodes_default: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. 
all non-CLOUD nodes, and prevent login-only slurmd nodes getting powered down +autoscale_suspend_exc_nodes_extra: [] +autoscale_suspend_exc_nodes: "{{ autoscale_suspend_exc_nodes_default + autoscale_suspend_exc_nodes_extra }}" diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index d9b9eca49..e587c7884 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -30,6 +30,3 @@ openhpc_config_default: SlurmctldParameters: enable_configless openhpc_config_extra: {} openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}" - -openhpc_suspend_exc_nodes_extra: [] -openhpc_suspend_exc_nodes: "{{ autoscale_suspend_exc_nodes + openhpc_suspend_exc_nodes_extra }}" From dfc859e47344568773666745be6f754e2c7911c0 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Oct 2021 08:49:53 +0000 Subject: [PATCH 074/133] use separate tasks for rebuild and autoscale and move rebuild role into appliance --- ansible/roles/autoscale/tasks/main.yml | 30 +++++++++++- ansible/roles/rebuild/tasks/main.yml | 19 ++++++++ ansible/slurm.yml | 46 ++++++------------- .../inventory/group_vars/all/rebuild.yml | 2 +- 4 files changed, 61 insertions(+), 36 deletions(-) diff --git a/ansible/roles/autoscale/tasks/main.yml b/ansible/roles/autoscale/tasks/main.yml index 6dbc37325..6283410f4 100644 --- a/ansible/roles/autoscale/tasks/main.yml +++ b/ansible/roles/autoscale/tasks/main.yml @@ -1,4 +1,29 @@ --- +- name: Install slurm packages to create slurm user + import_role: + name: stackhpc.openhpc + tasks_from: install.yml + tags: install + +- name: Create /etc/openstack + file: + path: /etc/openstack + state: directory + owner: slurm + group: slurm + mode: u=rX,go= + +- name: Copy out clouds.yaml + copy: + src: "{{ autoscale_clouds }}" + dest: /etc/openstack/clouds.yaml + mode: u=r,go= + owner: slurm + group: slurm + +- name: Setup Python/Slurm tools + include_role: + name: stackhpc.slurm_openstack_tools.pytools - name: Get cloud_node specs shell: @@ -10,14 +35,15 @@ register: _os_flavors become: no changed_when: false + - name: Manipulate flavor information set_fact: flavor_info: "{{ _os_flavors.results | map(attribute='stdout') | map('from_json') }}" # list of json info -- debug: - var: flavor_info + - name: Modify openhpc_slurm_partitions set_fact: openhpc_slurm_partitions: "{{ openhpc_slurm_partitions | modify_autoscale_partitions(flavor_info, openhpc_ram_multiplier) }}" + - name: Merge autoscale configuration set_fact: openhpc_config: "{{ autoscale_openhpc_config | combine(openhpc_config, list_merge='append') }}" diff --git a/ansible/roles/rebuild/tasks/main.yml b/ansible/roles/rebuild/tasks/main.yml index a191e807a..281b46043 100644 --- a/ansible/roles/rebuild/tasks/main.yml +++ b/ansible/roles/rebuild/tasks/main.yml @@ -1,4 +1,23 @@ --- +- block: + - name: Create /etc/openstack + file: + path: /etc/openstack + state: directory + owner: root + group: root + mode: '0400' + - name: Copy out clouds.yaml + copy: + src: "{{ openhpc_rebuild_clouds }}" + dest: /etc/openstack/clouds.yaml + owner: root + group: root + mode: '0400' + - name: Setup slurm tools + include_role: + name: stackhpc.slurm_openstack_tools.pytools + when: openhpc_enable.batch - name: Merge rebuild configuration set_fact: diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 6a76aaf85..de4386e53 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -9,45 +9,25 
@@ - include_role: name: geerlingguy.mysql -- name: Enable Slurm/OpenStack integrations - hosts: - - rebuild - - autoscale - become: true +- name: Setup Slurm-driven reimage on OpenStack + hosts: rebuild + become: yes tags: - rebuild + - openhpc + tasks: + - import_role: + name: rebuild + +- name: Setup autoscaling on OpenStack + hosts: autoscale + become: yes + tags: - autoscale - openhpc tasks: - - name: Install slurm packages to create slurm user - import_role: - name: stackhpc.openhpc - tasks_from: install.yml - tags: install - - name: Create /etc/openstack - file: - path: /etc/openstack - state: directory - owner: slurm # TODO: check this works for rebuild too - group: slurm - mode: u=rX,go= - - name: Copy out clouds.yaml - copy: - src: "{{ openhpc_rebuild_clouds }}" # TODO: name is wrong really! - dest: /etc/openstack/clouds.yaml - mode: u=r,go= - owner: slurm - group: slurm - - name: Setup Python/Slurm tools - include_role: - name: stackhpc.slurm_openstack_tools.pytools - - name: Configure autoscale programs and parameters - include_role: + - import_role: name: autoscale - when: "'autoscale' in group_names" - - name: Configure rebuild programs and parameters - include_role: - name: rebuild - name: Setup slurm hosts: openhpc diff --git a/environments/common/inventory/group_vars/all/rebuild.yml b/environments/common/inventory/group_vars/all/rebuild.yml index 0be916def..e40ffe66c 100644 --- a/environments/common/inventory/group_vars/all/rebuild.yml +++ b/environments/common/inventory/group_vars/all/rebuild.yml @@ -1 +1 @@ -openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml # TODO: rename? +openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml From a236d36b1884ea444ac271f960c81074d8f952f1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Oct 2021 09:38:00 +0000 Subject: [PATCH 075/133] move rebuild role back into collection --- ansible/slurm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index de4386e53..766d54f71 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -17,7 +17,7 @@ - openhpc tasks: - import_role: - name: rebuild + name: stackhpc.slurm_openstack_tools.rebuild - name: Setup autoscaling on OpenStack hosts: autoscale From 62b6cf240e5f0772cff0bcda74665128a9719cb6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Oct 2021 10:13:58 +0000 Subject: [PATCH 076/133] move autoscale into collection --- ansible/roles/autoscale/.travis.yml | 29 ------- ansible/roles/autoscale/README.md | 85 ------------------- ansible/roles/autoscale/defaults/main.yml | 37 -------- .../filter_plugins/openhpc_partitions.py | 71 ---------------- ansible/roles/autoscale/meta.empty/main.yml | 52 ------------ ansible/roles/autoscale/tasks/main.yml | 49 ----------- ansible/roles/autoscale/tasks/validate.yml | 5 -- ansible/slurm.yml | 2 +- 8 files changed, 1 insertion(+), 329 deletions(-) delete mode 100644 ansible/roles/autoscale/.travis.yml delete mode 100644 ansible/roles/autoscale/README.md delete mode 100644 ansible/roles/autoscale/defaults/main.yml delete mode 100644 ansible/roles/autoscale/filter_plugins/openhpc_partitions.py delete mode 100644 ansible/roles/autoscale/meta.empty/main.yml delete mode 100644 ansible/roles/autoscale/tasks/main.yml delete mode 100644 ansible/roles/autoscale/tasks/validate.yml diff --git a/ansible/roles/autoscale/.travis.yml b/ansible/roles/autoscale/.travis.yml deleted file mode 100644 index 36bbf6208..000000000 --- a/ansible/roles/autoscale/.travis.yml +++ /dev/null @@ -1,29 +0,0 @@ 
---- -language: python -python: "2.7" - -# Use the new container infrastructure -sudo: false - -# Install ansible -addons: - apt: - packages: - - python-pip - -install: - # Install ansible - - pip install ansible - - # Check ansible version - - ansible --version - - # Create ansible.cfg with correct roles_path - - printf '[defaults]\nroles_path=../' >ansible.cfg - -script: - # Basic role syntax check - - ansible-playbook tests/test.yml -i tests/inventory --syntax-check - -notifications: - webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/ansible/roles/autoscale/README.md b/ansible/roles/autoscale/README.md deleted file mode 100644 index 4e1ac8336..000000000 --- a/ansible/roles/autoscale/README.md +++ /dev/null @@ -1,85 +0,0 @@ -# autoscale - -Support autoscaling nodes on OpenStack clouds, i.e. creating nodes when necessary to service the queue and deleting them when they are no longer needed. - -This is implemented using Slurm's ["elastic computing"](https://slurm.schedmd.com/elastic_computing.html) features which are based on Slurm's [power saving](https://slurm.schedmd.com/power_save.html) features. - -Add the `control` group to the `autoscale` group to activate this functionality in the `ansible/slurm.yml` playbook. Note some role variables are likely to need configuring. By default, node creation and deletion will be logged in the control node's syslog. - -## Requirements - -- Working DNS. -- Active OpenStack credentials on localhost (e.g a sourced `openrc.sh` in the shell running ansible). -- Role `stackhpc.slurm_openstack_tools.pytools`. Installs [slurm-openstack-tools](github.com/stackhpc/slurm-openstack-tools) which provides a venv with the `openstacksdk` and the required resume/suspend scripts. -- Role `stackhpc.openhpc` to create a Slurm cluster. -- This role should be run on the Slurm controller only. - -## Role Variables - -- `autoscale_clouds`: Optional, path to a `clouds.yaml` file containing a single cloud. Defaults to `~/.config/openstack/clouds.yaml`. It is recommended this is an [application credential](https://docs.openstack.org/keystone/latest/user/application_credentials.html). This can be created in Horizon via Identity > Application Credentials > +Create Application Credential. The usual role required is `member`. Using access rules has been found not to work at present. Note that the downloaded credential can be encrpyted using `ansible-vault` to allow it to be committed to source control. It will automatically be decrypted when copied onto the compute nodes. - -The following variables are likely to need tuning for the specific site/instances: -- `autoscale_suspend_time`: Optional, default 120s. See `slurm.conf` parameter [SuspendTime](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SuspendTime). -- `autoscale_suspend_timeout`: Optional, default 30s. See `slurm.conf` parameter [SuspendTimeout](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SuspendTimeout). -- `autoscale_resume_timeout`: Optional, default 300s See `slurm.conf` parameter [ResumeTimeout](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_ResumeTimeout). - -The following variables may need altering for production: -- `autoscale_show_suspended_nodes`: Optional, default `true`. Whether to show suspended/powered-down nodes in `sinfo` etc. See `slurm.conf` parameter [PrivateData - cloud](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_cloud). 
-- `autoscale_debug_powersaving`: Optional, default `true`. Log additional information for powersaving, see `slurm.conf` parameter [DebugFlags - PowerSave](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_PowerSave_2). -- `autoscale_slurmctld_syslog_debug`: Optional, default `info`. Syslog logging level. See `slurm.conf` parameter [SlurmctldSyslogDebug](https://slurm.schedmd.com/archive/slurm-20.11.7/slurm.conf.html#OPT_SlurmctldSyslogDebug). -- `autoscale_suspend_exc_nodes`: Optional. List of nodenames (or Slurm hostlist expressions) to exclude from "power saving", i.e. they will not be autoscaled away. - -## stackhpc.openhpc role variables -This role modifies what the [openhpc_slurm_partitions variable](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) in the `stackhpc.openhpc` role accepts. Partition/group definitions may additionally include: -- `cloud_nodes`: Optional. Slurm hostlist expression (e.g. `'small-[8,10-16]'`) defining names of nodes to be defined in a ["CLOUD" state](https://slurm.schedmd.com/slurm.conf.html#OPT_CLOUD), i.e. not operational when the Slurm control daemon starts. -- `cloud_instances`: Required if `cloud_nodes` is defined. A mapping with keys `flavor`, `image`, `keypair` and `network` defining the OpenStack ID or names of properties for the CLOUD-state instances. - -Partitions/groups defining `cloud_nodes` may or may not also contain non-CLOUD state nodes (i.e. nodes in a matching inventory group). For CLOUD-state nodes, memory and CPU information is retrieved from OpenStack for the specified flavors. The `stackhpc.openhpc` group/partition options `ram_mb` and `ram_multiplier` and role variable `openhpc_ram_multiplier` are handled exactly as for non-CLOUD state nodes. This implies that if CLOUD and non-CLOUD state nodes are mixed in a single group all nodes must be homogenous in terms of processors/memory. - -Some examples are given below. Note that currently monitoring is not enabled for CLOUD-state nodes. - -### Examples - -Below is an example of partition definition, e.g. in `environments//inventory/group_vars/openhpc/overrides.yml`. Not shown here the inventory group `dev_small` contains 2 (non-CLOUD state) nodes. The "small" partition is the default and contains 2 non-CLOUD and 2 CLOUD nodes. The "burst" partition contains only CLOUD-state nodes. - -```yaml -openhpc_cluster_name: dev -general_v1_small: - image: ohpc-compute-210909-1316.qcow2 - flavor: general.v1.small - keypair: centos-at-steveb-ansible - network: stackhpc-ipv4-geneve - -general_v1_medium: - image: ohpc-compute-210909-1316.qcow2 - flavor: general.v1.medium - keypair: centos-at-steveb-ansible - network: stackhpc-ipv4-geneve - -openhpc_slurm_partitions: -- name: small - default: yes - cloud_nodes: dev-small-[2-3] - cloud_instances: "{{ general_v1_small }}" - -- name: burst - default: no - cloud_nodes: 'burst-[0-3]' - cloud_instances: "{{ general_v1_medium }}" -``` - -# Dependencies - -`stackhpc.openhpc` role as described above. - -# Example Playbook - -See ansible/slurm.yml - -# License - -Apache v2 - -# Author Information - -StackHPC Ltd. 
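
> Editor's note: although the role now moves into the collection, the division of labour is unchanged — the `SuspendProgram` only has to delete the instances backing the powered-down nodes. A minimal sketch, assuming instance names match Slurm node names and a `clouds.yaml` entry called `openstack`; this is *not* the installed `slurm-openstack-suspend` script.

```python
# Illustrative SuspendProgram sketch - not the slurm-openstack-tools script
# installed as /opt/slurm-tools/bin/slurm-openstack-suspend.
import subprocess
import sys

import openstack


def main():
    hostlist_expr = sys.argv[1]  # Slurm passes e.g. 'burst-[0-3]'
    nodes = subprocess.run(
        ['scontrol', 'show', 'hostnames', hostlist_expr],
        stdout=subprocess.PIPE, universal_newlines=True, check=True,
    ).stdout.split()
    conn = openstack.connect(cloud='openstack')  # e.g. /etc/openstack/clouds.yaml
    for node in nodes:
        # Assumes the OpenStack server name equals the Slurm node name.
        conn.delete_server(node, wait=False)


if __name__ == '__main__':
    main()
```
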
diff --git a/ansible/roles/autoscale/defaults/main.yml b/ansible/roles/autoscale/defaults/main.yml deleted file mode 100644 index 70916ddc2..000000000 --- a/ansible/roles/autoscale/defaults/main.yml +++ /dev/null @@ -1,37 +0,0 @@ -autoscale_clouds: ~/.config/openstack/clouds.yaml - -# recommended: -autoscale_show_suspended_nodes: true - -# useful for debugging, may want to amend in production: -autoscale_debug_powersaving: true -autoscale_slurmctld_syslog_debug: info # https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldSyslogDebug - -# likely to need tuning: -autoscale_suspend_time: 120 # https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime -autoscale_suspend_timeout: 30 # https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout -autoscale_resume_timeout: 300 # https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout -# autoscale_power_save_interval: 10 # https://slurm.schedmd.com/slurm.conf.html#OPT_power_save_interval -# autoscale_power_save_min_interval: 0 # https://slurm.schedmd.com/slurm.conf.html#OPT_power_save_min_intervals - -# likely to need defining: -autoscale_suspend_exc_nodes: [] - -autoscale_openhpc_config: - SuspendProgram: /opt/slurm-tools/bin/slurm-openstack-suspend - ResumeProgram: /opt/slurm-tools/bin/slurm-openstack-resume - SlurmctldParameters: - - idle_on_node_suspend # https://slurm.schedmd.com/slurm.conf.html#OPT_idle_on_node_suspend - - cloud_dns # https://slurm.schedmd.com/slurm.conf.html#OPT_cloud_dns - # - "power_save_interval={{ autoscale_power_save_interval}}" # seems to break if you set this - # - "power_save_min_interval={{ autoscale_power_save_min_interval }}" - CommunicationParameters: - - NoAddrCache # https://slurm.schedmd.com/slurm.conf.html#OPT_NoAddrCache - PrivateData: "{{ ['cloud'] if autoscale_show_suspended_nodes else [] }}" - DebugFlags: "{{ ['PowerSave'] if autoscale_debug_powersaving else [] }}" # NB: Seems to have disappeared in latest Slurm - SlurmctldSyslogDebug: "{{ autoscale_slurmctld_syslog_debug }}" - SuspendTime: "{{ autoscale_suspend_time }}" - SuspendTimeout: "{{ autoscale_suspend_timeout }}" - ResumeTimeout: "{{ autoscale_resume_timeout }}" - SuspendExcNodes: "{{ autoscale_suspend_exc_nodes | join(',') }}" -# See also TreeWidth but shouldn't needs setting with cloud_dns diff --git a/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py b/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py deleted file mode 100644 index 061fa01ae..000000000 --- a/ansible/roles/autoscale/filter_plugins/openhpc_partitions.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2021 StackHPC Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -# NB: To test this from the repo root run: -# ansible-playbook -i tests/inventory -i tests/inventory-mock-groups tests/filter.yml - -from ansible import errors -import jinja2 -import re - -REQUIRED_INSTANCE_ATTRS=('flavor', 'image', 'keypair', 'network') - -def modify_autoscale_partitions(openhpc_slurm_partitions, flavors, openhpc_ram_multiplier): - """ TODO: docs - - partitions: openhpc_slurm_partitions variable from stackhpc.openhpc role - flavors: List of dicts with info from `openstack flavor show`. Must contain keys 'ram' and 'vcpus' - openhpc_ram_multiplier: openhpc_ram_multiplier variable from stackhpc.openhpc role - - """ - - for part in openhpc_slurm_partitions: - for group in part.get('groups', [part]): - group_name = group.get('name', '') - - if 'cloud_nodes' in group: - if 'cloud_instances' not in group: - raise errors.AnsibleFilterError(f"`openhpc_slurm_partitions` group '{group_name}' specifies 'cloud_nodes' but is missing 'cloud_instances'.") - missing_attrs = ', '.join(set(REQUIRED_INSTANCE_ATTRS).difference(group['cloud_instances'])) - if missing_attrs: - raise errors.AnsibleFilterError(f"`openhpc_slurm_partitions` group '{group_name}' item 'cloud_instances' is missing items: {missing_attrs}.") - cloud_names = group['cloud_nodes'] - # TODO: check for cloud nodes overlapping real ones? - - flavor = [f for f in flavors if f['name'] == group['cloud_instances']['flavor']] - if len(flavor) != 1: - raise errors.AnsibleFilterError(f'expected one flavor matching {group["cloud_instances"]["flavor"]}, found {len(flavor)}: {flavor}') - flavor = flavor[0] - ram_mb = int(flavor['ram'] * group.get('ram_multiplier', openhpc_ram_multiplier)) # ram in flavor in MB, so no units conversion needed - - features = ['%s=%s' % (k, v) for (k, v) in group['cloud_instances'].items()] - cloud_nodes = { - 'NodeName': cloud_names, - 'State':'CLOUD', - 'Features': ','.join(features), - 'CPUs': flavor['vcpus'], - 'RealMemory': group.get('ram_mb', ram_mb) - } - - group['extra_nodes'] = group.get('extra_nodes', []) - group['extra_nodes'].append(cloud_nodes) - - return openhpc_slurm_partitions - -class FilterModule(object): - - def filters(self): - return { - 'modify_autoscale_partitions': modify_autoscale_partitions, - } diff --git a/ansible/roles/autoscale/meta.empty/main.yml b/ansible/roles/autoscale/meta.empty/main.yml deleted file mode 100644 index c572acc9f..000000000 --- a/ansible/roles/autoscale/meta.empty/main.yml +++ /dev/null @@ -1,52 +0,0 @@ -galaxy_info: - author: your name - description: your role description - company: your company (optional) - - # If the issue tracker for your role is not on github, uncomment the - # next line and provide a value - # issue_tracker_url: http://example.com/issue/tracker - - # Choose a valid license ID from https://spdx.org - some suggested licenses: - # - BSD-3-Clause (default) - # - MIT - # - GPL-2.0-or-later - # - GPL-3.0-only - # - Apache-2.0 - # - CC-BY-4.0 - license: license (GPL-2.0-or-later, MIT, etc) - - min_ansible_version: 2.1 - - # If this a Container Enabled role, provide the minimum Ansible Container version. - # min_ansible_container_version: - - # - # Provide a list of supported platforms, and for each platform a list of versions. - # If you don't wish to enumerate all versions for a particular platform, use 'all'. 
- # To view available platforms and versions (or releases), visit: - # https://galaxy.ansible.com/api/v1/platforms/ - # - # platforms: - # - name: Fedora - # versions: - # - all - # - 25 - # - name: SomePlatform - # versions: - # - all - # - 1.0 - # - 7 - # - 99.99 - - galaxy_tags: [] - # List tags for your role here, one per line. A tag is a keyword that describes - # and categorizes the role. Users find roles by searching for tags. Be sure to - # remove the '[]' above, if you add tags to this list. - # - # NOTE: A tag is limited to a single word comprised of alphanumeric characters. - # Maximum 20 tags per role. - -dependencies: [] - # List your role dependencies here, one per line. Be sure to remove the '[]' above, - # if you add dependencies to this list. diff --git a/ansible/roles/autoscale/tasks/main.yml b/ansible/roles/autoscale/tasks/main.yml deleted file mode 100644 index 6283410f4..000000000 --- a/ansible/roles/autoscale/tasks/main.yml +++ /dev/null @@ -1,49 +0,0 @@ ---- -- name: Install slurm packages to create slurm user - import_role: - name: stackhpc.openhpc - tasks_from: install.yml - tags: install - -- name: Create /etc/openstack - file: - path: /etc/openstack - state: directory - owner: slurm - group: slurm - mode: u=rX,go= - -- name: Copy out clouds.yaml - copy: - src: "{{ autoscale_clouds }}" - dest: /etc/openstack/clouds.yaml - mode: u=r,go= - owner: slurm - group: slurm - -- name: Setup Python/Slurm tools - include_role: - name: stackhpc.slurm_openstack_tools.pytools - -- name: Get cloud_node specs - shell: - cmd: "openstack flavor show --format json {{ item.cloud_instances.flavor }}" - delegate_to: localhost - run_once: true - loop: "{{ openhpc_slurm_partitions }}" - when: "'cloud_instances' in item" - register: _os_flavors - become: no - changed_when: false - -- name: Manipulate flavor information - set_fact: - flavor_info: "{{ _os_flavors.results | map(attribute='stdout') | map('from_json') }}" # list of json info - -- name: Modify openhpc_slurm_partitions - set_fact: - openhpc_slurm_partitions: "{{ openhpc_slurm_partitions | modify_autoscale_partitions(flavor_info, openhpc_ram_multiplier) }}" - -- name: Merge autoscale configuration - set_fact: - openhpc_config: "{{ autoscale_openhpc_config | combine(openhpc_config, list_merge='append') }}" diff --git a/ansible/roles/autoscale/tasks/validate.yml b/ansible/roles/autoscale/tasks/validate.yml deleted file mode 100644 index 5a56fa019..000000000 --- a/ansible/roles/autoscale/tasks/validate.yml +++ /dev/null @@ -1,5 +0,0 @@ ---- - -- name: Check openhpc_slurm_partitions information - debug: - msg: "{{ openhpc_slurm_partitions | modify_autoscale_partitions | to_nice_yaml }}" diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 766d54f71..e3051a2f6 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -27,7 +27,7 @@ - openhpc tasks: - import_role: - name: autoscale + name: stackhpc.slurm_openstack_tools.autoscale - name: Setup slurm hosts: openhpc From e140d6a45b50a8c137789232e4bdb84a1aa0e7e6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Oct 2021 10:15:31 +0000 Subject: [PATCH 077/133] remove autoscale validation as needed vars not available --- ansible/validate.yml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/ansible/validate.yml b/ansible/validate.yml index 805f66164..0c0ba8f38 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -2,15 +2,6 @@ # Fail early if configuration is invalid -- name: Validate autoscale configuration - hosts: autoscale - tags: autoscale - tasks: - - 
import_role: - name: autoscale - tasks_from: validate.yml - tags: validate - - name: Validate podman configuration hosts: podman tags: podman From 1132ccd1b29ea92f300de928448f2ed095d79a2c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Oct 2021 11:56:35 +0000 Subject: [PATCH 078/133] fix merging of enable_configless --- environments/common/inventory/group_vars/all/openhpc.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index e587c7884..5cc011fa6 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -27,6 +27,7 @@ openhpc_packages: "{{ openhpc_packages_default + openhpc_packages_extra }}" openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}" openhpc_login_only_nodes: login openhpc_config_default: - SlurmctldParameters: enable_configless + SlurmctldParameters: + - enable_configless openhpc_config_extra: {} openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}" From 4e7b28da38c0b2016df4852c07cb2137ffb16f6e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 7 Oct 2021 15:20:37 +0000 Subject: [PATCH 079/133] avoid multiple package installation tasks when using autoscale --- ansible/slurm.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index e3051a2f6..14f1a0cc2 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -19,6 +19,20 @@ - import_role: name: stackhpc.slurm_openstack_tools.rebuild +- name: Preinstall Slurm packages to create slurm user + # This is an optimisation for speed as it avoids having to do this once for `control` then again for `openhpc` nodes. + hosts: openhpc + become: yes + tags: + - autoscale + - openhpc + - install + tasks: + - import_role: + name: stackhpc.openhpc + tasks_from: install.yml + when: groups.get('autoscale', []) | length > 0 + - name: Setup autoscaling on OpenStack hosts: autoscale become: yes From 99950adac059b5e46f05a2dbd44c8709915f12b5 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 8 Oct 2021 07:57:51 +0000 Subject: [PATCH 080/133] remove in-appliance rebuild role --- ansible/roles/rebuild/README.md | 47 ------------------------- ansible/roles/rebuild/defaults/main.yml | 4 --- ansible/roles/rebuild/tasks/main.yml | 24 ------------- 3 files changed, 75 deletions(-) delete mode 100644 ansible/roles/rebuild/README.md delete mode 100644 ansible/roles/rebuild/defaults/main.yml delete mode 100644 ansible/roles/rebuild/tasks/main.yml diff --git a/ansible/roles/rebuild/README.md b/ansible/roles/rebuild/README.md deleted file mode 100644 index f846bbeb3..000000000 --- a/ansible/roles/rebuild/README.md +++ /dev/null @@ -1,47 +0,0 @@ -rebuild -======= - -Enable the compute nodes to be reimaged from Slurm. To use this functionality add the `control` and `compute` groups to the `rebuild` group. - -Once `ansible/slurm.yml` has run, node(s) can be reimaged using: - - scontrol reboot [ASAP] [nextstate=] reason="rebuild image:" [] - -where: -- `` is the name (if unique) or ID of an image in OpenStack. -- `` is a Slurm hostlist expression defining the nodes to reimage. -- `ASAP` means the rebuild will happen as soon as existing jobs on the node(s) complete - no new jobs will be scheduled on it. -- If `nextstate=...` is not given nodes remain in DRAIN state after the rebuild. 
- -Requirements ------------- - -- This role must be run before the `stackhpc.openhpc` role's `runtime.yml` playbook as it modifies the `openhpc_config` variable. -- OpenStack credentials on the compute nodes, e.g. at `/etc/openstack/clouds.yaml` which are readable by the root user. It is recommended these credentials are an [application credential](https://docs.openstack.org/keystone/latest/user/application_credentials.html). This can be created in Horizon via Identity > Application Credentials > +Create Application Credential. The usual role required is `member`. Using access rules has been found not to work at present. Note that the downloaded credential can be encrpyted using `ansible-vault` to allow commit to source control. It will automatically be decrypted when copied onto the compute nodes. -- An image which when booted adds that node to the Slurm cluster. E.g. see `packer/README.md`. - -Role Variables --------------- - -None normally required. - -Dependencies ------------- - -See above. - -Example Playbook ----------------- - -See `ansible/slurm.yml` - - -License -------- - -Apache v2 - -Author Information ------------------- - -StackHPC Ltd. diff --git a/ansible/roles/rebuild/defaults/main.yml b/ansible/roles/rebuild/defaults/main.yml deleted file mode 100644 index 5e532ef24..000000000 --- a/ansible/roles/rebuild/defaults/main.yml +++ /dev/null @@ -1,4 +0,0 @@ ---- - -rebuild_openhpc_config: - RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild diff --git a/ansible/roles/rebuild/tasks/main.yml b/ansible/roles/rebuild/tasks/main.yml deleted file mode 100644 index 281b46043..000000000 --- a/ansible/roles/rebuild/tasks/main.yml +++ /dev/null @@ -1,24 +0,0 @@ ---- -- block: - - name: Create /etc/openstack - file: - path: /etc/openstack - state: directory - owner: root - group: root - mode: '0400' - - name: Copy out clouds.yaml - copy: - src: "{{ openhpc_rebuild_clouds }}" - dest: /etc/openstack/clouds.yaml - owner: root - group: root - mode: '0400' - - name: Setup slurm tools - include_role: - name: stackhpc.slurm_openstack_tools.pytools - when: openhpc_enable.batch - -- name: Merge rebuild configuration - set_fact: - openhpc_config: "{{ rebuild_openhpc_config | combine(openhpc_config, list_merge='append') }}" From 3f6419d39f2fc33e5dbb8818e37283261014d22e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 8 Oct 2021 07:58:23 +0000 Subject: [PATCH 081/133] fallback to working smslabs partition definition for demo --- .../smslabs/inventory/group_vars/openhpc/partitions.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml index 95fa1d839..92a606279 100755 --- a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml +++ b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml @@ -10,12 +10,21 @@ general_v1_medium: keypair: centos-at-steveb-ansible network: stackhpc-ipv4-geneve + +openhpc_ram_multiplier: 0.90 # TODO: fixme for groups openhpc_slurm_partitions: - name: small default: yes cloud_nodes: dev-small-[2-3] cloud_instances: "{{ general_v1_small }}" + # groups: # TODO: support this + # - name: small + # - name: small_cloud + # ram_multiplier: 0.90 + # cloud_nodes: dev-small-[2-3] + # cloud_instances: "{{ general_v1_small }}" + - name: burst default: no cloud_nodes: 'burst-[0-3]' From ef90759ee58ad00f9a227b35060f4968bc04773c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 12 Oct 2021 07:29:36 +0000 
Subject: [PATCH 082/133] smslabs: demo groups in openhpc_slurm_partitions --- .../inventory/group_vars/openhpc/partitions.yml | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml index 92a606279..f407239bd 100755 --- a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml +++ b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml @@ -14,16 +14,12 @@ general_v1_medium: openhpc_ram_multiplier: 0.90 # TODO: fixme for groups openhpc_slurm_partitions: - name: small - default: yes - cloud_nodes: dev-small-[2-3] - cloud_instances: "{{ general_v1_small }}" - - # groups: # TODO: support this - # - name: small - # - name: small_cloud - # ram_multiplier: 0.90 - # cloud_nodes: dev-small-[2-3] - # cloud_instances: "{{ general_v1_small }}" + groups: + - name: small + - name: small_cloud + ram_multiplier: 0.90 + cloud_nodes: dev-small-[2-3] + cloud_instances: "{{ general_v1_small }}" - name: burst default: no From 2ee93044f8c6abd5c7f7a3783f637b08490ff6fe Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 15 Oct 2021 08:04:40 +0000 Subject: [PATCH 083/133] tidy for PR --- ansible/.gitignore | 2 -- ansible/roles/podman/tasks/validate.yml | 2 +- environments/common/inventory/group_vars/all/openhpc.yml | 3 --- environments/common/inventory/groups | 3 ++- requirements.yml | 2 +- 5 files changed, 4 insertions(+), 8 deletions(-) diff --git a/ansible/.gitignore b/ansible/.gitignore index 0ccc6a74f..15ab96184 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -24,5 +24,3 @@ roles/* !roles/basic_users/** !roles/autoscale/ !roles/autoscale/** -!roles/rebuild/ -!roles/rebuild/** diff --git a/ansible/roles/podman/tasks/validate.yml b/ansible/roles/podman/tasks/validate.yml index edd877cbd..2b7bcb18d 100644 --- a/ansible/roles/podman/tasks/validate.yml +++ b/ansible/roles/podman/tasks/validate.yml @@ -12,4 +12,4 @@ assert: that: ( podman_cidr | ansible.netcommon.network_in_network(item)) == false fail_msg: "Address {{ item }} for {{ inventory_hostname }} is in podman network range {{ podman_cidr }} - set `podman_cidr` to avoid host network address ranges" - loop: "{{ ansible_all_ipv4_addresses }}" + loop: "{{ ansible_all_ipv4_addresses }}" \ No newline at end of file diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 5cc011fa6..f757eb04e 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -15,15 +15,12 @@ openhpc_slurmdbd_mysql_username: slurm openhpc_slurm_control_host: "{{ groups['control'] | first }}" openhpc_slurm_partitions: - name: "compute" - -# TODO: WIP PR to change/deprecate name here: openhpc_packages_default: - slurm-libpmi-ohpc # to allow intel mpi to work properly - ohpc-gnu9-openmpi4-perf-tools # for hpctests - openblas-gnu9-ohpc # for hpctests (HPL) openhpc_packages_extra: [] openhpc_packages: "{{ openhpc_packages_default + openhpc_packages_extra }}" - openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}" openhpc_login_only_nodes: login openhpc_config_default: diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 6811d0cc6..e9072ef18 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -76,7 +76,8 @@ cluster # All hosts to (optionally) run yum update 
on. [autoscale] -# Add control to enable autoscaling +# Add control to enable autoscaling on OpenStack. +# See ansible/collections/ansible_collections/stackhpc/slurm_openstack_tools/roles/autoscale/README.md [block_devices] # Superset of hosts to configure filesystems on - see ansible/roles/block_devices/README.md diff --git a/requirements.yml b/requirements.yml index afa1ab90e..28ea1a948 100644 --- a/requirements.yml +++ b/requirements.yml @@ -2,7 +2,7 @@ roles: - src: stackhpc.nfs - src: https://github.com/stackhpc/ansible-role-openhpc - version: feature/autoscale + version: feature/autoscale # TODO: remove once merged name: stackhpc.openhpc - src: cloudalchemy.node_exporter - src: cloudalchemy.blackbox-exporter From 6476e8280c40ee6484a5cfa42ee48ac5f9d5702e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 15 Oct 2021 10:27:14 +0000 Subject: [PATCH 084/133] fix branch for ansible_collection_slurm_openstack_tools --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index 28ea1a948..07b4ae319 100644 --- a/requirements.yml +++ b/requirements.yml @@ -17,5 +17,5 @@ collections: - name: community.grafana - name: https://github.com/stackhpc/ansible_collection_slurm_openstack_tools type: git - version: main + version: feature/autoscale # TODO: FIXME once merged ... From 2c6c642d7224f942bfd8103ac7433934092f2e37 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 31 Jan 2022 10:06:39 +0000 Subject: [PATCH 085/133] fix up autoscale test environment --- ansible/.gitignore | 2 - environments/smslabs/activate | 23 ---------- environments/smslabs/hooks/post.yml | 19 -------- .../smslabs/inventory/group_vars/all/.gitkeep | 0 .../inventory/group_vars/all/users.yml | 3 -- .../group_vars/openhpc/overrides.yml | 9 ---- .../group_vars/openhpc/partitions.yml | 28 ------------ .../inventory/group_vars/podman/overrides.yml | 1 - .../inventory/group_vars/rebuild/override.yml | 1 - environments/smslabs/inventory/groups | 45 ------------------- environments/smslabs/inventory/hosts | 18 -------- 11 files changed, 149 deletions(-) delete mode 100644 environments/smslabs/activate delete mode 100644 environments/smslabs/hooks/post.yml delete mode 100644 environments/smslabs/inventory/group_vars/all/.gitkeep delete mode 100644 environments/smslabs/inventory/group_vars/all/users.yml delete mode 100644 environments/smslabs/inventory/group_vars/openhpc/overrides.yml delete mode 100755 environments/smslabs/inventory/group_vars/openhpc/partitions.yml delete mode 100644 environments/smslabs/inventory/group_vars/podman/overrides.yml delete mode 100644 environments/smslabs/inventory/group_vars/rebuild/override.yml delete mode 100644 environments/smslabs/inventory/groups delete mode 100755 environments/smslabs/inventory/hosts diff --git a/ansible/.gitignore b/ansible/.gitignore index 15ab96184..bf07028ab 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -22,5 +22,3 @@ roles/* !roles/block_devices/** !roles/basic_users/ !roles/basic_users/** -!roles/autoscale/ -!roles/autoscale/** diff --git a/environments/smslabs/activate b/environments/smslabs/activate deleted file mode 100644 index e74031095..000000000 --- a/environments/smslabs/activate +++ /dev/null @@ -1,23 +0,0 @@ -export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) -echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" - -APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) -export 
PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" - -export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") -echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" - -export TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") -echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root" - -export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") -echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root" - -export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT") -echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" - -if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then - export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg -fi - - diff --git a/environments/smslabs/hooks/post.yml b/environments/smslabs/hooks/post.yml deleted file mode 100644 index 87e637f8c..000000000 --- a/environments/smslabs/hooks/post.yml +++ /dev/null @@ -1,19 +0,0 @@ -- hosts: control - become: true - tasks: - - name: Prevent ansible_user's processes being killed on compute nodes at job completion - replace: - path: /etc/slurm/slurm.epilog.clean - regexp: 'if \[ \$SLURM_UID -lt 100 \] ; then' - replace: "if [[ $SLURM_UID -lt 100 || $SLURM_JOB_USER -eq {{ ansible_user }} ]] ; then" - - name: Make a /home/test directory for centos - file: - path: /home/test - state: directory - owner: centos - group: centos - - name: Install ewatch - git: - repo: https://github.com/sjpb/ewatch.git - dest: /home/test/ewatch - force: yes diff --git a/environments/smslabs/inventory/group_vars/all/.gitkeep b/environments/smslabs/inventory/group_vars/all/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/environments/smslabs/inventory/group_vars/all/users.yml b/environments/smslabs/inventory/group_vars/all/users.yml deleted file mode 100644 index 3de23fee4..000000000 --- a/environments/smslabs/inventory/group_vars/all/users.yml +++ /dev/null @@ -1,3 +0,0 @@ -users: - - name: stig - pubkey: ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDTXW9Y0r0cTW/ay6FEIlIejuRPZZ+ObzR08XFzp4x8ecCW//WSZAjo1fD/u/CQGoV552QCjWj+tP9Cy9UcsI3WLAx+n4i48oHqvpRLO1CLgJazNpQ8Bc7GveF78xhD5EoL/IpcAFKIad3CU7gb8HLRJIQpER1OsY96T9ViKe9lDWy8mk2WjoYoU1niMtmbs549Gqwl+fGNdBVUsGS5k7Xy4D/0T8TitthN3W6UbMHXVCUzdd3v9TNl7hgyeq6dCvRS6g8Vmlp2Ia0NLkrWF+bqP2RhRuqWOj71PD3auPAq0hF4yqdW9awMuZY8vBesnjE3iC2h34jvFkYaolGTfDZUa48s7yBTpjWoINUSbg105KJoPg55lWwXj58MMhvyX6hyYl3oJMiG3eq48jAAA4n80EKK4IBXrg/yjpuoDiNGqVe9hDAoT94j3+s8Smz5rohsKQVS+l266eyjo2VLUVR2NaOnw5fW86MEUyTicvHjSN4xOCGjSK2j1k6hXT7EiuM= stig@nrel-jumphost.novalocal \ No newline at end of file diff --git a/environments/smslabs/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs/inventory/group_vars/openhpc/overrides.yml deleted file mode 100644 index 4bed1823f..000000000 --- a/environments/smslabs/inventory/group_vars/openhpc/overrides.yml +++ /dev/null @@ -1,9 +0,0 @@ -openhpc_extra_packages: - - git - - python3 -openhpc_extra_config_overrides: - SlurmctldDebug: debug - SlurmdDebug: debug - -#example_list: "{{ example_list + [7] }}" # FAILS - recursive -#example_dict: "{{ example_dict | combine({c: 4} ) }}" # FAILS - recursive diff --git a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml b/environments/smslabs/inventory/group_vars/openhpc/partitions.yml deleted file mode 100755 index f407239bd..000000000 --- a/environments/smslabs/inventory/group_vars/openhpc/partitions.yml +++ /dev/null @@ -1,28 +0,0 @@ -general_v1_small: - image: ohpc-compute-210909-1316.qcow2 - flavor: general.v1.small - keypair: 
centos-at-steveb-ansible - network: stackhpc-ipv4-geneve - -general_v1_medium: - image: ohpc-compute-210909-1316.qcow2 - flavor: general.v1.medium - keypair: centos-at-steveb-ansible - network: stackhpc-ipv4-geneve - - -openhpc_ram_multiplier: 0.90 # TODO: fixme for groups -openhpc_slurm_partitions: -- name: small - groups: - - name: small - - name: small_cloud - ram_multiplier: 0.90 - cloud_nodes: dev-small-[2-3] - cloud_instances: "{{ general_v1_small }}" - -- name: burst - default: no - cloud_nodes: 'burst-[0-3]' - cloud_instances: "{{ general_v1_medium }}" - diff --git a/environments/smslabs/inventory/group_vars/podman/overrides.yml b/environments/smslabs/inventory/group_vars/podman/overrides.yml deleted file mode 100644 index 18e712665..000000000 --- a/environments/smslabs/inventory/group_vars/podman/overrides.yml +++ /dev/null @@ -1 +0,0 @@ -podman_cidr: 192.168.1.0/24 diff --git a/environments/smslabs/inventory/group_vars/rebuild/override.yml b/environments/smslabs/inventory/group_vars/rebuild/override.yml deleted file mode 100644 index 178ab7848..000000000 --- a/environments/smslabs/inventory/group_vars/rebuild/override.yml +++ /dev/null @@ -1 +0,0 @@ -pytools_gitref: feature/autoscale diff --git a/environments/smslabs/inventory/groups b/environments/smslabs/inventory/groups deleted file mode 100644 index 6fde43dfa..000000000 --- a/environments/smslabs/inventory/groups +++ /dev/null @@ -1,45 +0,0 @@ -[nfs:children] -openhpc - -[hpctests:children] -# Login node to use for running mpi-based testing. -login - -[mysql:children] -control - -[prometheus:children] -control - -[grafana:children] -control - -[alertmanager:children] -control - -[node_exporter:children] -# disabled node_exporter on control to avoid noise in syslog -login -compute - -[opendistro:children] -control - -[kibana:children] -control - -[slurm_stats:children] -control - -[filebeat:children] -slurm_stats - -[rebuild:children] -control -compute - -[update:children] -cluster - -[autoscale:children] -control diff --git a/environments/smslabs/inventory/hosts b/environments/smslabs/inventory/hosts deleted file mode 100755 index 5ab90d3b8..000000000 --- a/environments/smslabs/inventory/hosts +++ /dev/null @@ -1,18 +0,0 @@ -[all:vars] -ansible_user=centos -openhpc_cluster_name=dev - -[control] -dev-control ansible_host=10.0.3.182 server_networks='{"stackhpc-ipv4-geneve":["10.0.3.182"]}' - -[login] -dev-login-1 ansible_host=10.0.1.54 server_networks='{"stackhpc-ipv4-geneve":["10.0.1.54"]}' - -[compute] -dev-small-0 ansible_host=10.0.1.217 server_networks='{"stackhpc-ipv4-geneve":["10.0.1.217"]}' -dev-small-1 ansible_host=10.0.3.253 server_networks='{"stackhpc-ipv4-geneve":["10.0.3.253"]}' - -# Define groups for slurm parititions: -[dev_small] -dev-small-0 -dev-small-1 From 12e7de430f6afb8c3f42a33ad43113f9cf6427bf Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 31 Jan 2022 10:07:55 +0000 Subject: [PATCH 086/133] change autoscale group to be openstack-specific --- .github/workflows/smslabs.yml | 2 +- ansible/slurm.yml | 16 ++++------------ environments/common/layouts/everything | 3 +++ environments/smslabs-example/inventory/groups | 3 +++ 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/.github/workflows/smslabs.yml b/.github/workflows/smslabs.yml index 00217cee3..5cdd23ea8 100644 --- a/.github/workflows/smslabs.yml +++ b/.github/workflows/smslabs.yml @@ -95,7 +95,7 @@ jobs: env: ANSIBLE_FORCE_COLOR: True - - name: Build control and compute images + - name: Build login and compute images run: | . 
venv/bin/activate . environments/smslabs-example/activate diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 14f1a0cc2..f00fec45d 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -24,20 +24,20 @@ hosts: openhpc become: yes tags: - - autoscale + - openstack_autoscale - openhpc - install tasks: - import_role: name: stackhpc.openhpc tasks_from: install.yml - when: groups.get('autoscale', []) | length > 0 + when: groups.get('openstack_autoscale', []) | length > 0 - name: Setup autoscaling on OpenStack - hosts: autoscale + hosts: openstack_autoscale become: yes tags: - - autoscale + - openstack_autoscale - openhpc tasks: - import_role: @@ -49,14 +49,6 @@ tags: - openhpc tasks: - # - name: Add CentOS 8.3 Vault repo for OpenHPC hwloc dependency - # # NB: REMOVE THIS once OpenHPC works on CentOS 8.4 - # yum_repository: - # name: vault - # file: CentOS-Linux-Vault8.3 - # description: CentOS 8.3 packages from Vault - # baseurl: https://vault.centos.org/8.3.2011/BaseOS/$basearch/os/ - # gpgkey: file:///etc/pki/rpm-gpg/RPM-GPG-KEY-centosofficial - import_role: name: stackhpc.openhpc diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index c6a47453e..e120aa1f7 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -41,3 +41,6 @@ cluster [basic_users] # Add `openhpc` group to add Slurm users via creation of users on each node. + +[openstack_autoscale] +# Add `control` group to configure autoscaling on OpenStack clouds. diff --git a/environments/smslabs-example/inventory/groups b/environments/smslabs-example/inventory/groups index 2e5efeb67..b721d0e93 100644 --- a/environments/smslabs-example/inventory/groups +++ b/environments/smslabs-example/inventory/groups @@ -38,3 +38,6 @@ compute [update:children] cluster + +[openstack_autoscale:children] +control From c0370d6ff81bfaf0f0c1c6d31afe114eeb6ad46d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 17 Feb 2022 10:17:38 +0000 Subject: [PATCH 087/133] fix security groups in smslabs for idempotency --- environments/smslabs-example/terraform/nodes.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/smslabs-example/terraform/nodes.tf b/environments/smslabs-example/terraform/nodes.tf index 3bca7fb36..4b849f0bb 100644 --- a/environments/smslabs-example/terraform/nodes.tf +++ b/environments/smslabs-example/terraform/nodes.tf @@ -6,7 +6,7 @@ resource "openstack_compute_instance_v2" "control" { flavor_name = var.control_node.flavor key_pair = var.key_pair config_drive = true - security_groups = ["default", "ssh"] + security_groups = ["default", "SSH"] network { uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id # ensures nodes not created till subnet created @@ -24,7 +24,7 @@ resource "openstack_compute_instance_v2" "login" { flavor_name = each.value.flavor key_pair = var.key_pair config_drive = true - security_groups = ["default", "ssh"] + security_groups = ["default", "SSH"] network { uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id From 2192ab7648f340fdecb14ce84ebd04e162ee1ab1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 17 Feb 2022 17:08:22 +0000 Subject: [PATCH 088/133] fix smslabs env not being configless, add checks for this --- ansible/slurm.yml | 6 ++++++ .../inventory/group_vars/openhpc/overrides.yml | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index f00fec45d..a60918070 100644 --- a/ansible/slurm.yml +++ 
b/ansible/slurm.yml @@ -49,6 +49,12 @@ tags: - openhpc tasks: + - assert: + that: "'enable_configless' in openhpc_config.SlurmctldParameters | default([])" + fail_msg: | + 'enable_configless' not found in openhpc_config.SlurmctldParameters - is variable openhpc_config overridden? + Additional slurm.conf parameters should be provided using variable openhpc_config_extra. + success_msg: Checked Slurm will be configured for configless operation - import_role: name: stackhpc.openhpc diff --git a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml index 3585ae073..4cf1e5bc1 100644 --- a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml +++ b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml @@ -1,4 +1,4 @@ -openhpc_config: +openhpc_config_extra: SlurmctldDebug: debug SlurmdDebug: debug openhpc_slurm_partitions: From 0290115ffb400233a059ff8825b47777309eb07a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 17 Feb 2022 17:34:59 +0000 Subject: [PATCH 089/133] WIP for smslabs autoscale --- .github/workflows/smslabs.yml | 2 ++ environments/common/inventory/group_vars/all/openhpc.yml | 1 + 2 files changed, 3 insertions(+) diff --git a/.github/workflows/smslabs.yml b/.github/workflows/smslabs.yml index 5cdd23ea8..05a392aa0 100644 --- a/.github/workflows/smslabs.yml +++ b/.github/workflows/smslabs.yml @@ -132,3 +132,5 @@ jobs: OS_CLOUD: openstack TF_VAR_cluster_name: ci${{ github.run_id }} if: ${{ success() || cancelled() }} + +# TODO: delete images! \ No newline at end of file diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index f757eb04e..0b3912622 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -28,3 +28,4 @@ openhpc_config_default: - enable_configless openhpc_config_extra: {} openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}" +openhpc_ram_multiplier: 0.90 # TODO: DOCS: needs to be available to stackhpc.slurm_openstack_tools.autoscale role, plus lowered a bit to cope with autoscale problems From 7c33f1c1669294c787b5693a81a4637e9fe2a0f7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 21 Feb 2022 19:04:56 +0000 Subject: [PATCH 090/133] add basic autoscale to CI --- .github/workflows/smslabs.yml | 16 ++++++++-------- .../smslabs-example/ci/reimage-compute.yml | 8 ++++++++ environments/smslabs-example/hooks/post.yml | 17 ++++++++++++++--- .../inventory/group_vars/openhpc/overrides.yml | 6 ++++++ 4 files changed, 36 insertions(+), 11 deletions(-) diff --git a/.github/workflows/smslabs.yml b/.github/workflows/smslabs.yml index 05a392aa0..aed4aa433 100644 --- a/.github/workflows/smslabs.yml +++ b/.github/workflows/smslabs.yml @@ -87,14 +87,6 @@ jobs: env: ANSIBLE_FORCE_COLOR: True - - name: Run MPI-based tests - run: | - . venv/bin/activate - . environments/smslabs-example/activate - ansible-playbook -vv ansible/adhoc/hpctests.yml - env: - ANSIBLE_FORCE_COLOR: True - - name: Build login and compute images run: | . venv/bin/activate @@ -121,6 +113,14 @@ jobs: ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml env: OS_CLOUD: openstack + + - name: Run MPI-based tests, triggering autoscaling + run: | + . venv/bin/activate + . 
environments/smslabs-example/activate + ansible-playbook -vv ansible/adhoc/hpctests.yml + env: + ANSIBLE_FORCE_COLOR: True - name: Delete infrastructure run: | diff --git a/environments/smslabs-example/ci/reimage-compute.yml b/environments/smslabs-example/ci/reimage-compute.yml index 3efa4e47c..42989800a 100644 --- a/environments/smslabs-example/ci/reimage-compute.yml +++ b/environments/smslabs-example/ci/reimage-compute.yml @@ -14,6 +14,14 @@ set_fact: compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" + - name: Add compute image ID to autoscale definition + copy: + dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/openhpc/autoscale.yml" + content: | + openhpc_autoscale_image: {{ compute_build.artifact_id }} + delegate_to: localhost + - meta: end_here + - name: Request compute node rebuild via Slurm shell: cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1] diff --git a/environments/smslabs-example/hooks/post.yml b/environments/smslabs-example/hooks/post.yml index 68303c5cb..e764f99fc 100644 --- a/environments/smslabs-example/hooks/post.yml +++ b/environments/smslabs-example/hooks/post.yml @@ -4,11 +4,22 @@ tasks: - block: - name: Run sinfo - shell: 'sinfo --noheader --format="%N %P %a %l %D %t"' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name + shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name register: sinfo changed_when: false - name: Check nodes have expected slurm state assert: - that: "(sinfo.stdout_lines[0] | split)[1:] == ['small*', 'up', '60-00:00:00', '2', 'idle']" # don't know what instance names are as have CI run ID in them - fail_msg: "sinfo output not as expected: {{ sinfo.stdout }}" + that: sinfo.stdout_lines == expected_sinfo + fail_msg: | + sinfo output not as expected: + actual: + {{ sinfo.stdout_lines }} + expected: + {{ expected_sinfo }} + + vars: + expected_sinfo: + - "{{ openhpc_cluster_name }}-compute-[0-1] small* up 60-00:00:00 2 idle" + - "{{ openhpc_cluster_name }}-compute-[2-3] small* up 60-00:00:00 2 idle~" + when: "'builder' not in group_names" # won't have a slurm control daemon when in build diff --git a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml index 4cf1e5bc1..a8d82a032 100644 --- a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml +++ b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml @@ -3,3 +3,9 @@ openhpc_config_extra: SlurmdDebug: debug openhpc_slurm_partitions: - name: small + cloud_nodes: autoscale-compute-[2-3] + cloud_instances: # TODO: can we somehow check these when templating?? + flavor: general.v1.tiny + image: "{{ openhpc_autoscale_image | default('IMAGE_PLACEHOLDER') }}" + keypair: slurm-app-ci + network: "{{ server_networks.keys() | first }}" # TODO: bit hacky?? 
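Note on the check added to hooks/post.yml above: it leans on Slurm's power-saving state suffixes. The two statically provisioned nodes are expected to report "idle", while the cloud_nodes added to the partition (presumably templated into slurm.conf as State=CLOUD node definitions by the stackhpc.openhpc role) sit powered down and so report "idle~". A quick manual check on the login node, sketched with the same commands the playbooks already use (the node name below is illustrative only):

    # powered-down cloud nodes carry a '~' suffix on their sinfo state, e.g. 'idle~'
    sinfo --noheader --format="%N %P %a %l %D %t" | sort
    # inspect one cloud node in detail; expect CLOUD and power-down flags in the State field
    scontrol show node autoscale-compute-2 | grep -i state
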
From e353d4ebc685a3f0ee9e36cb44971a0242c634e1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 22 Feb 2022 12:31:59 +0000 Subject: [PATCH 091/133] WIP CI for autoscale on Arcus --- environments/arcus/.gitignore | 3 + environments/arcus/activate | 23 +++++++ environments/arcus/ansible.cfg | 15 ++++ environments/arcus/bastion_fingerprint | 3 + environments/arcus/builder.pkrvars.hcl | 7 ++ environments/arcus/ci/reimage-compute.yml | 45 ++++++++++++ environments/arcus/ci/reimage-login.yml | 23 +++++++ environments/arcus/hooks/post.yml | 25 +++++++ .../arcus/inventory/group_vars/all/.gitkeep | 0 .../inventory/group_vars/all/bastion.yml | 1 + .../group_vars/openhpc/overrides.yml | 11 +++ .../inventory/group_vars/podman/overrides.yml | 1 + environments/arcus/inventory/groups | 43 ++++++++++++ .../arcus/terraform/.terraform.lock.hcl | 40 +++++++++++ environments/arcus/terraform/getfaults.py | 32 +++++++++ environments/arcus/terraform/inventory.tf | 13 ++++ environments/arcus/terraform/inventory.tpl | 24 +++++++ environments/arcus/terraform/main.tf | 12 ++++ environments/arcus/terraform/network.tf | 22 ++++++ environments/arcus/terraform/nodes.tf | 52 ++++++++++++++ environments/arcus/terraform/terraform.tfvars | 1 + environments/arcus/terraform/variables.tf | 68 +++++++++++++++++++ 22 files changed, 464 insertions(+) create mode 100644 environments/arcus/.gitignore create mode 100644 environments/arcus/activate create mode 100644 environments/arcus/ansible.cfg create mode 100644 environments/arcus/bastion_fingerprint create mode 100644 environments/arcus/builder.pkrvars.hcl create mode 100644 environments/arcus/ci/reimage-compute.yml create mode 100644 environments/arcus/ci/reimage-login.yml create mode 100644 environments/arcus/hooks/post.yml create mode 100644 environments/arcus/inventory/group_vars/all/.gitkeep create mode 100644 environments/arcus/inventory/group_vars/all/bastion.yml create mode 100644 environments/arcus/inventory/group_vars/openhpc/overrides.yml create mode 100644 environments/arcus/inventory/group_vars/podman/overrides.yml create mode 100644 environments/arcus/inventory/groups create mode 100644 environments/arcus/terraform/.terraform.lock.hcl create mode 100755 environments/arcus/terraform/getfaults.py create mode 100644 environments/arcus/terraform/inventory.tf create mode 100644 environments/arcus/terraform/inventory.tpl create mode 100644 environments/arcus/terraform/main.tf create mode 100644 environments/arcus/terraform/network.tf create mode 100644 environments/arcus/terraform/nodes.tf create mode 100644 environments/arcus/terraform/terraform.tfvars create mode 100644 environments/arcus/terraform/variables.tf diff --git a/environments/arcus/.gitignore b/environments/arcus/.gitignore new file mode 100644 index 000000000..12b21a20f --- /dev/null +++ b/environments/arcus/.gitignore @@ -0,0 +1,3 @@ +secrets.yml +.vscode +hosts diff --git a/environments/arcus/activate b/environments/arcus/activate new file mode 100644 index 000000000..e74031095 --- /dev/null +++ b/environments/arcus/activate @@ -0,0 +1,23 @@ +export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) +echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" + +APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) +export PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" + +export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") +echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" + +export 
TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") +echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root" + +export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") +echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root" + +export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT") +echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" + +if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then + export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg +fi + + diff --git a/environments/arcus/ansible.cfg b/environments/arcus/ansible.cfg new file mode 100644 index 000000000..d7a3783fa --- /dev/null +++ b/environments/arcus/ansible.cfg @@ -0,0 +1,15 @@ +[defaults] +any_errors_fatal = True +stdout_callback = debug +stderr_callback = debug +gathering = smart +forks = 30 +host_key_checking = False +inventory = ../common/inventory,inventory +collections_path = ../../ansible/collections +roles_path = ../../ansible/roles +filter_plugins = ../../ansible/filter_plugins + +[ssh_connection] +ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +pipelining = True diff --git a/environments/arcus/bastion_fingerprint b/environments/arcus/bastion_fingerprint new file mode 100644 index 000000000..fd848d48e --- /dev/null +++ b/environments/arcus/bastion_fingerprint @@ -0,0 +1,3 @@ +|1|D3dYOn3TW5it2JYvKXYaA9A2n20=|kfvkBf/81L0icRH0E8A4ZEsudmY= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINNuXZkH7ppkTGNGKzmGEvAnvlLO2D+YtlJw1m3P16FV +|1|pU67hESNkulTilRIRjZYcU8t3g8=|y02bfKJ4CKmFBYiJ9AEGJLCS/LU= ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDkOPL7fQiLFrg+/mDbff+jr+mQkI8pAkS5aBKOaknKuzTGrxILO5XSbyTJxyEwIKzZHBCUH2w99yv3oCqiphYp7iLLdPKl98RRnAXneJ1mo7nJfaTOSj5FGFf/AeHFZFa18B8zZrfFOOTGdEXeQpcik6R2A0/o4ZGE9rUg/dEoLQpFp8z+XRhsbNWgZ4a63oWrt02p+zdXPZ+Plir56j0qyQXoOo/BjEoLHs0aah61jfEOcJAcgpTrev/vdhBqJCgEXkf6AhiKidTnQxw7G/5C/BKtJbtuBWMgWZKcDf/uCzRkXaHNEggcJi1e6jvpUkvPLUfpRnNiBWLzehw3xZL4NicMM6D2TU0TSpB+UfEOLR0jyhCGKRQQN4jnj8ll0h+JBE6a0KnyKG+B5mXrD7THYu848jXUmBnxIaeor/NUItKEnCL0hzvAygOnniBN6uvtszSJHoGe8WbChLYJcoH3mOQTUH0k9RhXSEe90gSlLfRQInU+uzf2/qc6pffcKuc= +|1|v9pdzNOESRau/eRtyGgbjfpMTig=|E4zJdI7HOay6AQQonFqb0OtQpw0= ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBCB8R1BElOz4geGfCcb/ObF5n4Par+g9AaXQW5FU1ccgnPA59uJeOEALPeXAgJijVOhwqTdIkIoWYWeGdlud9Wc= diff --git a/environments/arcus/builder.pkrvars.hcl b/environments/arcus/builder.pkrvars.hcl new file mode 100644 index 000000000..059f49afb --- /dev/null +++ b/environments/arcus/builder.pkrvars.hcl @@ -0,0 +1,7 @@ +flavor = "general.v1.tiny" +networks = ["c245901d-6b84-4dc4-b02b-eec0fb6122b2"] # stackhpc-ci-geneve +source_image_name = "Rocky-8-GenericCloud-8.5-20211114.2.x86_64" +ssh_keypair_name = "slurm-app-ci" +security_groups = ["default", "SSH"] +ssh_bastion_host = "185.45.78.150" +ssh_bastion_username = "slurm-app-ci" diff --git a/environments/arcus/ci/reimage-compute.yml b/environments/arcus/ci/reimage-compute.yml new file mode 100644 index 000000000..42989800a --- /dev/null +++ b/environments/arcus/ci/reimage-compute.yml @@ -0,0 +1,45 @@ +# Reimage compute nodes via Slurm with latest packer-build images + +- hosts: login[0] + become: no + tasks: + - name: Read packer build manifest + set_fact: + manifest: "{{ lookup('file', manifest_path) | from_json }}" + vars: + manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" + delegate_to: localhost + + - name: Get latest compute image build 
+ set_fact: + compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" + + - name: Add compute image ID to autoscale definition + copy: + dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/openhpc/autoscale.yml" + content: | + openhpc_autoscale_image: {{ compute_build.artifact_id }} + delegate_to: localhost + - meta: end_here + + - name: Request compute node rebuild via Slurm + shell: + cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1] + become: true + + - name: Check compute node rebuild completed + shell: + cmd: openstack server show {{ item }} --format value -c image + register: openstack_server + loop: "{{ groups['compute'] }}" + retries: 5 + delay: 30 + until: compute_build.artifact_id in openstack_server.stdout + delegate_to: localhost + +- hosts: compute + become: no + gather_facts: no + tasks: + - name: Wait for nodes to boot + wait_for_connection: diff --git a/environments/arcus/ci/reimage-login.yml b/environments/arcus/ci/reimage-login.yml new file mode 100644 index 000000000..f76f6e8d0 --- /dev/null +++ b/environments/arcus/ci/reimage-login.yml @@ -0,0 +1,23 @@ +# Reimage login nodes via OpenStack + +- hosts: login + become: no + tasks: + - name: Read packer build manifest + set_fact: + manifest: "{{ lookup('file', manifest_path) | from_json }}" + vars: + manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" + delegate_to: localhost + + - name: Get latest login image build + set_fact: + login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}" + + - name: Reimage node via openstack + shell: + cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ login_build.artifact_id }}" + delegate_to: localhost + + - name: Wait for connection + wait_for_connection: diff --git a/environments/arcus/hooks/post.yml b/environments/arcus/hooks/post.yml new file mode 100644 index 000000000..e764f99fc --- /dev/null +++ b/environments/arcus/hooks/post.yml @@ -0,0 +1,25 @@ +- hosts: login + become: no + gather_facts: false + tasks: + - block: + - name: Run sinfo + shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name + register: sinfo + changed_when: false + - name: Check nodes have expected slurm state + assert: + that: sinfo.stdout_lines == expected_sinfo + fail_msg: | + sinfo output not as expected: + actual: + {{ sinfo.stdout_lines }} + expected: + {{ expected_sinfo }} + + vars: + expected_sinfo: + - "{{ openhpc_cluster_name }}-compute-[0-1] small* up 60-00:00:00 2 idle" + - "{{ openhpc_cluster_name }}-compute-[2-3] small* up 60-00:00:00 2 idle~" + + when: "'builder' not in group_names" # won't have a slurm control daemon when in build diff --git a/environments/arcus/inventory/group_vars/all/.gitkeep b/environments/arcus/inventory/group_vars/all/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/environments/arcus/inventory/group_vars/all/bastion.yml b/environments/arcus/inventory/group_vars/all/bastion.yml new file mode 100644 index 000000000..e6d5f7699 --- /dev/null +++ b/environments/arcus/inventory/group_vars/all/bastion.yml @@ -0,0 +1 @@ +ansible_ssh_common_args: '-o ProxyCommand="ssh slurm-app-ci@128.232.222.183 -W %h:%p"' diff --git 
a/environments/arcus/inventory/group_vars/openhpc/overrides.yml b/environments/arcus/inventory/group_vars/openhpc/overrides.yml new file mode 100644 index 000000000..a8d82a032 --- /dev/null +++ b/environments/arcus/inventory/group_vars/openhpc/overrides.yml @@ -0,0 +1,11 @@ +openhpc_config_extra: + SlurmctldDebug: debug + SlurmdDebug: debug +openhpc_slurm_partitions: +- name: small + cloud_nodes: autoscale-compute-[2-3] + cloud_instances: # TODO: can we somehow check these when templating?? + flavor: general.v1.tiny + image: "{{ openhpc_autoscale_image | default('IMAGE_PLACEHOLDER') }}" + keypair: slurm-app-ci + network: "{{ server_networks.keys() | first }}" # TODO: bit hacky?? diff --git a/environments/arcus/inventory/group_vars/podman/overrides.yml b/environments/arcus/inventory/group_vars/podman/overrides.yml new file mode 100644 index 000000000..fc90e22f4 --- /dev/null +++ b/environments/arcus/inventory/group_vars/podman/overrides.yml @@ -0,0 +1 @@ +podman_cidr: 192.168.1.0/24 # default podman network range clashes with stackhpc-ipv4-geneve-subnet diff --git a/environments/arcus/inventory/groups b/environments/arcus/inventory/groups new file mode 100644 index 000000000..b721d0e93 --- /dev/null +++ b/environments/arcus/inventory/groups @@ -0,0 +1,43 @@ +[nfs:children] +openhpc + +[hpctests:children] +# Login node to use for running mpi-based testing. +login + +[mysql:children] +control + +[prometheus:children] +control + +[grafana:children] +control + +[alertmanager:children] +control + +[node_exporter:children] +cluster + +[opendistro:children] +control + +[kibana:children] +control + +[slurm_stats:children] +control + +[filebeat:children] +slurm_stats + +[rebuild:children] +control +compute + +[update:children] +cluster + +[openstack_autoscale:children] +control diff --git a/environments/arcus/terraform/.terraform.lock.hcl b/environments/arcus/terraform/.terraform.lock.hcl new file mode 100644 index 000000000..6f55d88a6 --- /dev/null +++ b/environments/arcus/terraform/.terraform.lock.hcl @@ -0,0 +1,40 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/local" { + version = "2.1.0" + hashes = [ + "h1:EYZdckuGU3n6APs97nS2LxZm3dDtGqyM4qaIvsmac8o=", + "zh:0f1ec65101fa35050978d483d6e8916664b7556800348456ff3d09454ac1eae2", + "zh:36e42ac19f5d68467aacf07e6adcf83c7486f2e5b5f4339e9671f68525fc87ab", + "zh:6db9db2a1819e77b1642ec3b5e95042b202aee8151a0256d289f2e141bf3ceb3", + "zh:719dfd97bb9ddce99f7d741260b8ece2682b363735c764cac83303f02386075a", + "zh:7598bb86e0378fd97eaa04638c1a4c75f960f62f69d3662e6d80ffa5a89847fe", + "zh:ad0a188b52517fec9eca393f1e2c9daea362b33ae2eb38a857b6b09949a727c1", + "zh:c46846c8df66a13fee6eff7dc5d528a7f868ae0dcf92d79deaac73cc297ed20c", + "zh:dc1a20a2eec12095d04bf6da5321f535351a594a636912361db20eb2a707ccc4", + "zh:e57ab4771a9d999401f6badd8b018558357d3cbdf3d33cc0c4f83e818ca8e94b", + "zh:ebdcde208072b4b0f8d305ebf2bfdc62c926e0717599dcf8ec2fd8c5845031c3", + "zh:ef34c52b68933bedd0868a13ccfd59ff1c820f299760b3c02e008dc95e2ece91", + ] +} + +provider "registry.terraform.io/terraform-provider-openstack/openstack" { + version = "1.43.0" + hashes = [ + "h1:1QwVWBH4Boye5QDpB3YG/WE2grF3m9c3afX3tcGv/A8=", + "zh:08af4c5b2136a95cd16789c5cb4945ad288d02b91d06018c74ed14b97b335857", + "zh:2c99eaf2a86ae1ab8186226c1be5395d45a91d93f4e65cc8731afbc736aea4e9", + "zh:3f0226ce9737e7e47822d009419a78477d5286bf30896b85cbe3af0cf9ff7c90", + "zh:40811116da43f6cab91016150462da847413b188b3e7060759a37dcd0ebbfb8d", + "zh:447678224527eeb9c8a145ad8aaec6c0e032e2e789d68708aeb3e2b488fd7e63", + "zh:49adbdcd112edd29bb71b03e5e0060c63c2904358cd34f199dcd606b63521a0e", + "zh:51054fed551149aa2962ec4192dc8a7f3b25ef170d161a4e7f68e0ea099c4c78", + "zh:635181a35d224433a2adecdf72c01e0d1873929a51ebea8730d512ecc5b5c9e0", + "zh:71752e30bfac741e8040f52d3722d3c804e7edc022e989d7ebe47537e80a6267", + "zh:75262bc0087d0f119066d156d9e5c139db93695b551c794af711f3c2b03b2fa3", + "zh:aa640e5f357c08dffce9cfbc35251a81851c2c9696d9752f5e5201d330a84627", + "zh:bbb6164d149891b340d3293ef3a26d80738f9ef5025863e30b36c3854eea0149", + "zh:d2c08432fe39c8dfb3ec929e181bb8235b0073944d96811f4654ca578fb090b1", + ] +} diff --git a/environments/arcus/terraform/getfaults.py b/environments/arcus/terraform/getfaults.py new file mode 100755 index 000000000..b3e9ecf35 --- /dev/null +++ b/environments/arcus/terraform/getfaults.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +""" Display any failure messages for openstack servers in current terraform state. 
""" +import json, subprocess + +def get_openstack_server(uuid): + """ Return json with openstack server info """ + cmd = ['openstack', 'server', 'show', uuid, '-f', 'json'] + server_txt = subprocess.run(cmd, stdout=subprocess.PIPE, check=True, universal_newlines=True).stdout + return json.loads(server_txt) + +def read_tf_state(): + """ Return json from terraform state in current directory """ + + with open('terraform.tfstate') as statef: + state = json.load(statef) + return state + +def check_server_errors(): + tf_state = read_tf_state() + for resource in tf_state['resources']: + if resource['type'] == 'openstack_compute_instance_v2': + for instance in resource['instances']: + name = instance['attributes']['name'] + uuid = instance['attributes']['id'] + + server = get_openstack_server(uuid) + failure_msg = server.get('fault', {}).get('message') + if failure_msg: + print(name, uuid, failure_msg) + +if __name__ == '__main__': + check_server_errors() \ No newline at end of file diff --git a/environments/arcus/terraform/inventory.tf b/environments/arcus/terraform/inventory.tf new file mode 100644 index 000000000..e85c5dcc4 --- /dev/null +++ b/environments/arcus/terraform/inventory.tf @@ -0,0 +1,13 @@ +resource "local_file" "hosts" { + content = templatefile("${path.module}/inventory.tpl", + { + "cluster_name": var.cluster_name + "control": openstack_compute_instance_v2.control, + "logins": openstack_compute_instance_v2.login, + "computes": openstack_compute_instance_v2.compute, + "compute_types": var.compute_types, + "compute_nodes": var.compute_nodes, + }, + ) + filename = "../inventory/hosts" +} diff --git a/environments/arcus/terraform/inventory.tpl b/environments/arcus/terraform/inventory.tpl new file mode 100644 index 000000000..2a3ec5c1d --- /dev/null +++ b/environments/arcus/terraform/inventory.tpl @@ -0,0 +1,24 @@ +[all:vars] +ansible_user=rocky +openhpc_cluster_name=${cluster_name} + +[control] +${control.name} ansible_host=${[for n in control.network: n.fixed_ip_v4 if n.access_network][0]} server_networks='${jsonencode({for net in control.network: net.name => [ net.fixed_ip_v4 ] })}' + +[login] +%{ for login in logins ~} +${login.name} ansible_host=${[for n in login.network: n.fixed_ip_v4 if n.access_network][0]} server_networks='${jsonencode({for net in login.network: net.name => [ net.fixed_ip_v4 ] })}' +%{ endfor ~} + +[compute] +%{ for compute in computes ~} +${compute.name} ansible_host=${[for n in compute.network: n.fixed_ip_v4 if n.access_network][0]} server_networks='${jsonencode({for net in compute.network: net.name => [ net.fixed_ip_v4 ] })}' +%{ endfor ~} + +# Define groups for slurm parititions: +%{~ for type_name, type_descr in compute_types} +[${cluster_name}_${type_name}] + %{~ for node_name, node_type in compute_nodes ~} + %{~ if node_type == type_name }${cluster_name}-${node_name}%{ endif } + %{~ endfor ~} +%{ endfor ~} diff --git a/environments/arcus/terraform/main.tf b/environments/arcus/terraform/main.tf new file mode 100644 index 000000000..03beb0adc --- /dev/null +++ b/environments/arcus/terraform/main.tf @@ -0,0 +1,12 @@ +terraform { + required_version = ">= 0.14" + required_providers { + openstack = { + source = "terraform-provider-openstack/openstack" + } + } +} + +provider "openstack" { + cloud = "openstack" +} diff --git a/environments/arcus/terraform/network.tf b/environments/arcus/terraform/network.tf new file mode 100644 index 000000000..f20a84659 --- /dev/null +++ b/environments/arcus/terraform/network.tf @@ -0,0 +1,22 @@ +data 
"openstack_networking_network_v2" "cluster_net" { + name = var.cluster_net +} + +data "openstack_networking_subnet_v2" "cluster_subnet" { + + name = var.cluster_subnet +} + +resource "openstack_networking_port_v2" "rdma" { + + for_each = toset(concat(keys(var.login_nodes), keys(var.compute_nodes))) + + name = "${var.cluster_name}-${each.key}" + network_id = data.openstack_networking_network_v2.cluster_net.id + admin_state_up = "true" + + binding { + vnic_type = "direct" + } + +} diff --git a/environments/arcus/terraform/nodes.tf b/environments/arcus/terraform/nodes.tf new file mode 100644 index 000000000..2ca09a205 --- /dev/null +++ b/environments/arcus/terraform/nodes.tf @@ -0,0 +1,52 @@ + +resource "openstack_compute_instance_v2" "control" { + + name = "${var.cluster_name}-control" + image_name = var.control_node.image + flavor_name = var.control_node.flavor + key_pair = var.key_pair + config_drive = true + security_groups = ["default", "SSH"] + + network { + uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id # ensures nodes not created till subnet created + access_network = true + } + +} + +resource "openstack_compute_instance_v2" "login" { + + for_each = var.login_nodes + + name = "${var.cluster_name}-${each.key}" + image_name = each.value.image + flavor_name = each.value.flavor + key_pair = var.key_pair + config_drive = true + security_groups = ["default", "SSH"] + + network { + port = openstack_networking_port_v2.rdma[each.key].id + access_network = true + } + +} + +resource "openstack_compute_instance_v2" "compute" { + + for_each = var.compute_nodes + + name = "${var.cluster_name}-${each.key}" + image_name = lookup(var.compute_images, each.key, var.compute_types[each.value].image) + flavor_name = var.compute_types[each.value].flavor + key_pair = var.key_pair + config_drive = true + security_groups = ["default", "ssh"] + + network { + uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id + access_network = true + } + +} diff --git a/environments/arcus/terraform/terraform.tfvars b/environments/arcus/terraform/terraform.tfvars new file mode 100644 index 000000000..a2aec503a --- /dev/null +++ b/environments/arcus/terraform/terraform.tfvars @@ -0,0 +1 @@ +cluster_name = "autoscale" diff --git a/environments/arcus/terraform/variables.tf b/environments/arcus/terraform/variables.tf new file mode 100644 index 000000000..2f26a6154 --- /dev/null +++ b/environments/arcus/terraform/variables.tf @@ -0,0 +1,68 @@ +variable "cluster_name" { + type = string + description = "Name for cluster, used as prefix for resources" +} + +variable "cluster_net" { + type = string + description = "Name of existing cluster network" + default = "WCDC-iLab-60" +} + +variable "cluster_subnet" { + type = string + description = "Name of existing cluster subnet" + default = "WCDC-iLab-60" +} + +variable "key_pair" { + type = string + description = "Name of an existing keypair in OpenStack" + default = "slurm-app-ci" +} + +variable "control_node" { + type = map + description = "Mapping {flavor: flavor_name, image: image_name_or_id }" + default = { + flavor: "vm.alaska.cpu.general.tiny" + image: "RockyLinux-8.5-20211114.2" + } +} + +variable "login_nodes" { + type = map + description = "Mapping defining login nodes: key -> (str) nodename suffix, value -> mapping {flavor: flavor_name, image: image_name_or_id }" + default = { + login-0: { + flavor: "vm.alaska.cpu.general.tiny" + image: "RockyLinux-8.5-20211114.2" + } + } +} + +variable "compute_types" { + type = map + description = "Mapping 
defining types of compute nodes: key -> (str) name of type, value -> mapping {flavor: flavor_name, image: image_name_or_id }" + default = { + tiny: { + flavor: "vm.alaska.cpu.general.tiny" + image: "RockyLinux-8.5-20211114.2" + } + } +} + +variable "compute_nodes" { + type = map(string) + description = "Mapping of compute nodename suffix -> key in compute_types" + default = { + compute-0: "tiny" + compute-1: "tiny" + } +} + +variable "compute_images" { + type = map(string) + default = {} + description = "Mapping to override compute images from compute_types: key ->(str) node name, value -> (str) image name" +} From b53b72dc39f38b0aebadeb406d6ae5884c1f36f7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 28 Feb 2022 11:51:00 +0000 Subject: [PATCH 092/133] add squid proxy on arcus CI --- environments/arcus/hooks/pre.yml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 environments/arcus/hooks/pre.yml diff --git a/environments/arcus/hooks/pre.yml b/environments/arcus/hooks/pre.yml new file mode 100644 index 000000000..ec26e6978 --- /dev/null +++ b/environments/arcus/hooks/pre.yml @@ -0,0 +1,8 @@ +- hosts: all + become: true + tasks: + - name: Configure yum proxy + lineinfile: + path: /etc/yum.conf + regexp: '^proxy=http://10\.60\.102\.179:3128' + line: 'proxy=http://10.60.102.179:3128' From 22747557ffd1bc28e9ccd56075ba128b2b89d9ed Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 28 Feb 2022 11:51:24 +0000 Subject: [PATCH 093/133] wip arcus CI (TF only) --- .github/workflows/arcus.yml | 141 ++++++++++++++++++ environments/arcus/bastion_fingerprint | 6 +- environments/arcus/terraform/network.tf | 2 +- environments/arcus/terraform/nodes.tf | 6 +- environments/arcus/terraform/terraform.tfvars | 1 - environments/arcus/terraform/variables.tf | 9 ++ 6 files changed, 157 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/arcus.yml delete mode 100644 environments/arcus/terraform/terraform.tfvars diff --git a/.github/workflows/arcus.yml b/.github/workflows/arcus.yml new file mode 100644 index 000000000..159d8ad22 --- /dev/null +++ b/.github/workflows/arcus.yml @@ -0,0 +1,141 @@ + +name: Test on Arcus OpenStack in rcp-cloud-portal-demo +on: + push: + branches: + - main + - ci/arcus + pull_request: +concurrency: rcp-cloud-portal_demo # openstack project +jobs: + deploy: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + + - name: Install ansible etc + run: dev/setup-env.sh + + - name: Install terraform + uses: hashicorp/setup-terraform@v1 + + - name: Initialise terraform + run: terraform init + working-directory: ${{ github.workspace }}/environments/smslabs-example/terraform + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "$CLOUDS_YAML" > ~/.config/openstack/clouds.yaml + shell: bash + env: + CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }} + + - name: Provision infrastructure + id: provision + run: | + . venv/bin/activate + . environments/arcus/activate + cd $APPLIANCES_ENVIRONMENT_ROOT/terraform + terraform apply -auto-approve + env: + TF_VAR_cluster_name: ci${{ github.run_id }} + + +# - name: Setup ssh +# run: | +# set -x +# mkdir ~/.ssh +# echo "$SSH_KEY" > ~/.ssh/id_rsa +# chmod 0600 ~/.ssh/id_rsa +# env: +# SSH_KEY: ${{ secrets.ARCUS_SSH_KEY }} + +# - name: Add bastion's ssh key to known_hosts +# run: cat environments/smslabs-example/bastion_fingerprint >> ~/.ssh/known_hosts +# shell: bash + + + + + +# - name: Get server provisioning failure messages +# id: provision_failure +# run: | +# . venv/bin/activate +# . 
environments/smslabs-example/activate +# cd $APPLIANCES_ENVIRONMENT_ROOT/terraform +# echo "::set-output name=messages::$(./getfaults.py)" +# env: +# OS_CLOUD: openstack +# TF_VAR_cluster_name: ci${{ github.run_id }} +# if: always() && steps.provision.outcome == 'failure' + +# - name: Delete infrastructure if failed due to lack of hosts +# run: | +# . venv/bin/activate +# . environments/smslabs-example/activate +# cd $APPLIANCES_ENVIRONMENT_ROOT/terraform +# terraform destroy -auto-approve +# env: +# OS_CLOUD: openstack +# TF_VAR_cluster_name: ci${{ github.run_id }} +# if: ${{ always() && steps.provision.outcome == 'failure' && contains('not enough hosts available', steps.provision_failure.messages) }} + +# - name: Configure infrastructure +# run: | +# . venv/bin/activate +# . environments/smslabs-example/activate +# ansible all -m wait_for_connection +# ansible-playbook ansible/adhoc/generate-passwords.yml +# ansible-playbook -vv ansible/site.yml +# env: +# ANSIBLE_FORCE_COLOR: True + +# - name: Build login and compute images +# run: | +# . venv/bin/activate +# . environments/smslabs-example/activate +# cd packer +# PACKER_LOG=1 PACKER_LOG_PATH=build.log packer build -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl +# env: +# OS_CLOUD: openstack + +# - name: Reimage compute nodes via slurm and check cluster still up +# run: | +# . venv/bin/activate +# . environments/smslabs-example/activate +# ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-compute.yml +# ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml +# env: +# OS_CLOUD: openstack + +# - name: Reimage login nodes via openstack and check cluster still up +# run: | +# . venv/bin/activate +# . environments/smslabs-example/activate +# ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-login.yml +# ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml +# env: +# OS_CLOUD: openstack + +# - name: Run MPI-based tests, triggering autoscaling +# run: | +# . venv/bin/activate +# . environments/smslabs-example/activate +# ansible-playbook -vv ansible/adhoc/hpctests.yml +# env: +# ANSIBLE_FORCE_COLOR: True + +# - name: Delete infrastructure +# run: | +# . venv/bin/activate +# . environments/smslabs-example/activate +# cd $APPLIANCES_ENVIRONMENT_ROOT/terraform +# terraform destroy -auto-approve +# env: +# OS_CLOUD: openstack +# TF_VAR_cluster_name: ci${{ github.run_id }} +# if: ${{ success() || cancelled() }} + +# # TODO: delete images! 
\ No newline at end of file diff --git a/environments/arcus/bastion_fingerprint b/environments/arcus/bastion_fingerprint index fd848d48e..713026452 100644 --- a/environments/arcus/bastion_fingerprint +++ b/environments/arcus/bastion_fingerprint @@ -1,3 +1,3 @@ -|1|D3dYOn3TW5it2JYvKXYaA9A2n20=|kfvkBf/81L0icRH0E8A4ZEsudmY= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINNuXZkH7ppkTGNGKzmGEvAnvlLO2D+YtlJw1m3P16FV -|1|pU67hESNkulTilRIRjZYcU8t3g8=|y02bfKJ4CKmFBYiJ9AEGJLCS/LU= ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDkOPL7fQiLFrg+/mDbff+jr+mQkI8pAkS5aBKOaknKuzTGrxILO5XSbyTJxyEwIKzZHBCUH2w99yv3oCqiphYp7iLLdPKl98RRnAXneJ1mo7nJfaTOSj5FGFf/AeHFZFa18B8zZrfFOOTGdEXeQpcik6R2A0/o4ZGE9rUg/dEoLQpFp8z+XRhsbNWgZ4a63oWrt02p+zdXPZ+Plir56j0qyQXoOo/BjEoLHs0aah61jfEOcJAcgpTrev/vdhBqJCgEXkf6AhiKidTnQxw7G/5C/BKtJbtuBWMgWZKcDf/uCzRkXaHNEggcJi1e6jvpUkvPLUfpRnNiBWLzehw3xZL4NicMM6D2TU0TSpB+UfEOLR0jyhCGKRQQN4jnj8ll0h+JBE6a0KnyKG+B5mXrD7THYu848jXUmBnxIaeor/NUItKEnCL0hzvAygOnniBN6uvtszSJHoGe8WbChLYJcoH3mOQTUH0k9RhXSEe90gSlLfRQInU+uzf2/qc6pffcKuc= -|1|v9pdzNOESRau/eRtyGgbjfpMTig=|E4zJdI7HOay6AQQonFqb0OtQpw0= ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBCB8R1BElOz4geGfCcb/ObF5n4Par+g9AaXQW5FU1ccgnPA59uJeOEALPeXAgJijVOhwqTdIkIoWYWeGdlud9Wc= +|1|BwhEZQPqvZcdf9Phmh2mTPmIivU=|bHi1Nf8dYI8z1C+qsqQFPAty1xA= ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQChxwhZggdwj55gNzfDBzah0G8IeTPQjgMZrpboxp2BO4J+o1iZSwDj+2fqyhBGTE43vCJR13uEygz49XIy+t17qBNwHz4fVVR7jdMNymtbZoOsq9oAoBdGEICHrMzQsYZmT9+Wt74ZP2PKOOn+a+f2vg7YdeSy1UhT08iJlbXwCx56fCQnMJMOnZM9MXVLd4NUFN1TeOCIBQHwRiMJyJ7S7CdUKpyUqHOG85peKiPJ07C0RZ/W5HkYKqltwtvPGQd262p5eLC9j3nhOYSG2meRV8yTxYz3lDIPDx0+189CZ5NaxFSPCgqSYA24zavhPVLQqoct7nd7fcEw9JiTs+abZC6GckCONSHDLM+iRtWC/i5u21ZZDLxM9SIqPI96cYFszGeqyZoXxS5qPaIDHbQNAEqJp9ygNXgh9vuBo7E+aWYbFDTG0RuvW02fbmFfZw2/yXIr37+cQX+GPOnkfIRuHE3Hx5eN8C04v+BMrAfK2minawhG3A2ONJs9LI6QoeE= +|1|whGSPLhKW4xt/7PWOZ1treg3PtA=|F5gwV8j0JYWDzjb6DvHHaqO+sxs= ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBCpCG881Gt3dr+nuVIC2uGEQkeVwG6WDdS1WcCoxXC7AG+Oi5bfdqtf4IfeLpWmeuEaAaSFH48ODFr76ViygSjU= +|1|0V6eQ1FKO5NMKaHZeNFbw62mrJs=|H1vuGTbbtZD2MEgZxQf1PXPk+yU= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEnOtYByM3s2qvRT8SS1sn5z5sbwjzb1alm0B3emPcHJ \ No newline at end of file diff --git a/environments/arcus/terraform/network.tf b/environments/arcus/terraform/network.tf index f20a84659..68f7c92a0 100644 --- a/environments/arcus/terraform/network.tf +++ b/environments/arcus/terraform/network.tf @@ -9,7 +9,7 @@ data "openstack_networking_subnet_v2" "cluster_subnet" { resource "openstack_networking_port_v2" "rdma" { - for_each = toset(concat(keys(var.login_nodes), keys(var.compute_nodes))) + for_each = toset(concat(["control"], keys(var.login_nodes), keys(var.compute_nodes), var.cloud_nodes)) name = "${var.cluster_name}-${each.key}" network_id = data.openstack_networking_network_v2.cluster_net.id diff --git a/environments/arcus/terraform/nodes.tf b/environments/arcus/terraform/nodes.tf index 2ca09a205..db4f4762d 100644 --- a/environments/arcus/terraform/nodes.tf +++ b/environments/arcus/terraform/nodes.tf @@ -9,7 +9,7 @@ resource "openstack_compute_instance_v2" "control" { security_groups = ["default", "SSH"] network { - uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id # ensures nodes not created till subnet created + port = openstack_networking_port_v2.rdma["control"].id access_network = true } @@ -42,10 +42,10 @@ resource "openstack_compute_instance_v2" "compute" { flavor_name = var.compute_types[each.value].flavor key_pair = 
var.key_pair config_drive = true - security_groups = ["default", "ssh"] + security_groups = ["default", "SSH"] network { - uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id + port = openstack_networking_port_v2.rdma[each.key].id access_network = true } diff --git a/environments/arcus/terraform/terraform.tfvars b/environments/arcus/terraform/terraform.tfvars deleted file mode 100644 index a2aec503a..000000000 --- a/environments/arcus/terraform/terraform.tfvars +++ /dev/null @@ -1 +0,0 @@ -cluster_name = "autoscale" diff --git a/environments/arcus/terraform/variables.tf b/environments/arcus/terraform/variables.tf index 2f26a6154..7ac093dca 100644 --- a/environments/arcus/terraform/variables.tf +++ b/environments/arcus/terraform/variables.tf @@ -61,6 +61,15 @@ variable "compute_nodes" { } } +variable "cloud_nodes" { + type = list(string) + description = "Cloud nodename suffixes to precreate RDMA-capable ports" + default = [ + "compute-2", + "compute-3", + ] +} + variable "compute_images" { type = map(string) default = {} From 177a6053b06f990618261bb0f434d5a34b69ddb1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 28 Feb 2022 11:57:21 +0000 Subject: [PATCH 094/133] add TF provisioners to arcus CI --- .github/workflows/arcus.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/arcus.yml b/.github/workflows/arcus.yml index 159d8ad22..00abcecd2 100644 --- a/.github/workflows/arcus.yml +++ b/.github/workflows/arcus.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - + - name: Install ansible etc run: dev/setup-env.sh @@ -37,6 +37,7 @@ jobs: . venv/bin/activate . environments/arcus/activate cd $APPLIANCES_ENVIRONMENT_ROOT/terraform + terraform init terraform apply -auto-approve env: TF_VAR_cluster_name: ci${{ github.run_id }} From 2c73a91b4fc0f205ac50250f310ae6674eb7ca47 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 28 Feb 2022 12:30:35 +0000 Subject: [PATCH 095/133] add direct deploy, reimage, destroy jobs to arcus workflow --- .github/workflows/arcus.yml | 230 +++++++++++++++++++++++++++++++----- 1 file changed, 200 insertions(+), 30 deletions(-) diff --git a/.github/workflows/arcus.yml b/.github/workflows/arcus.yml index 00abcecd2..67e756523 100644 --- a/.github/workflows/arcus.yml +++ b/.github/workflows/arcus.yml @@ -42,22 +42,168 @@ jobs: env: TF_VAR_cluster_name: ci${{ github.run_id }} + - name: Upload terraform state + uses: actions/upload-artifact@v2 + with: + name: tfstate + path: ${{ env.APPLIANCES_ENVIRONMENT_ROOT }}/terraform/terraform.tfstate -# - name: Setup ssh -# run: | -# set -x -# mkdir ~/.ssh -# echo "$SSH_KEY" > ~/.ssh/id_rsa -# chmod 0600 ~/.ssh/id_rsa -# env: -# SSH_KEY: ${{ secrets.ARCUS_SSH_KEY }} + - name: Upload templated inventory hosts file + uses: actions/upload-artifact@v2 + with: + name: inventory_hosts + path: ${{ env.APPLIANCES_ENVIRONMENT_ROOT }}/inventory/hosts + + imagebuild: + needs: deploy + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + + - name: Install ansible etc + run: dev/setup-env.sh + + - name: Download templated inventory hosts file + uses: actions/download-artifact@v2 + with: + name: inventory_hosts + path: ${{ env.APPLIANCES_ENVIRONMENT_ROOT }}/inventory + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "$CLOUDS_YAML" > ~/.config/openstack/clouds.yaml + shell: bash + env: + CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }} + + - name: Setup ssh + run: | + set -x + mkdir ~/.ssh + echo "$SSH_KEY" > 
~/.ssh/id_rsa + chmod 0600 ~/.ssh/id_rsa + env: + SSH_KEY: ${{ secrets.ARCUS_SSH_KEY }} + + - name: Add bastion's ssh key to known_hosts + run: cat environments/smslabs-example/bastion_fingerprint >> ~/.ssh/known_hosts + shell: bash + + - name: Build login and compute images + run: | + . venv/bin/activate + . environments/smslabs-example/activate + cd packer + PACKER_LOG=1 PACKER_LOG_PATH=build.log packer build -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl + env: + OS_CLOUD: openstack + + directdeploy: + needs: deploy + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + + - name: Install ansible etc + run: dev/setup-env.sh + + - name: Download templated inventory hosts file + uses: actions/download-artifact@v2 + with: + name: inventory_hosts + path: ${{ env.APPLIANCES_ENVIRONMENT_ROOT }}/inventory + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "$CLOUDS_YAML" > ~/.config/openstack/clouds.yaml + shell: bash + env: + CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }} + + - name: Setup ssh + run: | + set -x + mkdir ~/.ssh + echo "$SSH_KEY" > ~/.ssh/id_rsa + chmod 0600 ~/.ssh/id_rsa + env: + SSH_KEY: ${{ secrets.ARCUS_SSH_KEY }} + + - name: Add bastion's ssh key to known_hosts + run: cat environments/smslabs-example/bastion_fingerprint >> ~/.ssh/known_hosts + shell: bash + + - name: Configure infrastructure + run: | + . venv/bin/activate + . environments/smslabs-example/activate + ansible all -m wait_for_connection + ansible-playbook ansible/adhoc/generate-passwords.yml + ansible-playbook -vv ansible/site.yml + env: + ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack + + reimage: + needs: + - deploy + - directdeploy + - imagebuild + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + + - name: Install ansible etc + run: dev/setup-env.sh -# - name: Add bastion's ssh key to known_hosts -# run: cat environments/smslabs-example/bastion_fingerprint >> ~/.ssh/known_hosts -# shell: bash + - name: Download templated inventory hosts file + uses: actions/download-artifact@v2 + with: + name: inventory_hosts + path: ${{ env.APPLIANCES_ENVIRONMENT_ROOT }}/inventory + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "$CLOUDS_YAML" > ~/.config/openstack/clouds.yaml + shell: bash + env: + CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }} + - name: Setup ssh + run: | + set -x + mkdir ~/.ssh + echo "$SSH_KEY" > ~/.ssh/id_rsa + chmod 0600 ~/.ssh/id_rsa + env: + SSH_KEY: ${{ secrets.ARCUS_SSH_KEY }} + + - name: Add bastion's ssh key to known_hosts + run: cat environments/smslabs-example/bastion_fingerprint >> ~/.ssh/known_hosts + shell: bash + - name: Reimage compute nodes via slurm and check cluster still up + run: | + . venv/bin/activate + . environments/smslabs-example/activate + ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-compute.yml + ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml + env: + ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack + + - name: Reimage login nodes via openstack and check cluster still up + run: | + . venv/bin/activate + . 
environments/smslabs-example/activate + ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-login.yml + ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml + env: + ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack # - name: Get server provisioning failure messages @@ -83,15 +229,6 @@ jobs: # TF_VAR_cluster_name: ci${{ github.run_id }} # if: ${{ always() && steps.provision.outcome == 'failure' && contains('not enough hosts available', steps.provision_failure.messages) }} -# - name: Configure infrastructure -# run: | -# . venv/bin/activate -# . environments/smslabs-example/activate -# ansible all -m wait_for_connection -# ansible-playbook ansible/adhoc/generate-passwords.yml -# ansible-playbook -vv ansible/site.yml -# env: -# ANSIBLE_FORCE_COLOR: True # - name: Build login and compute images # run: | @@ -128,15 +265,48 @@ jobs: # env: # ANSIBLE_FORCE_COLOR: True -# - name: Delete infrastructure -# run: | -# . venv/bin/activate -# . environments/smslabs-example/activate -# cd $APPLIANCES_ENVIRONMENT_ROOT/terraform -# terraform destroy -auto-approve -# env: -# OS_CLOUD: openstack -# TF_VAR_cluster_name: ci${{ github.run_id }} -# if: ${{ success() || cancelled() }} + destroy: + needs: + - deploy + - directdeploy + - reimage + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + + - name: Install ansible etc + run: dev/setup-env.sh + + - name: Install terraform + uses: hashicorp/setup-terraform@v1 + + - name: Initialise terraform + run: terraform init + working-directory: ${{ github.workspace }}/environments/smslabs-example/terraform + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "$CLOUDS_YAML" > ~/.config/openstack/clouds.yaml + shell: bash + env: + CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }} + + - name: Download terraform state + uses: actions/download-artifact@v2 + with: + name: tfstate + path: ${{ env.APPLIANCES_ENVIRONMENT_ROOT }}/terraform + + - name: Delete infrastructure + run: | + . venv/bin/activate + . environments/smslabs-example/activate + cd $APPLIANCES_ENVIRONMENT_ROOT/terraform + terraform destroy -auto-approve + env: + OS_CLOUD: openstack + TF_VAR_cluster_name: ci${{ github.run_id }} + if: ${{ success() || cancelled() }} # # TODO: delete images! 
\ No newline at end of file From 2f8b5d4fafbff9d14af1d47637b1474318f2d7af Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 28 Feb 2022 13:48:06 +0000 Subject: [PATCH 096/133] use simplier arcus CI workflow --- .github/workflows/arcus.yml | 275 +++++++----------------------------- 1 file changed, 50 insertions(+), 225 deletions(-) diff --git a/.github/workflows/arcus.yml b/.github/workflows/arcus.yml index 67e756523..6135a5c7e 100644 --- a/.github/workflows/arcus.yml +++ b/.github/workflows/arcus.yml @@ -8,11 +8,24 @@ on: pull_request: concurrency: rcp-cloud-portal_demo # openstack project jobs: - deploy: + arcus: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 + - name: Setup ssh + run: | + set -x + mkdir ~/.ssh + echo "$SSH_KEY" > ~/.ssh/id_rsa + chmod 0600 ~/.ssh/id_rsa + env: + SSH_KEY: ${{ secrets.ARCUS_SSH_KEY }} + + - name: Add bastion's ssh key to known_hosts + run: cat environments/arcus/bastion_fingerprint >> ~/.ssh/known_hosts + shell: bash + - name: Install ansible etc run: dev/setup-env.sh @@ -21,7 +34,7 @@ jobs: - name: Initialise terraform run: terraform init - working-directory: ${{ github.workspace }}/environments/smslabs-example/terraform + working-directory: ${{ github.workspace }}/environments/arcus/terraform - name: Write clouds.yaml run: | @@ -30,278 +43,90 @@ jobs: shell: bash env: CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }} - + - name: Provision infrastructure id: provision run: | . venv/bin/activate . environments/arcus/activate cd $APPLIANCES_ENVIRONMENT_ROOT/terraform - terraform init terraform apply -auto-approve env: + OS_CLOUD: openstack TF_VAR_cluster_name: ci${{ github.run_id }} - - - name: Upload terraform state - uses: actions/upload-artifact@v2 - with: - name: tfstate - path: ${{ env.APPLIANCES_ENVIRONMENT_ROOT }}/terraform/terraform.tfstate - - - name: Upload templated inventory hosts file - uses: actions/upload-artifact@v2 - with: - name: inventory_hosts - path: ${{ env.APPLIANCES_ENVIRONMENT_ROOT }}/inventory/hosts - - imagebuild: - needs: deploy - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - - name: Install ansible etc - run: dev/setup-env.sh - - - name: Download templated inventory hosts file - uses: actions/download-artifact@v2 - with: - name: inventory_hosts - path: ${{ env.APPLIANCES_ENVIRONMENT_ROOT }}/inventory - - name: Write clouds.yaml - run: | - mkdir -p ~/.config/openstack/ - echo "$CLOUDS_YAML" > ~/.config/openstack/clouds.yaml - shell: bash - env: - CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }} - - - name: Setup ssh - run: | - set -x - mkdir ~/.ssh - echo "$SSH_KEY" > ~/.ssh/id_rsa - chmod 0600 ~/.ssh/id_rsa - env: - SSH_KEY: ${{ secrets.ARCUS_SSH_KEY }} - - - name: Add bastion's ssh key to known_hosts - run: cat environments/smslabs-example/bastion_fingerprint >> ~/.ssh/known_hosts - shell: bash - - - name: Build login and compute images + - name: Get server provisioning failure messages + id: provision_failure run: | . venv/bin/activate - . environments/smslabs-example/activate - cd packer - PACKER_LOG=1 PACKER_LOG_PATH=build.log packer build -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl + . 
environments/arcus/activate + cd $APPLIANCES_ENVIRONMENT_ROOT/terraform + echo "::set-output name=messages::$(./getfaults.py)" env: OS_CLOUD: openstack - - directdeploy: - needs: deploy - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - - name: Install ansible etc - run: dev/setup-env.sh - - - name: Download templated inventory hosts file - uses: actions/download-artifact@v2 - with: - name: inventory_hosts - path: ${{ env.APPLIANCES_ENVIRONMENT_ROOT }}/inventory - - - name: Write clouds.yaml - run: | - mkdir -p ~/.config/openstack/ - echo "$CLOUDS_YAML" > ~/.config/openstack/clouds.yaml - shell: bash - env: - CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }} - - - name: Setup ssh + TF_VAR_cluster_name: ci${{ github.run_id }} + if: always() && steps.provision.outcome == 'failure' + + - name: Delete infrastructure if failed due to lack of hosts run: | - set -x - mkdir ~/.ssh - echo "$SSH_KEY" > ~/.ssh/id_rsa - chmod 0600 ~/.ssh/id_rsa + . venv/bin/activate + . environments/arcus/activate + cd $APPLIANCES_ENVIRONMENT_ROOT/terraform + terraform destroy -auto-approve env: - SSH_KEY: ${{ secrets.ARCUS_SSH_KEY }} - - - name: Add bastion's ssh key to known_hosts - run: cat environments/smslabs-example/bastion_fingerprint >> ~/.ssh/known_hosts - shell: bash + OS_CLOUD: openstack + TF_VAR_cluster_name: ci${{ github.run_id }} + if: ${{ always() && steps.provision.outcome == 'failure' && contains('not enough hosts available', steps.provision_failure.messages) }} - name: Configure infrastructure run: | . venv/bin/activate - . environments/smslabs-example/activate + . environments/arcus/activate ansible all -m wait_for_connection ansible-playbook ansible/adhoc/generate-passwords.yml ansible-playbook -vv ansible/site.yml env: ANSIBLE_FORCE_COLOR: True - OS_CLOUD: openstack - - reimage: - needs: - - deploy - - directdeploy - - imagebuild - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - - name: Install ansible etc - run: dev/setup-env.sh - - - name: Download templated inventory hosts file - uses: actions/download-artifact@v2 - with: - name: inventory_hosts - path: ${{ env.APPLIANCES_ENVIRONMENT_ROOT }}/inventory - - name: Write clouds.yaml - run: | - mkdir -p ~/.config/openstack/ - echo "$CLOUDS_YAML" > ~/.config/openstack/clouds.yaml - shell: bash - env: - CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }} - - - name: Setup ssh + - name: Build login and compute images run: | - set -x - mkdir ~/.ssh - echo "$SSH_KEY" > ~/.ssh/id_rsa - chmod 0600 ~/.ssh/id_rsa + . venv/bin/activate + . environments/arcus/activate + cd packer + PACKER_LOG=1 PACKER_LOG_PATH=build.log packer build -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl env: - SSH_KEY: ${{ secrets.ARCUS_SSH_KEY }} - - - name: Add bastion's ssh key to known_hosts - run: cat environments/smslabs-example/bastion_fingerprint >> ~/.ssh/known_hosts - shell: bash + OS_CLOUD: openstack - name: Reimage compute nodes via slurm and check cluster still up run: | . venv/bin/activate - . environments/smslabs-example/activate + . environments/arcus/activate ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-compute.yml ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml env: - ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - name: Reimage login nodes via openstack and check cluster still up run: | . venv/bin/activate - . environments/smslabs-example/activate + . 
environments/arcus/activate ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-login.yml ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml env: - ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - -# - name: Get server provisioning failure messages -# id: provision_failure -# run: | -# . venv/bin/activate -# . environments/smslabs-example/activate -# cd $APPLIANCES_ENVIRONMENT_ROOT/terraform -# echo "::set-output name=messages::$(./getfaults.py)" -# env: -# OS_CLOUD: openstack -# TF_VAR_cluster_name: ci${{ github.run_id }} -# if: always() && steps.provision.outcome == 'failure' - -# - name: Delete infrastructure if failed due to lack of hosts -# run: | -# . venv/bin/activate -# . environments/smslabs-example/activate -# cd $APPLIANCES_ENVIRONMENT_ROOT/terraform -# terraform destroy -auto-approve -# env: -# OS_CLOUD: openstack -# TF_VAR_cluster_name: ci${{ github.run_id }} -# if: ${{ always() && steps.provision.outcome == 'failure' && contains('not enough hosts available', steps.provision_failure.messages) }} - - -# - name: Build login and compute images -# run: | -# . venv/bin/activate -# . environments/smslabs-example/activate -# cd packer -# PACKER_LOG=1 PACKER_LOG_PATH=build.log packer build -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl -# env: -# OS_CLOUD: openstack - -# - name: Reimage compute nodes via slurm and check cluster still up -# run: | -# . venv/bin/activate -# . environments/smslabs-example/activate -# ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-compute.yml -# ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml -# env: -# OS_CLOUD: openstack - -# - name: Reimage login nodes via openstack and check cluster still up -# run: | -# . venv/bin/activate -# . environments/smslabs-example/activate -# ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-login.yml -# ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml -# env: -# OS_CLOUD: openstack - -# - name: Run MPI-based tests, triggering autoscaling -# run: | -# . venv/bin/activate -# . environments/smslabs-example/activate -# ansible-playbook -vv ansible/adhoc/hpctests.yml -# env: -# ANSIBLE_FORCE_COLOR: True - - destroy: - needs: - - deploy - - directdeploy - - reimage - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - - name: Install ansible etc - run: dev/setup-env.sh - - - name: Install terraform - uses: hashicorp/setup-terraform@v1 - - - name: Initialise terraform - run: terraform init - working-directory: ${{ github.workspace }}/environments/smslabs-example/terraform - - - name: Write clouds.yaml + - name: Run MPI-based tests, triggering autoscaling run: | - mkdir -p ~/.config/openstack/ - echo "$CLOUDS_YAML" > ~/.config/openstack/clouds.yaml - shell: bash + . venv/bin/activate + . environments/arcus/activate + ansible-playbook -vv ansible/adhoc/hpctests.yml env: - CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }} - - - name: Download terraform state - uses: actions/download-artifact@v2 - with: - name: tfstate - path: ${{ env.APPLIANCES_ENVIRONMENT_ROOT }}/terraform - + ANSIBLE_FORCE_COLOR: True + - name: Delete infrastructure run: | . venv/bin/activate - . environments/smslabs-example/activate + . environments/arcus/activate cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform destroy -auto-approve env: @@ -309,4 +134,4 @@ jobs: TF_VAR_cluster_name: ci${{ github.run_id }} if: ${{ success() || cancelled() }} -# # TODO: delete images! \ No newline at end of file +# TODO: delete images! 
\ No newline at end of file From 5f627c20bc7aa05879dd010870a2d9a6d17eb41a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 28 Feb 2022 16:13:37 +0000 Subject: [PATCH 097/133] add /etc/hosts creation to arcus as no working DNS --- environments/arcus/hooks/pre.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/environments/arcus/hooks/pre.yml b/environments/arcus/hooks/pre.yml index ec26e6978..536e855f9 100644 --- a/environments/arcus/hooks/pre.yml +++ b/environments/arcus/hooks/pre.yml @@ -1,8 +1,27 @@ - hosts: all become: true + tags: squid tasks: - name: Configure yum proxy lineinfile: path: /etc/yum.conf regexp: '^proxy=http://10\.60\.102\.179:3128' line: 'proxy=http://10.60.102.179:3128' + +- hosts: all + become: true + tags: etc_hosts + tasks: + - name: Generate /etc/hosts file content + # which interface is used as ansible_host is defined by terraform (see `access_network`) so this is deterministic for multi-rail hosts + set_fact: + etc_hosts_content: | + {% for host in groups['cluster'] %}{{ hostvars[host]['ansible_host'] }} {{ host }}.novalocal {{ host }} + {% endfor %} + run_once: true + - name: Create /etc/hosts for all nodes + blockinfile: + path: /etc/hosts + create: yes + state: present + block: "{{ hostvars[ansible_play_hosts | first].etc_hosts_content }}" From e5c0f459a0ad71bdfb98992b1be181b06e07984b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 28 Feb 2022 20:13:39 +0000 Subject: [PATCH 098/133] create /etc/hosts from port info in arcus CI --- environments/arcus/hooks/pre.yml | 9 +-------- environments/arcus/terraform/inventory.tf | 1 + environments/arcus/terraform/inventory.tpl | 1 + 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/environments/arcus/hooks/pre.yml b/environments/arcus/hooks/pre.yml index 536e855f9..d2be77261 100644 --- a/environments/arcus/hooks/pre.yml +++ b/environments/arcus/hooks/pre.yml @@ -12,16 +12,9 @@ become: true tags: etc_hosts tasks: - - name: Generate /etc/hosts file content - # which interface is used as ansible_host is defined by terraform (see `access_network`) so this is deterministic for multi-rail hosts - set_fact: - etc_hosts_content: | - {% for host in groups['cluster'] %}{{ hostvars[host]['ansible_host'] }} {{ host }}.novalocal {{ host }} - {% endfor %} - run_once: true - name: Create /etc/hosts for all nodes blockinfile: path: /etc/hosts create: yes state: present - block: "{{ hostvars[ansible_play_hosts | first].etc_hosts_content }}" + block: "{{ appliance_addresses | from_json | to_nice_yaml | replace(':', '') }}" diff --git a/environments/arcus/terraform/inventory.tf b/environments/arcus/terraform/inventory.tf index e85c5dcc4..b7eeeb2d8 100644 --- a/environments/arcus/terraform/inventory.tf +++ b/environments/arcus/terraform/inventory.tf @@ -7,6 +7,7 @@ resource "local_file" "hosts" { "computes": openstack_compute_instance_v2.compute, "compute_types": var.compute_types, "compute_nodes": var.compute_nodes, + "ports": openstack_networking_port_v2.rdma }, ) filename = "../inventory/hosts" diff --git a/environments/arcus/terraform/inventory.tpl b/environments/arcus/terraform/inventory.tpl index 2a3ec5c1d..ba95d568a 100644 --- a/environments/arcus/terraform/inventory.tpl +++ b/environments/arcus/terraform/inventory.tpl @@ -1,6 +1,7 @@ [all:vars] ansible_user=rocky openhpc_cluster_name=${cluster_name} +appliance_addresses='${jsonencode({for portname, port in ports: port.all_fixed_ips[0] => join("-", [cluster_name, portname]) })}' [control] ${control.name} ansible_host=${[for n in 
control.network: n.fixed_ip_v4 if n.access_network][0]} server_networks='${jsonencode({for net in control.network: net.name => [ net.fixed_ip_v4 ] })}' From bcc10149fd180aed003e101b754a6aedd64c0bd4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 28 Feb 2022 20:18:04 +0000 Subject: [PATCH 099/133] try to set OS_CLOUD correctly in arcus CI --- .github/workflows/arcus.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/arcus.yml b/.github/workflows/arcus.yml index 6135a5c7e..32d10e992 100644 --- a/.github/workflows/arcus.yml +++ b/.github/workflows/arcus.yml @@ -86,6 +86,7 @@ jobs: ansible-playbook ansible/adhoc/generate-passwords.yml ansible-playbook -vv ansible/site.yml env: + OS_CLOUD: openstack ANSIBLE_FORCE_COLOR: True - name: Build login and compute images @@ -105,6 +106,7 @@ jobs: ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml env: OS_CLOUD: openstack + ANSIBLE_FORCE_COLOR: True - name: Reimage login nodes via openstack and check cluster still up run: | @@ -114,6 +116,7 @@ jobs: ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml env: OS_CLOUD: openstack + ANSIBLE_FORCE_COLOR: True - name: Run MPI-based tests, triggering autoscaling run: | @@ -130,7 +133,6 @@ jobs: cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform destroy -auto-approve env: - OS_CLOUD: openstack TF_VAR_cluster_name: ci${{ github.run_id }} if: ${{ success() || cancelled() }} From d9c267ced5db42127b087989bd7b98a6270c157c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 28 Feb 2022 20:38:11 +0000 Subject: [PATCH 100/133] fix autoscale partition definition in arcus CI --- .../arcus/inventory/group_vars/openhpc/overrides.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/environments/arcus/inventory/group_vars/openhpc/overrides.yml b/environments/arcus/inventory/group_vars/openhpc/overrides.yml index a8d82a032..888ec5be0 100644 --- a/environments/arcus/inventory/group_vars/openhpc/overrides.yml +++ b/environments/arcus/inventory/group_vars/openhpc/overrides.yml @@ -3,9 +3,10 @@ openhpc_config_extra: SlurmdDebug: debug openhpc_slurm_partitions: - name: small - cloud_nodes: autoscale-compute-[2-3] + ram_mb: "{{ 808 * 0.9 | int }}" # from free --mebi + cloud_nodes: "{{ openhpc_cluster_name }}-compute-[2-3]" # see TF variable 'cloud_nodes' cloud_instances: # TODO: can we somehow check these when templating?? - flavor: general.v1.tiny + flavor: vm.alaska.cpu.general.tiny image: "{{ openhpc_autoscale_image | default('IMAGE_PLACEHOLDER') }}" keypair: slurm-app-ci - network: "{{ server_networks.keys() | first }}" # TODO: bit hacky?? 
+ network: "{{ hostvars[groups['control'] | first]['server_networks'].keys() | first }}" # Defined in inventory, so only defined for control during Packer build From 1490816fbf3b20bb999b1ed0e1a191889c2a79ee Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 1 Mar 2022 11:39:58 +0000 Subject: [PATCH 101/133] fix arcus partition definition --- environments/arcus/inventory/group_vars/openhpc/overrides.yml | 2 +- environments/common/inventory/group_vars/all/openhpc.yml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/environments/arcus/inventory/group_vars/openhpc/overrides.yml b/environments/arcus/inventory/group_vars/openhpc/overrides.yml index 888ec5be0..ae0bc1d63 100644 --- a/environments/arcus/inventory/group_vars/openhpc/overrides.yml +++ b/environments/arcus/inventory/group_vars/openhpc/overrides.yml @@ -2,7 +2,7 @@ openhpc_config_extra: SlurmctldDebug: debug SlurmdDebug: debug openhpc_slurm_partitions: -- name: small +- name: tiny ram_mb: "{{ 808 * 0.9 | int }}" # from free --mebi cloud_nodes: "{{ openhpc_cluster_name }}-compute-[2-3]" # see TF variable 'cloud_nodes' cloud_instances: # TODO: can we somehow check these when templating?? diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 0b3912622..f757eb04e 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -28,4 +28,3 @@ openhpc_config_default: - enable_configless openhpc_config_extra: {} openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}" -openhpc_ram_multiplier: 0.90 # TODO: DOCS: needs to be available to stackhpc.slurm_openstack_tools.autoscale role, plus lowered a bit to cope with autoscale problems From 453eac6ab300ba97f5810815d7cc00892fcf9b36 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 1 Mar 2022 11:41:59 +0000 Subject: [PATCH 102/133] use branch of ansible slurm/openstack tools which enforces ram_mb defined --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index 8393938fe..7d1fd6ff1 100644 --- a/requirements.yml +++ b/requirements.yml @@ -21,5 +21,5 @@ collections: - name: community.grafana - name: https://github.com/stackhpc/ansible_collection_slurm_openstack_tools type: git - version: v0.2.0 + version: feature/autoscale2 ... 
From cd9c777edcd7c491bd41cfc96c3adadd7a763a52 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 1 Mar 2022 15:13:02 +0000 Subject: [PATCH 103/133] autodetect partition name in arcus CI check --- environments/arcus/hooks/post.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/arcus/hooks/post.yml b/environments/arcus/hooks/post.yml index e764f99fc..27c66f2e3 100644 --- a/environments/arcus/hooks/post.yml +++ b/environments/arcus/hooks/post.yml @@ -19,7 +19,7 @@ vars: expected_sinfo: - - "{{ openhpc_cluster_name }}-compute-[0-1] small* up 60-00:00:00 2 idle" - - "{{ openhpc_cluster_name }}-compute-[2-3] small* up 60-00:00:00 2 idle~" + - "{{ openhpc_cluster_name }}-compute-[0-1] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle" + - "{{ openhpc_cluster_name }}-compute-[2-3] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle~" when: "'builder' not in group_names" # won't have a slurm control daemon when in build From b89dc37ae19fdcc47793057b489f346a9af3db62 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 1 Mar 2022 15:13:57 +0000 Subject: [PATCH 104/133] fix arcus memory definition --- environments/arcus/inventory/group_vars/openhpc/overrides.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/arcus/inventory/group_vars/openhpc/overrides.yml b/environments/arcus/inventory/group_vars/openhpc/overrides.yml index ae0bc1d63..9fd130eb1 100644 --- a/environments/arcus/inventory/group_vars/openhpc/overrides.yml +++ b/environments/arcus/inventory/group_vars/openhpc/overrides.yml @@ -3,7 +3,7 @@ openhpc_config_extra: SlurmdDebug: debug openhpc_slurm_partitions: - name: tiny - ram_mb: "{{ 808 * 0.9 | int }}" # from free --mebi + ram_mb: "{{ (808 * 0.9) | int }}" # from free --mebi cloud_nodes: "{{ openhpc_cluster_name }}-compute-[2-3]" # see TF variable 'cloud_nodes' cloud_instances: # TODO: can we somehow check these when templating?? flavor: vm.alaska.cpu.general.tiny From 55549667354225aeaaa061da32d5df2dac3a6e53 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 2 Mar 2022 10:07:14 +0000 Subject: [PATCH 105/133] use larger arcus flavor as runing out of memory --- .../arcus/inventory/group_vars/openhpc/overrides.yml | 4 ++-- environments/arcus/terraform/variables.tf | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/environments/arcus/inventory/group_vars/openhpc/overrides.yml b/environments/arcus/inventory/group_vars/openhpc/overrides.yml index 9fd130eb1..d94d26944 100644 --- a/environments/arcus/inventory/group_vars/openhpc/overrides.yml +++ b/environments/arcus/inventory/group_vars/openhpc/overrides.yml @@ -2,11 +2,11 @@ openhpc_config_extra: SlurmctldDebug: debug SlurmdDebug: debug openhpc_slurm_partitions: -- name: tiny +- name: small ram_mb: "{{ (808 * 0.9) | int }}" # from free --mebi cloud_nodes: "{{ openhpc_cluster_name }}-compute-[2-3]" # see TF variable 'cloud_nodes' cloud_instances: # TODO: can we somehow check these when templating?? 
- flavor: vm.alaska.cpu.general.tiny + flavor: vm.alaska.cpu.general.small image: "{{ openhpc_autoscale_image | default('IMAGE_PLACEHOLDER') }}" keypair: slurm-app-ci network: "{{ hostvars[groups['control'] | first]['server_networks'].keys() | first }}" # Defined in inventory, so only defined for control during Packer build diff --git a/environments/arcus/terraform/variables.tf b/environments/arcus/terraform/variables.tf index 7ac093dca..ca55e4327 100644 --- a/environments/arcus/terraform/variables.tf +++ b/environments/arcus/terraform/variables.tf @@ -25,7 +25,7 @@ variable "control_node" { type = map description = "Mapping {flavor: flavor_name, image: image_name_or_id }" default = { - flavor: "vm.alaska.cpu.general.tiny" + flavor: "vm.alaska.cpu.general.small" image: "RockyLinux-8.5-20211114.2" } } @@ -45,8 +45,8 @@ variable "compute_types" { type = map description = "Mapping defining types of compute nodes: key -> (str) name of type, value -> mapping {flavor: flavor_name, image: image_name_or_id }" default = { - tiny: { - flavor: "vm.alaska.cpu.general.tiny" + small: { + flavor: "vm.alaska.cpu.general.small" image: "RockyLinux-8.5-20211114.2" } } @@ -56,8 +56,8 @@ variable "compute_nodes" { type = map(string) description = "Mapping of compute nodename suffix -> key in compute_types" default = { - compute-0: "tiny" - compute-1: "tiny" + compute-0: "small" + compute-1: "small" } } From c1c18697db40496ad8565025b148e610e73a3743 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 2 Mar 2022 10:24:52 +0000 Subject: [PATCH 106/133] add validation of rebuild --- ansible/validate.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ansible/validate.yml b/ansible/validate.yml index 741a524d8..a9c3b939e 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -45,3 +45,13 @@ name: stackhpc.slurm_openstack_tools.rebuild tasks_from: validate.yml tags: validate + +- name: Validate autoscale configuration + hosts: openstack_autoscale + gather_facts: false + tags: openstack_autoscale + tasks: + - import_role: + name: stackhpc.slurm_openstack_tools.autoscale + tasks_from: validate.yml + tags: validate From 54e4c8758408ac35ede69d87ae215d5e96fd39c2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 2 Mar 2022 12:34:47 +0000 Subject: [PATCH 107/133] change pytools version during dev --- environments/arcus/inventory/group_vars/all/pytools.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 environments/arcus/inventory/group_vars/all/pytools.yml diff --git a/environments/arcus/inventory/group_vars/all/pytools.yml b/environments/arcus/inventory/group_vars/all/pytools.yml new file mode 100644 index 000000000..87f5a5d67 --- /dev/null +++ b/environments/arcus/inventory/group_vars/all/pytools.yml @@ -0,0 +1 @@ +pytools_gitref: feature/resumefail From a84114990b40187bac9ea46c3b04d0f69e66b9df Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 2 Mar 2022 13:49:41 +0000 Subject: [PATCH 108/133] fix Packer vars for arcus CI --- environments/arcus/builder.pkrvars.hcl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/environments/arcus/builder.pkrvars.hcl b/environments/arcus/builder.pkrvars.hcl index 059f49afb..35c05c1e0 100644 --- a/environments/arcus/builder.pkrvars.hcl +++ b/environments/arcus/builder.pkrvars.hcl @@ -1,7 +1,7 @@ -flavor = "general.v1.tiny" -networks = ["c245901d-6b84-4dc4-b02b-eec0fb6122b2"] # stackhpc-ci-geneve -source_image_name = "Rocky-8-GenericCloud-8.5-20211114.2.x86_64" +flavor = "vm.alaska.cpu.general.small" +networks = 
["a262aabd-e6bf-4440-a155-13dbc1b5db0e"] # WCDC-iLab-60 +source_image_name = "RockyLinux-8.5-20211114.2" ssh_keypair_name = "slurm-app-ci" security_groups = ["default", "SSH"] -ssh_bastion_host = "185.45.78.150" +ssh_bastion_host = "128.232.222.183" ssh_bastion_username = "slurm-app-ci" From bff9f7cf1743082ad985ed4fed4268d1d0a3d8eb Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 2 Mar 2022 14:46:32 +0000 Subject: [PATCH 109/133] remove port creation in arcus CI to make failure cleanup easier --- environments/arcus/hooks/pre.yml | 6 +++++- environments/arcus/terraform/inventory.tf | 1 - environments/arcus/terraform/inventory.tpl | 1 - environments/arcus/terraform/network.tf | 14 -------------- environments/arcus/terraform/nodes.tf | 6 +++--- 5 files changed, 8 insertions(+), 20 deletions(-) diff --git a/environments/arcus/hooks/pre.yml b/environments/arcus/hooks/pre.yml index d2be77261..3024c63b4 100644 --- a/environments/arcus/hooks/pre.yml +++ b/environments/arcus/hooks/pre.yml @@ -17,4 +17,8 @@ path: /etc/hosts create: yes state: present - block: "{{ appliance_addresses | from_json | to_nice_yaml | replace(':', '') }}" + block: "{{ host_ips | zip(host_names) | map('join', ' ') | join('\n') }}" + vars: + host_ips: "{{ groups['all'] | map('extract', hostvars, 'ansible_host') | list }}" + host_names: "{{ groups['all'] | map('extract', hostvars, 'inventory_hostname') | list }}" + diff --git a/environments/arcus/terraform/inventory.tf b/environments/arcus/terraform/inventory.tf index b7eeeb2d8..e85c5dcc4 100644 --- a/environments/arcus/terraform/inventory.tf +++ b/environments/arcus/terraform/inventory.tf @@ -7,7 +7,6 @@ resource "local_file" "hosts" { "computes": openstack_compute_instance_v2.compute, "compute_types": var.compute_types, "compute_nodes": var.compute_nodes, - "ports": openstack_networking_port_v2.rdma }, ) filename = "../inventory/hosts" diff --git a/environments/arcus/terraform/inventory.tpl b/environments/arcus/terraform/inventory.tpl index ba95d568a..2a3ec5c1d 100644 --- a/environments/arcus/terraform/inventory.tpl +++ b/environments/arcus/terraform/inventory.tpl @@ -1,7 +1,6 @@ [all:vars] ansible_user=rocky openhpc_cluster_name=${cluster_name} -appliance_addresses='${jsonencode({for portname, port in ports: port.all_fixed_ips[0] => join("-", [cluster_name, portname]) })}' [control] ${control.name} ansible_host=${[for n in control.network: n.fixed_ip_v4 if n.access_network][0]} server_networks='${jsonencode({for net in control.network: net.name => [ net.fixed_ip_v4 ] })}' diff --git a/environments/arcus/terraform/network.tf b/environments/arcus/terraform/network.tf index 68f7c92a0..8ba285f67 100644 --- a/environments/arcus/terraform/network.tf +++ b/environments/arcus/terraform/network.tf @@ -6,17 +6,3 @@ data "openstack_networking_subnet_v2" "cluster_subnet" { name = var.cluster_subnet } - -resource "openstack_networking_port_v2" "rdma" { - - for_each = toset(concat(["control"], keys(var.login_nodes), keys(var.compute_nodes), var.cloud_nodes)) - - name = "${var.cluster_name}-${each.key}" - network_id = data.openstack_networking_network_v2.cluster_net.id - admin_state_up = "true" - - binding { - vnic_type = "direct" - } - -} diff --git a/environments/arcus/terraform/nodes.tf b/environments/arcus/terraform/nodes.tf index db4f4762d..d9e9d1852 100644 --- a/environments/arcus/terraform/nodes.tf +++ b/environments/arcus/terraform/nodes.tf @@ -9,7 +9,7 @@ resource "openstack_compute_instance_v2" "control" { security_groups = ["default", "SSH"] network { - port = 
openstack_networking_port_v2.rdma["control"].id + name = var.cluster_net access_network = true } @@ -27,7 +27,7 @@ resource "openstack_compute_instance_v2" "login" { security_groups = ["default", "SSH"] network { - port = openstack_networking_port_v2.rdma[each.key].id + name = var.cluster_net access_network = true } @@ -45,7 +45,7 @@ resource "openstack_compute_instance_v2" "compute" { security_groups = ["default", "SSH"] network { - port = openstack_networking_port_v2.rdma[each.key].id + name = var.cluster_net access_network = true } From 4da09c87e60aa99ce7faf6351a1ab1045ce25659 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 2 Mar 2022 16:22:07 +0000 Subject: [PATCH 110/133] remove debugging end from CI --- environments/arcus/ci/reimage-compute.yml | 1 - environments/smslabs-example/ci/reimage-compute.yml | 1 - 2 files changed, 2 deletions(-) diff --git a/environments/arcus/ci/reimage-compute.yml b/environments/arcus/ci/reimage-compute.yml index 42989800a..f8dd05214 100644 --- a/environments/arcus/ci/reimage-compute.yml +++ b/environments/arcus/ci/reimage-compute.yml @@ -20,7 +20,6 @@ content: | openhpc_autoscale_image: {{ compute_build.artifact_id }} delegate_to: localhost - - meta: end_here - name: Request compute node rebuild via Slurm shell: diff --git a/environments/smslabs-example/ci/reimage-compute.yml b/environments/smslabs-example/ci/reimage-compute.yml index 42989800a..f8dd05214 100644 --- a/environments/smslabs-example/ci/reimage-compute.yml +++ b/environments/smslabs-example/ci/reimage-compute.yml @@ -20,7 +20,6 @@ content: | openhpc_autoscale_image: {{ compute_build.artifact_id }} delegate_to: localhost - - meta: end_here - name: Request compute node rebuild via Slurm shell: From a44d7bb296cb37d717b94e7485716ebc0b1ab02f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 3 Mar 2022 10:10:45 +0000 Subject: [PATCH 111/133] make arcus login node flavor match packer builder size --- environments/arcus/terraform/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/arcus/terraform/variables.tf b/environments/arcus/terraform/variables.tf index ca55e4327..45e0b52d0 100644 --- a/environments/arcus/terraform/variables.tf +++ b/environments/arcus/terraform/variables.tf @@ -35,7 +35,7 @@ variable "login_nodes" { description = "Mapping defining login nodes: key -> (str) nodename suffix, value -> mapping {flavor: flavor_name, image: image_name_or_id }" default = { login-0: { - flavor: "vm.alaska.cpu.general.tiny" + flavor: "vm.alaska.cpu.general.small" image: "RockyLinux-8.5-20211114.2" } } From 56358132f4473b85fddace7c771304fefc3a266c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 3 Mar 2022 14:23:41 +0000 Subject: [PATCH 112/133] increase timeout after reimage for arcus CI --- environments/arcus/ci/reimage-compute.yml | 1 + environments/arcus/ci/reimage-login.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/environments/arcus/ci/reimage-compute.yml b/environments/arcus/ci/reimage-compute.yml index f8dd05214..424aedf83 100644 --- a/environments/arcus/ci/reimage-compute.yml +++ b/environments/arcus/ci/reimage-compute.yml @@ -42,3 +42,4 @@ tasks: - name: Wait for nodes to boot wait_for_connection: + timeout: 800 diff --git a/environments/arcus/ci/reimage-login.yml b/environments/arcus/ci/reimage-login.yml index f76f6e8d0..db39bd941 100644 --- a/environments/arcus/ci/reimage-login.yml +++ b/environments/arcus/ci/reimage-login.yml @@ -21,3 +21,4 @@ - name: Wait for connection wait_for_connection: + timeout: 800 From 
bf2e8aec66f0f0446f16f835c688865b496a4950 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 3 Mar 2022 20:52:27 +0000 Subject: [PATCH 113/133] do image build in parallel in arcus CI --- .github/workflows/arcus.yml | 32 ++------ environments/arcus/ci/reimage-compute.yml | 45 ------------ environments/arcus/ci/reimage-login.yml | 24 ------ environments/arcus/hooks/check_slurm.yml | 21 ++++++ environments/arcus/hooks/post.yml | 90 +++++++++++++++++------ environments/arcus/hooks/pre.yml | 22 +++++- environments/arcus/hooks/update_image.yml | 22 ++++++ 7 files changed, 139 insertions(+), 117 deletions(-) delete mode 100644 environments/arcus/ci/reimage-compute.yml delete mode 100644 environments/arcus/ci/reimage-login.yml create mode 100644 environments/arcus/hooks/check_slurm.yml create mode 100644 environments/arcus/hooks/update_image.yml diff --git a/.github/workflows/arcus.yml b/.github/workflows/arcus.yml index 32d10e992..d868e7754 100644 --- a/.github/workflows/arcus.yml +++ b/.github/workflows/arcus.yml @@ -78,7 +78,8 @@ jobs: TF_VAR_cluster_name: ci${{ github.run_id }} if: ${{ always() && steps.provision.outcome == 'failure' && contains('not enough hosts available', steps.provision_failure.messages) }} - - name: Configure infrastructure + - name: Directly configure cluster and build/test images + # see post-hook for the image build/test run: | . venv/bin/activate . environments/arcus/activate @@ -89,36 +90,17 @@ jobs: OS_CLOUD: openstack ANSIBLE_FORCE_COLOR: True - - name: Build login and compute images + - name: Update cloud image and reconfigure Slurm run: | . venv/bin/activate - . environments/arcus/activate - cd packer - PACKER_LOG=1 PACKER_LOG_PATH=build.log packer build -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - env: - OS_CLOUD: openstack - - - name: Reimage compute nodes via slurm and check cluster still up - run: | - . venv/bin/activate - . environments/arcus/activate - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-compute.yml - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml + . environments/smslabs-example/activate + ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/update_image.yml + ansible-playbook -vv ansible/slurm.yml --tags openhpc --skip-tags install env: - OS_CLOUD: openstack ANSIBLE_FORCE_COLOR: True - - - name: Reimage login nodes via openstack and check cluster still up - run: | - . venv/bin/activate - . environments/arcus/activate - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-login.yml - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml - env: OS_CLOUD: openstack - ANSIBLE_FORCE_COLOR: True - - name: Run MPI-based tests, triggering autoscaling + - name: Run MPI-based tests (triggers autoscaling) run: | . venv/bin/activate . 
environments/arcus/activate diff --git a/environments/arcus/ci/reimage-compute.yml b/environments/arcus/ci/reimage-compute.yml deleted file mode 100644 index 424aedf83..000000000 --- a/environments/arcus/ci/reimage-compute.yml +++ /dev/null @@ -1,45 +0,0 @@ -# Reimage compute nodes via Slurm with latest packer-build images - -- hosts: login[0] - become: no - tasks: - - name: Read packer build manifest - set_fact: - manifest: "{{ lookup('file', manifest_path) | from_json }}" - vars: - manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" - delegate_to: localhost - - - name: Get latest compute image build - set_fact: - compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" - - - name: Add compute image ID to autoscale definition - copy: - dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/openhpc/autoscale.yml" - content: | - openhpc_autoscale_image: {{ compute_build.artifact_id }} - delegate_to: localhost - - - name: Request compute node rebuild via Slurm - shell: - cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1] - become: true - - - name: Check compute node rebuild completed - shell: - cmd: openstack server show {{ item }} --format value -c image - register: openstack_server - loop: "{{ groups['compute'] }}" - retries: 5 - delay: 30 - until: compute_build.artifact_id in openstack_server.stdout - delegate_to: localhost - -- hosts: compute - become: no - gather_facts: no - tasks: - - name: Wait for nodes to boot - wait_for_connection: - timeout: 800 diff --git a/environments/arcus/ci/reimage-login.yml b/environments/arcus/ci/reimage-login.yml deleted file mode 100644 index db39bd941..000000000 --- a/environments/arcus/ci/reimage-login.yml +++ /dev/null @@ -1,24 +0,0 @@ -# Reimage login nodes via OpenStack - -- hosts: login - become: no - tasks: - - name: Read packer build manifest - set_fact: - manifest: "{{ lookup('file', manifest_path) | from_json }}" - vars: - manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" - delegate_to: localhost - - - name: Get latest login image build - set_fact: - login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}" - - - name: Reimage node via openstack - shell: - cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ login_build.artifact_id }}" - delegate_to: localhost - - - name: Wait for connection - wait_for_connection: - timeout: 800 diff --git a/environments/arcus/hooks/check_slurm.yml b/environments/arcus/hooks/check_slurm.yml new file mode 100644 index 000000000..b2ae67c7b --- /dev/null +++ b/environments/arcus/hooks/check_slurm.yml @@ -0,0 +1,21 @@ +- name: Run sinfo + shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name + register: sinfo + changed_when: false + until: "'boot' not in sinfo.stdout_lines" + retries: 5 + delay: 10 +- name: Check nodes have expected slurm state + assert: + that: sinfo.stdout_lines == expected_sinfo + fail_msg: | + sinfo output not as expected: + actual: + {{ sinfo.stdout_lines }} + expected: + {{ expected_sinfo }} + + vars: + expected_sinfo: + - "{{ openhpc_cluster_name }}-compute-[0-1] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle" + - "{{ openhpc_cluster_name 
}}-compute-[2-3] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle~" diff --git a/environments/arcus/hooks/post.yml b/environments/arcus/hooks/post.yml index 27c66f2e3..ac61cc2ac 100644 --- a/environments/arcus/hooks/post.yml +++ b/environments/arcus/hooks/post.yml @@ -1,25 +1,73 @@ -- hosts: login +- hosts: login:!builder # won't have a slurm control daemon when in build become: no gather_facts: false tasks: - - block: - - name: Run sinfo - shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name - register: sinfo - changed_when: false - - name: Check nodes have expected slurm state - assert: - that: sinfo.stdout_lines == expected_sinfo - fail_msg: | - sinfo output not as expected: - actual: - {{ sinfo.stdout_lines }} - expected: - {{ expected_sinfo }} - - vars: - expected_sinfo: - - "{{ openhpc_cluster_name }}-compute-[0-1] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle" - - "{{ openhpc_cluster_name }}-compute-[2-3] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle~" + - name: Check slurm up after direct deploy + import_tasks: check_slurm.yml - when: "'builder' not in group_names" # won't have a slurm control daemon when in build +- hosts: localhost + become: false + tags: build + tasks: + - name: Check Packer build finished + async_status: + jid: "{{ packer_run.ansible_job_id }}" + register: packer_result + until: packer_result.finished + retries: 30 # allow 15 mins + delay: 30 + when: packer_run is defined # allows rerunning post.yml + +- hosts: login:!builder + become: no + tasks: + + - name: Reimage login node via openstack + shell: + cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ login_build.artifact_id }}" + delegate_to: localhost + + - name: Check login node rebuild completed + shell: + cmd: openstack server show {{ inventory_hostname }} --format value -c image + register: openstack_login + delegate_to: localhost + retries: 5 + delay: 30 + until: login_build.artifact_id in openstack_login.stdout + changed_when: false + + - name: Wait for login connection + wait_for_connection: + timeout: 800 + + - name: Check slurm up after reimaging login node + import_tasks: check_slurm.yml + + - name: Request compute node rebuild via Slurm + shell: + cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1] + become: yes + + - name: Check compute node rebuild completed + shell: + cmd: openstack server show {{ item }} --format value -c image + register: openstack_compute + delegate_to: localhost + loop: "{{ groups['compute'] }}" + retries: 5 + delay: 30 + until: compute_build.artifact_id in openstack_compute.stdout + changed_when: false + +- hosts: compute:!builder + become: no + gather_facts: no + tasks: + - name: Wait for compute connection + wait_for_connection: + timeout: 800 + + - name: Check slurm up after reimaging login node + import_tasks: check_slurm.yml + run_once: true diff --git a/environments/arcus/hooks/pre.yml b/environments/arcus/hooks/pre.yml index 3024c63b4..739b65fea 100644 --- a/environments/arcus/hooks/pre.yml +++ b/environments/arcus/hooks/pre.yml @@ -1,3 +1,22 @@ +- hosts: localhost + become: false + tags: build + tasks: + - name: Ensure secrets generated + include_role: + name: passwords + + - name: Build packer images + shell: + cmd: | + cd packer + PACKER_LOG=1 packer build -on-error=ask 
-var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl + chdir: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}" + when: "'builder' not in group_names" # avoid recursion! + register: packer_run + async: 2700 # 45 minutes + poll: 0 + - hosts: all become: true tags: squid @@ -12,7 +31,7 @@ become: true tags: etc_hosts tasks: - - name: Create /etc/hosts for all nodes + - name: Create /etc/hosts for all nodes as DNS doesn't work blockinfile: path: /etc/hosts create: yes @@ -21,4 +40,3 @@ vars: host_ips: "{{ groups['all'] | map('extract', hostvars, 'ansible_host') | list }}" host_names: "{{ groups['all'] | map('extract', hostvars, 'inventory_hostname') | list }}" - diff --git a/environments/arcus/hooks/update_image.yml b/environments/arcus/hooks/update_image.yml new file mode 100644 index 000000000..a171d4de8 --- /dev/null +++ b/environments/arcus/hooks/update_image.yml @@ -0,0 +1,22 @@ +- hosts: localhost + become: no + tasks: + - name: Read packer build manifest + set_fact: + manifest: "{{ lookup('file', manifest_path) | from_json }}" + vars: + manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" + delegate_to: localhost + + - name: Get latest image builds + set_fact: + login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}" + compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" + + - name: Add compute image ID to autoscale definition (for later autoscaling tests) + copy: + dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/openhpc/autoscale.yml" + content: | + openhpc_autoscale_image: {{ compute_build.artifact_id }} + delegate_to: localhost + run_once: true From 15152d56c161c23ec3c28bf3833f37708a3763c6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 4 Mar 2022 15:09:33 +0000 Subject: [PATCH 114/133] separate image rebuild from site.yml in arcus CI --- .github/workflows/arcus.yml | 16 ++++- environments/arcus/ci/test_reimage.yml | 64 +++++++++++++++++++ .../update_cloudnode_image.yml} | 0 environments/arcus/hooks/post.yml | 54 ---------------- 4 files changed, 77 insertions(+), 57 deletions(-) create mode 100644 environments/arcus/ci/test_reimage.yml rename environments/arcus/{hooks/update_image.yml => ci/update_cloudnode_image.yml} (100%) diff --git a/.github/workflows/arcus.yml b/.github/workflows/arcus.yml index d868e7754..841687289 100644 --- a/.github/workflows/arcus.yml +++ b/.github/workflows/arcus.yml @@ -78,8 +78,8 @@ jobs: TF_VAR_cluster_name: ci${{ github.run_id }} if: ${{ always() && steps.provision.outcome == 'failure' && contains('not enough hosts available', steps.provision_failure.messages) }} - - name: Directly configure cluster and build/test images - # see post-hook for the image build/test + - name: Directly configure cluster and build compute + login images + # see pre-hook for the image build run: | . venv/bin/activate . environments/arcus/activate @@ -90,11 +90,21 @@ jobs: OS_CLOUD: openstack ANSIBLE_FORCE_COLOR: True + - name: Test reimage of login and compute nodes + run: | + . venv/bin/activate + . environments/arcus/activate + ansible all -m wait_for_connection + ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/test_reimage.yml + env: + OS_CLOUD: openstack + ANSIBLE_FORCE_COLOR: True + - name: Update cloud image and reconfigure Slurm run: | . venv/bin/activate . 
environments/smslabs-example/activate - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/update_image.yml + ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/update_cloudnode_image.yml ansible-playbook -vv ansible/slurm.yml --tags openhpc --skip-tags install env: ANSIBLE_FORCE_COLOR: True diff --git a/environments/arcus/ci/test_reimage.yml b/environments/arcus/ci/test_reimage.yml new file mode 100644 index 000000000..baf316d22 --- /dev/null +++ b/environments/arcus/ci/test_reimage.yml @@ -0,0 +1,64 @@ +- hosts: login:!builder + become: no + tasks: + - name: Read packer build manifest + set_fact: + manifest: "{{ lookup('file', manifest_path) | from_json }}" + vars: + manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" + delegate_to: localhost + + - name: Get latest image builds + set_fact: + login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}" + compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" + + - name: Reimage login node via openstack + shell: + cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ login_build.artifact_id }}" + delegate_to: localhost + + - name: Check login node rebuild completed + shell: + cmd: openstack server show {{ inventory_hostname }} --format value -c image + register: openstack_login + delegate_to: localhost + retries: 5 + delay: 30 + until: login_build.artifact_id in openstack_login.stdout + changed_when: false + + - name: Wait for login connection + wait_for_connection: + timeout: 800 + + - name: Check slurm up after reimaging login node + import_tasks: check_slurm.yml + + - name: Request compute node rebuild via Slurm + shell: + cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1] + become: yes + + - name: Check compute node rebuild completed + shell: + cmd: openstack server show {{ item }} --format value -c image + register: openstack_compute + delegate_to: localhost + loop: "{{ groups['compute'] }}" + retries: 5 + delay: 30 + until: compute_build.artifact_id in openstack_compute.stdout + changed_when: false + +- hosts: compute:!builder + become: no + gather_facts: no + tasks: + - name: Wait for compute connection + wait_for_connection: + timeout: 800 + + - name: Check slurm up after reimaging login node + import_tasks: check_slurm.yml + run_once: true diff --git a/environments/arcus/hooks/update_image.yml b/environments/arcus/ci/update_cloudnode_image.yml similarity index 100% rename from environments/arcus/hooks/update_image.yml rename to environments/arcus/ci/update_cloudnode_image.yml diff --git a/environments/arcus/hooks/post.yml b/environments/arcus/hooks/post.yml index ac61cc2ac..15878b796 100644 --- a/environments/arcus/hooks/post.yml +++ b/environments/arcus/hooks/post.yml @@ -17,57 +17,3 @@ retries: 30 # allow 15 mins delay: 30 when: packer_run is defined # allows rerunning post.yml - -- hosts: login:!builder - become: no - tasks: - - - name: Reimage login node via openstack - shell: - cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ login_build.artifact_id }}" - delegate_to: localhost - - - name: Check login node rebuild completed - shell: - cmd: openstack server show {{ inventory_hostname }} --format value -c image - register: openstack_login - delegate_to: localhost - retries: 5 - delay: 30 - until: login_build.artifact_id in 
openstack_login.stdout - changed_when: false - - - name: Wait for login connection - wait_for_connection: - timeout: 800 - - - name: Check slurm up after reimaging login node - import_tasks: check_slurm.yml - - - name: Request compute node rebuild via Slurm - shell: - cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1] - become: yes - - - name: Check compute node rebuild completed - shell: - cmd: openstack server show {{ item }} --format value -c image - register: openstack_compute - delegate_to: localhost - loop: "{{ groups['compute'] }}" - retries: 5 - delay: 30 - until: compute_build.artifact_id in openstack_compute.stdout - changed_when: false - -- hosts: compute:!builder - become: no - gather_facts: no - tasks: - - name: Wait for compute connection - wait_for_connection: - timeout: 800 - - - name: Check slurm up after reimaging login node - import_tasks: check_slurm.yml - run_once: true From 67a3603fbadda6eb8a1830cc1ba40a5a29512400 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 4 Mar 2022 15:38:21 +0000 Subject: [PATCH 115/133] try to fix check_slurm tasks location in arcus CI --- environments/arcus/ci/test_reimage.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/arcus/ci/test_reimage.yml b/environments/arcus/ci/test_reimage.yml index baf316d22..ae459d1a9 100644 --- a/environments/arcus/ci/test_reimage.yml +++ b/environments/arcus/ci/test_reimage.yml @@ -33,7 +33,7 @@ timeout: 800 - name: Check slurm up after reimaging login node - import_tasks: check_slurm.yml + import_tasks: ../hooks/check_slurm.yml - name: Request compute node rebuild via Slurm shell: @@ -60,5 +60,5 @@ timeout: 800 - name: Check slurm up after reimaging login node - import_tasks: check_slurm.yml + import_tasks: ../hooks/check_slurm.yml run_once: true From f693eb8b3c72fbd86202b4b8923b55f6aef30662 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 4 Mar 2022 22:15:27 +0000 Subject: [PATCH 116/133] fix arcus CI bug for cloud image update --- .github/workflows/arcus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/arcus.yml b/.github/workflows/arcus.yml index 841687289..73835b1d1 100644 --- a/.github/workflows/arcus.yml +++ b/.github/workflows/arcus.yml @@ -103,7 +103,7 @@ jobs: - name: Update cloud image and reconfigure Slurm run: | . venv/bin/activate - . environments/smslabs-example/activate + . 
environments/arcus/activate ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/update_cloudnode_image.yml ansible-playbook -vv ansible/slurm.yml --tags openhpc --skip-tags install env: From 17c7a94d004d5d6fd4d7338475357da04dc9bb3b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Mon, 7 Mar 2022 16:32:37 +0000 Subject: [PATCH 117/133] use predefined RDMA-capable ports for arcus CI --- environments/arcus/hooks/pre.yml | 5 +---- .../arcus/inventory/group_vars/all/pytools.yml | 2 +- .../inventory/group_vars/openhpc/overrides.yml | 1 + environments/arcus/terraform/inventory.tf | 1 + environments/arcus/terraform/inventory.tpl | 1 + environments/arcus/terraform/network.tf | 14 ++++++++++++++ environments/arcus/terraform/nodes.tf | 6 +++--- 7 files changed, 22 insertions(+), 8 deletions(-) diff --git a/environments/arcus/hooks/pre.yml b/environments/arcus/hooks/pre.yml index 739b65fea..583858b1f 100644 --- a/environments/arcus/hooks/pre.yml +++ b/environments/arcus/hooks/pre.yml @@ -36,7 +36,4 @@ path: /etc/hosts create: yes state: present - block: "{{ host_ips | zip(host_names) | map('join', ' ') | join('\n') }}" - vars: - host_ips: "{{ groups['all'] | map('extract', hostvars, 'ansible_host') | list }}" - host_names: "{{ groups['all'] | map('extract', hostvars, 'inventory_hostname') | list }}" + block: "{{ appliance_addresses | from_json | to_nice_yaml | replace(':', '') }}" diff --git a/environments/arcus/inventory/group_vars/all/pytools.yml b/environments/arcus/inventory/group_vars/all/pytools.yml index 87f5a5d67..0fbd2452c 100644 --- a/environments/arcus/inventory/group_vars/all/pytools.yml +++ b/environments/arcus/inventory/group_vars/all/pytools.yml @@ -1 +1 @@ -pytools_gitref: feature/resumefail +pytools_gitref: feature/ports diff --git a/environments/arcus/inventory/group_vars/openhpc/overrides.yml b/environments/arcus/inventory/group_vars/openhpc/overrides.yml index d94d26944..2298c339d 100644 --- a/environments/arcus/inventory/group_vars/openhpc/overrides.yml +++ b/environments/arcus/inventory/group_vars/openhpc/overrides.yml @@ -10,3 +10,4 @@ openhpc_slurm_partitions: image: "{{ openhpc_autoscale_image | default('IMAGE_PLACEHOLDER') }}" keypair: slurm-app-ci network: "{{ hostvars[groups['control'] | first]['server_networks'].keys() | first }}" # Defined in inventory, so only defined for control during Packer build + port_prefix: "{{ openhpc_cluster_name }}-" diff --git a/environments/arcus/terraform/inventory.tf b/environments/arcus/terraform/inventory.tf index e85c5dcc4..b7eeeb2d8 100644 --- a/environments/arcus/terraform/inventory.tf +++ b/environments/arcus/terraform/inventory.tf @@ -7,6 +7,7 @@ resource "local_file" "hosts" { "computes": openstack_compute_instance_v2.compute, "compute_types": var.compute_types, "compute_nodes": var.compute_nodes, + "ports": openstack_networking_port_v2.rdma }, ) filename = "../inventory/hosts" diff --git a/environments/arcus/terraform/inventory.tpl b/environments/arcus/terraform/inventory.tpl index 2a3ec5c1d..ba95d568a 100644 --- a/environments/arcus/terraform/inventory.tpl +++ b/environments/arcus/terraform/inventory.tpl @@ -1,6 +1,7 @@ [all:vars] ansible_user=rocky openhpc_cluster_name=${cluster_name} +appliance_addresses='${jsonencode({for portname, port in ports: port.all_fixed_ips[0] => join("-", [cluster_name, portname]) })}' [control] ${control.name} ansible_host=${[for n in control.network: n.fixed_ip_v4 if n.access_network][0]} server_networks='${jsonencode({for net in control.network: net.name => [ net.fixed_ip_v4 ] })}' diff --git 
a/environments/arcus/terraform/network.tf b/environments/arcus/terraform/network.tf index 8ba285f67..68f7c92a0 100644 --- a/environments/arcus/terraform/network.tf +++ b/environments/arcus/terraform/network.tf @@ -6,3 +6,17 @@ data "openstack_networking_subnet_v2" "cluster_subnet" { name = var.cluster_subnet } + +resource "openstack_networking_port_v2" "rdma" { + + for_each = toset(concat(["control"], keys(var.login_nodes), keys(var.compute_nodes), var.cloud_nodes)) + + name = "${var.cluster_name}-${each.key}" + network_id = data.openstack_networking_network_v2.cluster_net.id + admin_state_up = "true" + + binding { + vnic_type = "direct" + } + +} diff --git a/environments/arcus/terraform/nodes.tf b/environments/arcus/terraform/nodes.tf index d9e9d1852..db4f4762d 100644 --- a/environments/arcus/terraform/nodes.tf +++ b/environments/arcus/terraform/nodes.tf @@ -9,7 +9,7 @@ resource "openstack_compute_instance_v2" "control" { security_groups = ["default", "SSH"] network { - name = var.cluster_net + port = openstack_networking_port_v2.rdma["control"].id access_network = true } @@ -27,7 +27,7 @@ resource "openstack_compute_instance_v2" "login" { security_groups = ["default", "SSH"] network { - name = var.cluster_net + port = openstack_networking_port_v2.rdma[each.key].id access_network = true } @@ -45,7 +45,7 @@ resource "openstack_compute_instance_v2" "compute" { security_groups = ["default", "SSH"] network { - name = var.cluster_net + port = openstack_networking_port_v2.rdma[each.key].id access_network = true } From ec3dcbde4798036fab531e940437a10aac5c7ae2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 8 Mar 2022 10:07:11 +0000 Subject: [PATCH 118/133] remove unused port_prefix from arcus (finds it from nodename) --- environments/arcus/inventory/group_vars/openhpc/overrides.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/environments/arcus/inventory/group_vars/openhpc/overrides.yml b/environments/arcus/inventory/group_vars/openhpc/overrides.yml index 2298c339d..d94d26944 100644 --- a/environments/arcus/inventory/group_vars/openhpc/overrides.yml +++ b/environments/arcus/inventory/group_vars/openhpc/overrides.yml @@ -10,4 +10,3 @@ openhpc_slurm_partitions: image: "{{ openhpc_autoscale_image | default('IMAGE_PLACEHOLDER') }}" keypair: slurm-app-ci network: "{{ hostvars[groups['control'] | first]['server_networks'].keys() | first }}" # Defined in inventory, so only defined for control during Packer build - port_prefix: "{{ openhpc_cluster_name }}-" From 3c5c2628dff45eaf5f8b00345644c63b4979c7f1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 8 Mar 2022 10:08:58 +0000 Subject: [PATCH 119/133] add cpu info for Arcus --- ansible/validate.yml | 10 ---------- .../arcus/inventory/group_vars/openhpc/overrides.yml | 3 +++ 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/ansible/validate.yml b/ansible/validate.yml index a9c3b939e..741a524d8 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -45,13 +45,3 @@ name: stackhpc.slurm_openstack_tools.rebuild tasks_from: validate.yml tags: validate - -- name: Validate autoscale configuration - hosts: openstack_autoscale - gather_facts: false - tags: openstack_autoscale - tasks: - - import_role: - name: stackhpc.slurm_openstack_tools.autoscale - tasks_from: validate.yml - tags: validate diff --git a/environments/arcus/inventory/group_vars/openhpc/overrides.yml b/environments/arcus/inventory/group_vars/openhpc/overrides.yml index d94d26944..4ebfe2d6d 100644 --- 
a/environments/arcus/inventory/group_vars/openhpc/overrides.yml +++ b/environments/arcus/inventory/group_vars/openhpc/overrides.yml @@ -4,6 +4,9 @@ openhpc_config_extra: openhpc_slurm_partitions: - name: small ram_mb: "{{ (808 * 0.9) | int }}" # from free --mebi + sockets: 1 + cores_per_socket: 2 + threads_per_core: 2 cloud_nodes: "{{ openhpc_cluster_name }}-compute-[2-3]" # see TF variable 'cloud_nodes' cloud_instances: # TODO: can we somehow check these when templating?? flavor: vm.alaska.cpu.general.small From c1d15f0a9c48e405d023ebe551ea4e90033a5a48 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 8 Mar 2022 11:44:06 +0000 Subject: [PATCH 120/133] tidy up arcus CI openstack after successful run --- .github/workflows/arcus.yml | 18 ++++++++++++- environments/arcus/ci/delete_images.yml | 23 ++++++++++++++++ environments/arcus/ci/wait_for_scaledown.yml | 28 ++++++++++++++++++++ 3 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 environments/arcus/ci/delete_images.yml create mode 100644 environments/arcus/ci/wait_for_scaledown.yml diff --git a/.github/workflows/arcus.yml b/.github/workflows/arcus.yml index 73835b1d1..c9099a95c 100644 --- a/.github/workflows/arcus.yml +++ b/.github/workflows/arcus.yml @@ -118,6 +118,15 @@ jobs: env: ANSIBLE_FORCE_COLOR: True + - name: Wait for CLOUD nodes to be destroyed + run: | + . venv/bin/activate + . environments/arcus/activate + ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/wait_for_scaledown.yml + env: + OS_CLOUD: openstack + ANSIBLE_FORCE_COLOR: True + - name: Delete infrastructure run: | . venv/bin/activate @@ -128,4 +137,11 @@ jobs: TF_VAR_cluster_name: ci${{ github.run_id }} if: ${{ success() || cancelled() }} -# TODO: delete images! \ No newline at end of file + - name: Delete images + run: | + . venv/bin/activate + . 
environments/arcus/activate + ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/delete_images.yml + env: + OS_CLOUD: openstack + ANSIBLE_FORCE_COLOR: True diff --git a/environments/arcus/ci/delete_images.yml b/environments/arcus/ci/delete_images.yml new file mode 100644 index 000000000..133e7d0a7 --- /dev/null +++ b/environments/arcus/ci/delete_images.yml @@ -0,0 +1,23 @@ +- hosts: login:!builder + become: no + gather_facts: no + tasks: + - name: Read packer build manifest + set_fact: + manifest: "{{ lookup('file', manifest_path) | from_json }}" + vars: + manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" + delegate_to: localhost + + - name: Get latest image builds + set_fact: + login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}" + compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" + + - name: Delete images + shell: + cmd: | + openstack image delete {{ login_build.artifact_id }} + openstack image delete {{ compute_build.artifact_id }} + delegate_to: localhost + \ No newline at end of file diff --git a/environments/arcus/ci/wait_for_scaledown.yml b/environments/arcus/ci/wait_for_scaledown.yml new file mode 100644 index 000000000..cb1c51a47 --- /dev/null +++ b/environments/arcus/ci/wait_for_scaledown.yml @@ -0,0 +1,28 @@ +- hosts: login:!builder + become: no + gather_facts: no + tasks: + - name: List CLOUD-state nodes + shell: + cmd: sinfo --noheader --Node --Format NodeList -t CLOUD + register: sinfo_cloudnodes + changed_when: false + + - name: Get SuspendTime + shell: + cmd: scontrol show config | grep '^SuspendTime ' + register: suspendtime + + - name: Wait for SuspendTime + pause: + seconds: "{{ suspendtime.stdout.split()[2] }}" + + - name: Wait for CLOUD nodes to be destroyed + shell: + cmd: "openstack server list -f value -c Name" + changed_when: false + delegate_to: localhost + register: openstack_servers + until: "sinfo_cloudnodes.stdout_lines | map('trim') | intersect(openstack_servers.stdout_lines) | length == 0" # cloud nodes aren't found in openstack_servers + retries: 10 + delay: 30 From d34600ca22de8f973e25d1b4842a5232a65658bf Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 8 Mar 2022 19:13:11 +0000 Subject: [PATCH 121/133] revert smslabs environment to main --- .../smslabs-example/ci/reimage-compute.yml | 7 ------- environments/smslabs-example/hooks/post.yml | 17 +++-------------- .../inventory/group_vars/openhpc/overrides.yml | 8 +------- environments/smslabs-example/inventory/groups | 3 --- environments/smslabs-example/terraform/nodes.tf | 4 ++-- 5 files changed, 6 insertions(+), 33 deletions(-) diff --git a/environments/smslabs-example/ci/reimage-compute.yml b/environments/smslabs-example/ci/reimage-compute.yml index f8dd05214..3efa4e47c 100644 --- a/environments/smslabs-example/ci/reimage-compute.yml +++ b/environments/smslabs-example/ci/reimage-compute.yml @@ -14,13 +14,6 @@ set_fact: compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" - - name: Add compute image ID to autoscale definition - copy: - dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/openhpc/autoscale.yml" - content: | - openhpc_autoscale_image: {{ compute_build.artifact_id }} - delegate_to: localhost - - name: Request compute node rebuild via Slurm shell: cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ 
openhpc_cluster_name }}-compute-[0-1] diff --git a/environments/smslabs-example/hooks/post.yml b/environments/smslabs-example/hooks/post.yml index e764f99fc..68303c5cb 100644 --- a/environments/smslabs-example/hooks/post.yml +++ b/environments/smslabs-example/hooks/post.yml @@ -4,22 +4,11 @@ tasks: - block: - name: Run sinfo - shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name + shell: 'sinfo --noheader --format="%N %P %a %l %D %t"' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name register: sinfo changed_when: false - name: Check nodes have expected slurm state assert: - that: sinfo.stdout_lines == expected_sinfo - fail_msg: | - sinfo output not as expected: - actual: - {{ sinfo.stdout_lines }} - expected: - {{ expected_sinfo }} - - vars: - expected_sinfo: - - "{{ openhpc_cluster_name }}-compute-[0-1] small* up 60-00:00:00 2 idle" - - "{{ openhpc_cluster_name }}-compute-[2-3] small* up 60-00:00:00 2 idle~" - + that: "(sinfo.stdout_lines[0] | split)[1:] == ['small*', 'up', '60-00:00:00', '2', 'idle']" # don't know what instance names are as have CI run ID in them + fail_msg: "sinfo output not as expected: {{ sinfo.stdout }}" when: "'builder' not in group_names" # won't have a slurm control daemon when in build diff --git a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml index a8d82a032..3585ae073 100644 --- a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml +++ b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml @@ -1,11 +1,5 @@ -openhpc_config_extra: +openhpc_config: SlurmctldDebug: debug SlurmdDebug: debug openhpc_slurm_partitions: - name: small - cloud_nodes: autoscale-compute-[2-3] - cloud_instances: # TODO: can we somehow check these when templating?? - flavor: general.v1.tiny - image: "{{ openhpc_autoscale_image | default('IMAGE_PLACEHOLDER') }}" - keypair: slurm-app-ci - network: "{{ server_networks.keys() | first }}" # TODO: bit hacky?? 
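
For reference, the cloud_instances parameters removed above (flavor, image, keypair, network) are exactly the values a Slurm ResumeProgram needs in order to boot a CLOUD node on demand. A minimal sketch of that mapping using the OpenStack SDK follows; the values and helper names are illustrative placeholders, not the implementation provided by the stackhpc.slurm_openstack_tools role:

    #!/opt/slurm-tools/bin/python3
    """Illustrative ResumeProgram sketch: boot the CLOUD nodes Slurm asks for."""
    import subprocess
    import sys
    import openstack  # openstacksdk, credentials from /etc/openstack/clouds.yaml

    def expand_nodes(hostlist_expr):
        # Slurm passes a hostlist expression such as "mycluster-compute-[2-3]"
        out = subprocess.run(["scontrol", "show", "hostnames", hostlist_expr],
                             stdout=subprocess.PIPE, universal_newlines=True, check=True)
        return out.stdout.split()

    def resume(hostlist_expr, flavor, image, keypair, network):
        conn = openstack.connect(cloud="openstack")
        for node in expand_nodes(hostlist_expr):
            # Instance name must match the Slurm node name so a SuspendProgram can find it later
            conn.create_server(name=node, image=image, flavor=flavor,
                               key_name=keypair, network=network, wait=False)

    if __name__ == "__main__":
        # Placeholder values standing in for a partition's cloud_instances definition
        resume(sys.argv[1], "general.v1.tiny", "IMAGE_PLACEHOLDER", "slurm-app-ci", "cluster-net")
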
diff --git a/environments/smslabs-example/inventory/groups b/environments/smslabs-example/inventory/groups index a7172681c..c0c0903d1 100644 --- a/environments/smslabs-example/inventory/groups +++ b/environments/smslabs-example/inventory/groups @@ -35,6 +35,3 @@ compute [update:children] cluster - -[openstack_autoscale:children] -control diff --git a/environments/smslabs-example/terraform/nodes.tf b/environments/smslabs-example/terraform/nodes.tf index 4b849f0bb..3bca7fb36 100644 --- a/environments/smslabs-example/terraform/nodes.tf +++ b/environments/smslabs-example/terraform/nodes.tf @@ -6,7 +6,7 @@ resource "openstack_compute_instance_v2" "control" { flavor_name = var.control_node.flavor key_pair = var.key_pair config_drive = true - security_groups = ["default", "SSH"] + security_groups = ["default", "ssh"] network { uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id # ensures nodes not created till subnet created @@ -24,7 +24,7 @@ resource "openstack_compute_instance_v2" "login" { flavor_name = each.value.flavor key_pair = var.key_pair config_drive = true - security_groups = ["default", "SSH"] + security_groups = ["default", "ssh"] network { uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id From 3320e8c5276a443883a4a79bba678f43cb5cb1ab Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 8 Mar 2022 19:14:25 +0000 Subject: [PATCH 122/133] remove podman override (copied from smslabs) from arcus environment --- environments/arcus/inventory/group_vars/podman/overrides.yml | 1 - 1 file changed, 1 deletion(-) delete mode 100644 environments/arcus/inventory/group_vars/podman/overrides.yml diff --git a/environments/arcus/inventory/group_vars/podman/overrides.yml b/environments/arcus/inventory/group_vars/podman/overrides.yml deleted file mode 100644 index fc90e22f4..000000000 --- a/environments/arcus/inventory/group_vars/podman/overrides.yml +++ /dev/null @@ -1 +0,0 @@ -podman_cidr: 192.168.1.0/24 # default podman network range clashes with stackhpc-ipv4-geneve-subnet From 0b04a663e7be002cb8fdadce34382788d56bbab3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 8 Mar 2022 19:15:24 +0000 Subject: [PATCH 123/133] set arcus CI to run only on push to main and PRs --- .github/workflows/arcus.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/arcus.yml b/.github/workflows/arcus.yml index c9099a95c..1142d3754 100644 --- a/.github/workflows/arcus.yml +++ b/.github/workflows/arcus.yml @@ -4,7 +4,6 @@ on: push: branches: - main - - ci/arcus pull_request: concurrency: rcp-cloud-portal_demo # openstack project jobs: From 0ced47dc5bd1f1d7c5b28641b4a7459c5accac11 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 9 Mar 2022 11:48:47 +0000 Subject: [PATCH 124/133] remove (broken) smslabs CI --- .github/workflows/smslabs.yml | 136 ---------------------------------- 1 file changed, 136 deletions(-) delete mode 100644 .github/workflows/smslabs.yml diff --git a/.github/workflows/smslabs.yml b/.github/workflows/smslabs.yml deleted file mode 100644 index aed4aa433..000000000 --- a/.github/workflows/smslabs.yml +++ /dev/null @@ -1,136 +0,0 @@ - -name: Test on OpenStack via smslabs -on: - push: - branches: - - main - pull_request: -concurrency: stackhpc-ci # openstack project -jobs: - openstack-example: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - - name: Setup ssh - run: | - set -x - mkdir ~/.ssh - echo "$SSH_KEY" > ~/.ssh/id_rsa - chmod 0600 ~/.ssh/id_rsa - env: - SSH_KEY: ${{ secrets.SSH_KEY }} - - - name: Add bastion's 
ssh key to known_hosts - run: cat environments/smslabs-example/bastion_fingerprint >> ~/.ssh/known_hosts - shell: bash - - - name: Install ansible etc - run: dev/setup-env.sh - - - name: Install terraform - uses: hashicorp/setup-terraform@v1 - - - name: Initialise terraform - run: terraform init - working-directory: ${{ github.workspace }}/environments/smslabs-example/terraform - - - name: Write clouds.yaml - run: | - mkdir -p ~/.config/openstack/ - echo "$CLOUDS_YAML" > ~/.config/openstack/clouds.yaml - shell: bash - env: - CLOUDS_YAML: ${{ secrets.CLOUDS_YAML }} - - - name: Provision infrastructure - id: provision - run: | - . venv/bin/activate - . environments/smslabs-example/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/terraform - terraform apply -auto-approve - env: - OS_CLOUD: openstack - TF_VAR_cluster_name: ci${{ github.run_id }} - - - name: Get server provisioning failure messages - id: provision_failure - run: | - . venv/bin/activate - . environments/smslabs-example/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/terraform - echo "::set-output name=messages::$(./getfaults.py)" - env: - OS_CLOUD: openstack - TF_VAR_cluster_name: ci${{ github.run_id }} - if: always() && steps.provision.outcome == 'failure' - - - name: Delete infrastructure if failed due to lack of hosts - run: | - . venv/bin/activate - . environments/smslabs-example/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/terraform - terraform destroy -auto-approve - env: - OS_CLOUD: openstack - TF_VAR_cluster_name: ci${{ github.run_id }} - if: ${{ always() && steps.provision.outcome == 'failure' && contains('not enough hosts available', steps.provision_failure.messages) }} - - - name: Configure infrastructure - run: | - . venv/bin/activate - . environments/smslabs-example/activate - ansible all -m wait_for_connection - ansible-playbook ansible/adhoc/generate-passwords.yml - ansible-playbook -vv ansible/site.yml - env: - ANSIBLE_FORCE_COLOR: True - - - name: Build login and compute images - run: | - . venv/bin/activate - . environments/smslabs-example/activate - cd packer - PACKER_LOG=1 PACKER_LOG_PATH=build.log packer build -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - env: - OS_CLOUD: openstack - - - name: Reimage compute nodes via slurm and check cluster still up - run: | - . venv/bin/activate - . environments/smslabs-example/activate - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-compute.yml - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml - env: - OS_CLOUD: openstack - - - name: Reimage login nodes via openstack and check cluster still up - run: | - . venv/bin/activate - . environments/smslabs-example/activate - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-login.yml - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml - env: - OS_CLOUD: openstack - - - name: Run MPI-based tests, triggering autoscaling - run: | - . venv/bin/activate - . environments/smslabs-example/activate - ansible-playbook -vv ansible/adhoc/hpctests.yml - env: - ANSIBLE_FORCE_COLOR: True - - - name: Delete infrastructure - run: | - . venv/bin/activate - . environments/smslabs-example/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/terraform - terraform destroy -auto-approve - env: - OS_CLOUD: openstack - TF_VAR_cluster_name: ci${{ github.run_id }} - if: ${{ success() || cancelled() }} - -# TODO: delete images! 
\ No newline at end of file From bd1e0ccac910a3b25661e512024ab05e7077a66b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 9 Mar 2022 14:52:10 +0000 Subject: [PATCH 125/133] rename smslabs environment --- environments/{smslabs-example => smslabs}/.gitignore | 0 environments/{smslabs-example => smslabs}/activate | 0 environments/{smslabs-example => smslabs}/ansible.cfg | 0 environments/{smslabs-example => smslabs}/bastion_fingerprint | 0 environments/{smslabs-example => smslabs}/builder.pkrvars.hcl | 0 environments/{smslabs-example => smslabs}/ci/reimage-compute.yml | 0 environments/{smslabs-example => smslabs}/ci/reimage-login.yml | 0 environments/{smslabs-example => smslabs}/hooks/post.yml | 0 .../inventory/group_vars/all/.gitkeep | 0 .../inventory/group_vars/all/bastion.yml | 0 .../inventory/group_vars/openhpc/overrides.yml | 0 .../inventory/group_vars/podman/overrides.yml | 0 environments/{smslabs-example => smslabs}/inventory/groups | 0 .../{smslabs-example => smslabs}/terraform/.terraform.lock.hcl | 0 environments/{smslabs-example => smslabs}/terraform/getfaults.py | 0 environments/{smslabs-example => smslabs}/terraform/inventory.tf | 0 environments/{smslabs-example => smslabs}/terraform/inventory.tpl | 0 environments/{smslabs-example => smslabs}/terraform/main.tf | 0 environments/{smslabs-example => smslabs}/terraform/network.tf | 0 environments/{smslabs-example => smslabs}/terraform/nodes.tf | 0 environments/{smslabs-example => smslabs}/terraform/variables.tf | 0 21 files changed, 0 insertions(+), 0 deletions(-) rename environments/{smslabs-example => smslabs}/.gitignore (100%) rename environments/{smslabs-example => smslabs}/activate (100%) rename environments/{smslabs-example => smslabs}/ansible.cfg (100%) rename environments/{smslabs-example => smslabs}/bastion_fingerprint (100%) rename environments/{smslabs-example => smslabs}/builder.pkrvars.hcl (100%) rename environments/{smslabs-example => smslabs}/ci/reimage-compute.yml (100%) rename environments/{smslabs-example => smslabs}/ci/reimage-login.yml (100%) rename environments/{smslabs-example => smslabs}/hooks/post.yml (100%) rename environments/{smslabs-example => smslabs}/inventory/group_vars/all/.gitkeep (100%) rename environments/{smslabs-example => smslabs}/inventory/group_vars/all/bastion.yml (100%) rename environments/{smslabs-example => smslabs}/inventory/group_vars/openhpc/overrides.yml (100%) rename environments/{smslabs-example => smslabs}/inventory/group_vars/podman/overrides.yml (100%) rename environments/{smslabs-example => smslabs}/inventory/groups (100%) rename environments/{smslabs-example => smslabs}/terraform/.terraform.lock.hcl (100%) rename environments/{smslabs-example => smslabs}/terraform/getfaults.py (100%) rename environments/{smslabs-example => smslabs}/terraform/inventory.tf (100%) rename environments/{smslabs-example => smslabs}/terraform/inventory.tpl (100%) rename environments/{smslabs-example => smslabs}/terraform/main.tf (100%) rename environments/{smslabs-example => smslabs}/terraform/network.tf (100%) rename environments/{smslabs-example => smslabs}/terraform/nodes.tf (100%) rename environments/{smslabs-example => smslabs}/terraform/variables.tf (100%) diff --git a/environments/smslabs-example/.gitignore b/environments/smslabs/.gitignore similarity index 100% rename from environments/smslabs-example/.gitignore rename to environments/smslabs/.gitignore diff --git a/environments/smslabs-example/activate b/environments/smslabs/activate similarity index 100% rename from 
environments/smslabs-example/activate rename to environments/smslabs/activate diff --git a/environments/smslabs-example/ansible.cfg b/environments/smslabs/ansible.cfg similarity index 100% rename from environments/smslabs-example/ansible.cfg rename to environments/smslabs/ansible.cfg diff --git a/environments/smslabs-example/bastion_fingerprint b/environments/smslabs/bastion_fingerprint similarity index 100% rename from environments/smslabs-example/bastion_fingerprint rename to environments/smslabs/bastion_fingerprint diff --git a/environments/smslabs-example/builder.pkrvars.hcl b/environments/smslabs/builder.pkrvars.hcl similarity index 100% rename from environments/smslabs-example/builder.pkrvars.hcl rename to environments/smslabs/builder.pkrvars.hcl diff --git a/environments/smslabs-example/ci/reimage-compute.yml b/environments/smslabs/ci/reimage-compute.yml similarity index 100% rename from environments/smslabs-example/ci/reimage-compute.yml rename to environments/smslabs/ci/reimage-compute.yml diff --git a/environments/smslabs-example/ci/reimage-login.yml b/environments/smslabs/ci/reimage-login.yml similarity index 100% rename from environments/smslabs-example/ci/reimage-login.yml rename to environments/smslabs/ci/reimage-login.yml diff --git a/environments/smslabs-example/hooks/post.yml b/environments/smslabs/hooks/post.yml similarity index 100% rename from environments/smslabs-example/hooks/post.yml rename to environments/smslabs/hooks/post.yml diff --git a/environments/smslabs-example/inventory/group_vars/all/.gitkeep b/environments/smslabs/inventory/group_vars/all/.gitkeep similarity index 100% rename from environments/smslabs-example/inventory/group_vars/all/.gitkeep rename to environments/smslabs/inventory/group_vars/all/.gitkeep diff --git a/environments/smslabs-example/inventory/group_vars/all/bastion.yml b/environments/smslabs/inventory/group_vars/all/bastion.yml similarity index 100% rename from environments/smslabs-example/inventory/group_vars/all/bastion.yml rename to environments/smslabs/inventory/group_vars/all/bastion.yml diff --git a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs/inventory/group_vars/openhpc/overrides.yml similarity index 100% rename from environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml rename to environments/smslabs/inventory/group_vars/openhpc/overrides.yml diff --git a/environments/smslabs-example/inventory/group_vars/podman/overrides.yml b/environments/smslabs/inventory/group_vars/podman/overrides.yml similarity index 100% rename from environments/smslabs-example/inventory/group_vars/podman/overrides.yml rename to environments/smslabs/inventory/group_vars/podman/overrides.yml diff --git a/environments/smslabs-example/inventory/groups b/environments/smslabs/inventory/groups similarity index 100% rename from environments/smslabs-example/inventory/groups rename to environments/smslabs/inventory/groups diff --git a/environments/smslabs-example/terraform/.terraform.lock.hcl b/environments/smslabs/terraform/.terraform.lock.hcl similarity index 100% rename from environments/smslabs-example/terraform/.terraform.lock.hcl rename to environments/smslabs/terraform/.terraform.lock.hcl diff --git a/environments/smslabs-example/terraform/getfaults.py b/environments/smslabs/terraform/getfaults.py similarity index 100% rename from environments/smslabs-example/terraform/getfaults.py rename to environments/smslabs/terraform/getfaults.py diff --git 
a/environments/smslabs-example/terraform/inventory.tf b/environments/smslabs/terraform/inventory.tf similarity index 100% rename from environments/smslabs-example/terraform/inventory.tf rename to environments/smslabs/terraform/inventory.tf diff --git a/environments/smslabs-example/terraform/inventory.tpl b/environments/smslabs/terraform/inventory.tpl similarity index 100% rename from environments/smslabs-example/terraform/inventory.tpl rename to environments/smslabs/terraform/inventory.tpl diff --git a/environments/smslabs-example/terraform/main.tf b/environments/smslabs/terraform/main.tf similarity index 100% rename from environments/smslabs-example/terraform/main.tf rename to environments/smslabs/terraform/main.tf diff --git a/environments/smslabs-example/terraform/network.tf b/environments/smslabs/terraform/network.tf similarity index 100% rename from environments/smslabs-example/terraform/network.tf rename to environments/smslabs/terraform/network.tf diff --git a/environments/smslabs-example/terraform/nodes.tf b/environments/smslabs/terraform/nodes.tf similarity index 100% rename from environments/smslabs-example/terraform/nodes.tf rename to environments/smslabs/terraform/nodes.tf diff --git a/environments/smslabs-example/terraform/variables.tf b/environments/smslabs/terraform/variables.tf similarity index 100% rename from environments/smslabs-example/terraform/variables.tf rename to environments/smslabs/terraform/variables.tf From 248cbe42e6c5269f233415182ca5ea5b3e4dd479 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 9 Mar 2022 14:53:57 +0000 Subject: [PATCH 126/133] fix additional openhpc config in smslabs environment --- environments/smslabs/inventory/group_vars/openhpc/overrides.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/smslabs/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs/inventory/group_vars/openhpc/overrides.yml index 3585ae073..4cf1e5bc1 100644 --- a/environments/smslabs/inventory/group_vars/openhpc/overrides.yml +++ b/environments/smslabs/inventory/group_vars/openhpc/overrides.yml @@ -1,4 +1,4 @@ -openhpc_config: +openhpc_config_extra: SlurmctldDebug: debug SlurmdDebug: debug openhpc_slurm_partitions: From b881529803c9d60364122478740e7e3bd56e73e7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 9 Mar 2022 15:29:44 +0000 Subject: [PATCH 127/133] smslabs terraform fixes for security group + clouds.yaml --- environments/smslabs/terraform/main.tf | 4 ++++ environments/smslabs/terraform/nodes.tf | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/environments/smslabs/terraform/main.tf b/environments/smslabs/terraform/main.tf index 49a84ffce..03beb0adc 100644 --- a/environments/smslabs/terraform/main.tf +++ b/environments/smslabs/terraform/main.tf @@ -6,3 +6,7 @@ terraform { } } } + +provider "openstack" { + cloud = "openstack" +} diff --git a/environments/smslabs/terraform/nodes.tf b/environments/smslabs/terraform/nodes.tf index 3bca7fb36..7d728e2c4 100644 --- a/environments/smslabs/terraform/nodes.tf +++ b/environments/smslabs/terraform/nodes.tf @@ -6,7 +6,7 @@ resource "openstack_compute_instance_v2" "control" { flavor_name = var.control_node.flavor key_pair = var.key_pair config_drive = true - security_groups = ["default", "ssh"] + security_groups = ["default", "SSH"] network { uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id # ensures nodes not created till subnet created @@ -24,7 +24,7 @@ resource "openstack_compute_instance_v2" "login" { flavor_name = 
each.value.flavor key_pair = var.key_pair config_drive = true - security_groups = ["default", "ssh"] + security_groups = ["default", "SSH"] network { uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id @@ -42,7 +42,7 @@ resource "openstack_compute_instance_v2" "compute" { flavor_name = var.compute_types[each.value].flavor key_pair = var.key_pair config_drive = true - security_groups = ["default", "ssh"] + security_groups = ["default", "SSH"] network { uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id From 9273f6d39e637fd251bed7680d7f63a84f1fc2fd Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 9 Mar 2022 16:30:04 +0000 Subject: [PATCH 128/133] align smslabs environment with arcus CI --- .../inventory/group_vars/all/pytools.yml | 0 environments/smslabs/hooks/check_slurm.yml | 21 +++++++++++++ environments/smslabs/hooks/post.yml | 27 +++++++++------- environments/smslabs/hooks/pre.yml | 31 +++++++++++++++++++ .../inventory/group_vars/all/squid.yml | 1 + .../group_vars/openhpc/overrides.yml | 10 ++++++ environments/smslabs/inventory/groups | 3 ++ 7 files changed, 82 insertions(+), 11 deletions(-) rename environments/{arcus => common}/inventory/group_vars/all/pytools.yml (100%) create mode 100644 environments/smslabs/hooks/check_slurm.yml create mode 100644 environments/smslabs/hooks/pre.yml create mode 100644 environments/smslabs/inventory/group_vars/all/squid.yml diff --git a/environments/arcus/inventory/group_vars/all/pytools.yml b/environments/common/inventory/group_vars/all/pytools.yml similarity index 100% rename from environments/arcus/inventory/group_vars/all/pytools.yml rename to environments/common/inventory/group_vars/all/pytools.yml diff --git a/environments/smslabs/hooks/check_slurm.yml b/environments/smslabs/hooks/check_slurm.yml new file mode 100644 index 000000000..b2ae67c7b --- /dev/null +++ b/environments/smslabs/hooks/check_slurm.yml @@ -0,0 +1,21 @@ +- name: Run sinfo + shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name + register: sinfo + changed_when: false + until: "'boot' not in sinfo.stdout_lines" + retries: 5 + delay: 10 +- name: Check nodes have expected slurm state + assert: + that: sinfo.stdout_lines == expected_sinfo + fail_msg: | + sinfo output not as expected: + actual: + {{ sinfo.stdout_lines }} + expected: + {{ expected_sinfo }} + + vars: + expected_sinfo: + - "{{ openhpc_cluster_name }}-compute-[0-1] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle" + - "{{ openhpc_cluster_name }}-compute-[2-3] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle~" diff --git a/environments/smslabs/hooks/post.yml b/environments/smslabs/hooks/post.yml index 68303c5cb..15878b796 100644 --- a/environments/smslabs/hooks/post.yml +++ b/environments/smslabs/hooks/post.yml @@ -1,14 +1,19 @@ -- hosts: login +- hosts: login:!builder # won't have a slurm control daemon when in build become: no gather_facts: false tasks: - - block: - - name: Run sinfo - shell: 'sinfo --noheader --format="%N %P %a %l %D %t"' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name - register: sinfo - changed_when: false - - name: Check nodes have expected slurm state - assert: - that: "(sinfo.stdout_lines[0] | split)[1:] == ['small*', 'up', '60-00:00:00', '2', 'idle']" # don't know what instance names are as have CI run ID in them - fail_msg: "sinfo output 
not as expected: {{ sinfo.stdout }}" - when: "'builder' not in group_names" # won't have a slurm control daemon when in build + - name: Check slurm up after direct deploy + import_tasks: check_slurm.yml + +- hosts: localhost + become: false + tags: build + tasks: + - name: Check Packer build finished + async_status: + jid: "{{ packer_run.ansible_job_id }}" + register: packer_result + until: packer_result.finished + retries: 30 # allow 15 mins + delay: 30 + when: packer_run is defined # allows rerunning post.yml diff --git a/environments/smslabs/hooks/pre.yml b/environments/smslabs/hooks/pre.yml new file mode 100644 index 000000000..2fb943528 --- /dev/null +++ b/environments/smslabs/hooks/pre.yml @@ -0,0 +1,31 @@ +- hosts: localhost + become: false + tags: build + tasks: + - name: Ensure secrets generated + include_role: + name: passwords + + - name: Build packer images + shell: + cmd: | + cd packer + PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl + chdir: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}" + when: "'builder' not in group_names" # avoid recursion! + register: packer_run + async: 2700 # 45 minutes + poll: 0 + +# For some reason squid shows TCP_MISS_ABORTED/200 on everything +# - hosts: all +# become: yes +# gather_facts: no +# tasks: +# - name: Configure dnf proxy +# community.general.ini_file: +# path: /etc/dnf/dnf.conf +# section: main +# option: proxy +# value: "{{ squid_proxy }}" +# no_extra_spaces: true diff --git a/environments/smslabs/inventory/group_vars/all/squid.yml b/environments/smslabs/inventory/group_vars/all/squid.yml new file mode 100644 index 000000000..8524b5843 --- /dev/null +++ b/environments/smslabs/inventory/group_vars/all/squid.yml @@ -0,0 +1 @@ +squid_proxy: http://10.20.2.12:3128 diff --git a/environments/smslabs/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs/inventory/group_vars/openhpc/overrides.yml index 4cf1e5bc1..86f14c3f4 100644 --- a/environments/smslabs/inventory/group_vars/openhpc/overrides.yml +++ b/environments/smslabs/inventory/group_vars/openhpc/overrides.yml @@ -3,3 +3,13 @@ openhpc_config_extra: SlurmdDebug: debug openhpc_slurm_partitions: - name: small + ram_mb: "{{ (3362 * 0.95) | int }}" # free --mebi * default openhpc_ram_multiplier + sockets: 1 + cores_per_socket: 1 + threads_per_core: 1 + cloud_nodes: "{{ openhpc_cluster_name }}-compute-[2-3]" + cloud_instances: + flavor: general.v1.tiny + image: "{{ openhpc_autoscale_image | default('IMAGE_PLACEHOLDER') }}" # Gets set by CI after image build task. + keypair: slurm-app-ci + network: "{{ hostvars[groups['control'] | first]['server_networks'].keys() | first }}" # Defined in inventory, so only defined for control during Packer build. 
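
The check_slurm.yml hook introduced above compares sorted sinfo output against an expected node list, where the trailing tilde on "idle~" marks a powered-down CLOUD node. The same check is easy to run by hand when debugging; a rough standalone equivalent (cluster name and partition values are examples only):

    import subprocess
    import time

    EXPECTED = sorted([
        "mycluster-compute-[0-1] small* up 60-00:00:00 2 idle",   # real instances, powered up
        "mycluster-compute-[2-3] small* up 60-00:00:00 2 idle~",  # CLOUD nodes, powered down
    ])

    def sinfo_lines():
        # %N=nodelist %P=partition %a=avail %l=timelimit %D=node count %t=compact state
        out = subprocess.run(["sinfo", "--noheader", "--format=%N %P %a %l %D %t"],
                             stdout=subprocess.PIPE, universal_newlines=True, check=True)
        return sorted(line.strip() for line in out.stdout.splitlines() if line.strip())

    def check(retries=5, delay=10):
        lines = sinfo_lines()
        for _ in range(retries):
            if not any("boot" in line for line in lines):  # wait out transient reboot states
                break
            time.sleep(delay)
            lines = sinfo_lines()
        assert lines == EXPECTED, "unexpected sinfo output: %s" % lines

    if __name__ == "__main__":
        check()
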
diff --git a/environments/smslabs/inventory/groups b/environments/smslabs/inventory/groups index c0c0903d1..a7172681c 100644 --- a/environments/smslabs/inventory/groups +++ b/environments/smslabs/inventory/groups @@ -35,3 +35,6 @@ compute [update:children] cluster + +[openstack_autoscale:children] +control From 8b8b00328ad4e17d18e3b33b56a1049c6fedeb10 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 9 Mar 2022 16:43:39 +0000 Subject: [PATCH 129/133] replicate arcus CI workflow on smslabs w/ shared ansible/ci/ plays --- .github/workflows/arcus.yml | 8 +- .github/workflows/smslabs.yml | 146 ++++++++++++++++++ .../arcus => ansible}/ci/delete_images.yml | 0 .../arcus => ansible}/ci/test_reimage.yml | 1 + .../ci/update_cloudnode_image.yml | 0 .../ci/wait_for_scaledown.yml | 0 environments/smslabs/ci/reimage-compute.yml | 37 ----- environments/smslabs/ci/reimage-login.yml | 23 --- 8 files changed, 151 insertions(+), 64 deletions(-) create mode 100644 .github/workflows/smslabs.yml rename {environments/arcus => ansible}/ci/delete_images.yml (100%) rename {environments/arcus => ansible}/ci/test_reimage.yml (95%) rename {environments/arcus => ansible}/ci/update_cloudnode_image.yml (100%) rename {environments/arcus => ansible}/ci/wait_for_scaledown.yml (100%) delete mode 100644 environments/smslabs/ci/reimage-compute.yml delete mode 100644 environments/smslabs/ci/reimage-login.yml diff --git a/.github/workflows/arcus.yml b/.github/workflows/arcus.yml index 1142d3754..582997db4 100644 --- a/.github/workflows/arcus.yml +++ b/.github/workflows/arcus.yml @@ -94,7 +94,7 @@ jobs: . venv/bin/activate . environments/arcus/activate ansible all -m wait_for_connection - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/test_reimage.yml + ansible-playbook -vv ansible/ci/test_reimage.yml env: OS_CLOUD: openstack ANSIBLE_FORCE_COLOR: True @@ -103,7 +103,7 @@ jobs: run: | . venv/bin/activate . environments/arcus/activate - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/update_cloudnode_image.yml + ansible-playbook -vv ansible/ci/update_cloudnode_image.yml ansible-playbook -vv ansible/slurm.yml --tags openhpc --skip-tags install env: ANSIBLE_FORCE_COLOR: True @@ -121,7 +121,7 @@ jobs: run: | . venv/bin/activate . environments/arcus/activate - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/wait_for_scaledown.yml + ansible-playbook -vv ansible/ci/wait_for_scaledown.yml env: OS_CLOUD: openstack ANSIBLE_FORCE_COLOR: True @@ -140,7 +140,7 @@ jobs: run: | . venv/bin/activate . 
environments/arcus/activate - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/delete_images.yml + ansible-playbook -vv ansible/ci/delete_images.yml env: OS_CLOUD: openstack ANSIBLE_FORCE_COLOR: True diff --git a/.github/workflows/smslabs.yml b/.github/workflows/smslabs.yml new file mode 100644 index 000000000..da6a64005 --- /dev/null +++ b/.github/workflows/smslabs.yml @@ -0,0 +1,146 @@ + +name: Test on SMS-Labs OpenStack in stackhpc-ci +on: + push: + branches: + - main + pull_request: +concurrency: stackhpc-ci # openstack project +jobs: + smslabs: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + + - name: Setup ssh + run: | + set -x + mkdir ~/.ssh + echo "$SSH_KEY" > ~/.ssh/id_rsa + chmod 0600 ~/.ssh/id_rsa + env: + SSH_KEY: ${{ secrets.SSH_KEY }} + + - name: Add bastion's ssh key to known_hosts + run: cat environments/smslabs/bastion_fingerprint >> ~/.ssh/known_hosts + shell: bash + + - name: Install ansible etc + run: dev/setup-env.sh + + - name: Install terraform + uses: hashicorp/setup-terraform@v1 + + - name: Initialise terraform + run: terraform init + working-directory: ${{ github.workspace }}/environments/smslabs/terraform + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "$CLOUDS_YAML" > ~/.config/openstack/clouds.yaml + shell: bash + env: + CLOUDS_YAML: ${{ secrets.CLOUDS_YAML }} + + - name: Provision infrastructure + id: provision + run: | + . venv/bin/activate + . environments/smslabs/activate + cd $APPLIANCES_ENVIRONMENT_ROOT/terraform + terraform apply -auto-approve + env: + OS_CLOUD: openstack + TF_VAR_cluster_name: ci${{ github.run_id }} + + - name: Get server provisioning failure messages + id: provision_failure + run: | + . venv/bin/activate + . environments/smslabs/activate + cd $APPLIANCES_ENVIRONMENT_ROOT/terraform + echo "::set-output name=messages::$(./getfaults.py)" + env: + OS_CLOUD: openstack + TF_VAR_cluster_name: ci${{ github.run_id }} + if: always() && steps.provision.outcome == 'failure' + + - name: Delete infrastructure if failed due to lack of hosts + run: | + . venv/bin/activate + . environments/smslabs/activate + cd $APPLIANCES_ENVIRONMENT_ROOT/terraform + terraform destroy -auto-approve + env: + OS_CLOUD: openstack + TF_VAR_cluster_name: ci${{ github.run_id }} + if: ${{ always() && steps.provision.outcome == 'failure' && contains('not enough hosts available', steps.provision_failure.messages) }} + + - name: Directly configure cluster and build compute + login images + # see pre-hook for the image build + run: | + . venv/bin/activate + . environments/smslabs/activate + ansible all -m wait_for_connection + ansible-playbook ansible/adhoc/generate-passwords.yml + ansible-playbook -vv ansible/site.yml + env: + OS_CLOUD: openstack + ANSIBLE_FORCE_COLOR: True + + - name: Test reimage of login and compute nodes + run: | + . venv/bin/activate + . environments/smslabs/activate + ansible all -m wait_for_connection + ansible-playbook -vv ansible/ci/test_reimage.yml + env: + OS_CLOUD: openstack + ANSIBLE_FORCE_COLOR: True + + - name: Update cloud image and reconfigure Slurm + run: | + . venv/bin/activate + . environments/smslabs/activate + ansible-playbook -vv ansible/ci/update_cloudnode_image.yml + ansible-playbook -vv ansible/slurm.yml --tags openhpc --skip-tags install + env: + ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack + + - name: Run MPI-based tests (triggers autoscaling) + run: | + . venv/bin/activate + . 
environments/smslabs/activate + ansible-playbook -vv ansible/adhoc/hpctests.yml + env: + ANSIBLE_FORCE_COLOR: True + + - name: Wait for CLOUD nodes to be destroyed + run: | + . venv/bin/activate + . environments/smslabs/activate + ansible-playbook -vv ansible/ci/wait_for_scaledown.yml + env: + OS_CLOUD: openstack + ANSIBLE_FORCE_COLOR: True + + - name: Delete infrastructure + run: | + . venv/bin/activate + . environments/smslabs/activate + cd $APPLIANCES_ENVIRONMENT_ROOT/terraform + terraform destroy -auto-approve + env: + TF_VAR_cluster_name: ci${{ github.run_id }} + if: ${{ success() || cancelled() }} + + - name: Delete images + run: | + . venv/bin/activate + . environments/smslabs/activate + ansible-playbook -vv ansible/ci/delete_images.yml + env: + OS_CLOUD: openstack + ANSIBLE_FORCE_COLOR: True diff --git a/environments/arcus/ci/delete_images.yml b/ansible/ci/delete_images.yml similarity index 100% rename from environments/arcus/ci/delete_images.yml rename to ansible/ci/delete_images.yml diff --git a/environments/arcus/ci/test_reimage.yml b/ansible/ci/test_reimage.yml similarity index 95% rename from environments/arcus/ci/test_reimage.yml rename to ansible/ci/test_reimage.yml index ae459d1a9..046ff8115 100644 --- a/environments/arcus/ci/test_reimage.yml +++ b/ansible/ci/test_reimage.yml @@ -35,6 +35,7 @@ - name: Check slurm up after reimaging login node import_tasks: ../hooks/check_slurm.yml + # TODO: This is specific to smslabs/arcus environment config - could generalise to all compute nodes - name: Request compute node rebuild via Slurm shell: cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1] diff --git a/environments/arcus/ci/update_cloudnode_image.yml b/ansible/ci/update_cloudnode_image.yml similarity index 100% rename from environments/arcus/ci/update_cloudnode_image.yml rename to ansible/ci/update_cloudnode_image.yml diff --git a/environments/arcus/ci/wait_for_scaledown.yml b/ansible/ci/wait_for_scaledown.yml similarity index 100% rename from environments/arcus/ci/wait_for_scaledown.yml rename to ansible/ci/wait_for_scaledown.yml diff --git a/environments/smslabs/ci/reimage-compute.yml b/environments/smslabs/ci/reimage-compute.yml deleted file mode 100644 index 3efa4e47c..000000000 --- a/environments/smslabs/ci/reimage-compute.yml +++ /dev/null @@ -1,37 +0,0 @@ -# Reimage compute nodes via Slurm with latest packer-build images - -- hosts: login[0] - become: no - tasks: - - name: Read packer build manifest - set_fact: - manifest: "{{ lookup('file', manifest_path) | from_json }}" - vars: - manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" - delegate_to: localhost - - - name: Get latest compute image build - set_fact: - compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" - - - name: Request compute node rebuild via Slurm - shell: - cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1] - become: true - - - name: Check compute node rebuild completed - shell: - cmd: openstack server show {{ item }} --format value -c image - register: openstack_server - loop: "{{ groups['compute'] }}" - retries: 5 - delay: 30 - until: compute_build.artifact_id in openstack_server.stdout - delegate_to: localhost - -- hosts: compute - become: no - gather_facts: no - tasks: - - name: Wait for nodes to boot - wait_for_connection: diff --git 
a/environments/smslabs/ci/reimage-login.yml b/environments/smslabs/ci/reimage-login.yml deleted file mode 100644 index f76f6e8d0..000000000 --- a/environments/smslabs/ci/reimage-login.yml +++ /dev/null @@ -1,23 +0,0 @@ -# Reimage login nodes via OpenStack - -- hosts: login - become: no - tasks: - - name: Read packer build manifest - set_fact: - manifest: "{{ lookup('file', manifest_path) | from_json }}" - vars: - manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" - delegate_to: localhost - - - name: Get latest login image build - set_fact: - login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}" - - - name: Reimage node via openstack - shell: - cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ login_build.artifact_id }}" - delegate_to: localhost - - - name: Wait for connection - wait_for_connection: From 5176449eee8b91caa02d9e16277e46f67ecfd1d3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 9 Mar 2022 17:13:45 +0000 Subject: [PATCH 130/133] move check_slurm.yml path --- ansible/ci/test_reimage.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/ci/test_reimage.yml b/ansible/ci/test_reimage.yml index 046ff8115..3470b132e 100644 --- a/ansible/ci/test_reimage.yml +++ b/ansible/ci/test_reimage.yml @@ -33,7 +33,7 @@ timeout: 800 - name: Check slurm up after reimaging login node - import_tasks: ../hooks/check_slurm.yml + import_tasks: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hooks/check_slurm.yml" # TODO: This is specific to smslabs/arcus environment config - could generalise to all compute nodes - name: Request compute node rebuild via Slurm @@ -61,5 +61,5 @@ timeout: 800 - name: Check slurm up after reimaging login node - import_tasks: ../hooks/check_slurm.yml + import_tasks: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hooks/check_slurm.yml" run_once: true From 90262b0e7b2b54e14a6142b43ea68df1be84aee7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 22 Mar 2022 12:04:30 +0000 Subject: [PATCH 131/133] add environment path to instance metadata --- environments/arcus/terraform/nodes.tf | 12 ++++++++++++ environments/arcus/terraform/variables.tf | 5 +++++ environments/smslabs/terraform/nodes.tf | 12 ++++++++++++ environments/smslabs/terraform/variables.tf | 5 +++++ 4 files changed, 34 insertions(+) diff --git a/environments/arcus/terraform/nodes.tf b/environments/arcus/terraform/nodes.tf index db4f4762d..3e4287352 100644 --- a/environments/arcus/terraform/nodes.tf +++ b/environments/arcus/terraform/nodes.tf @@ -13,6 +13,10 @@ resource "openstack_compute_instance_v2" "control" { access_network = true } + metadata = { + environment_root = var.environment_root + } + } resource "openstack_compute_instance_v2" "login" { @@ -31,6 +35,10 @@ resource "openstack_compute_instance_v2" "login" { access_network = true } + metadata = { + environment_root = var.environment_root + } + } resource "openstack_compute_instance_v2" "compute" { @@ -49,4 +57,8 @@ resource "openstack_compute_instance_v2" "compute" { access_network = true } + metadata = { + environment_root = var.environment_root + } + } diff --git a/environments/arcus/terraform/variables.tf b/environments/arcus/terraform/variables.tf index 45e0b52d0..4e5316493 100644 --- a/environments/arcus/terraform/variables.tf +++ b/environments/arcus/terraform/variables.tf @@ -75,3 +75,8 @@ variable "compute_images" { default = {} description = "Mapping to override compute images 
from compute_types: key ->(str) node name, value -> (str) image name" } + +variable "environment_root" { + type = string + description = "Path to environment root, automatically set by activate script" +} diff --git a/environments/smslabs/terraform/nodes.tf b/environments/smslabs/terraform/nodes.tf index 7d728e2c4..832876e58 100644 --- a/environments/smslabs/terraform/nodes.tf +++ b/environments/smslabs/terraform/nodes.tf @@ -13,6 +13,10 @@ resource "openstack_compute_instance_v2" "control" { access_network = true } + metadata = { + environment_root = var.environment_root + } + } resource "openstack_compute_instance_v2" "login" { @@ -31,6 +35,10 @@ resource "openstack_compute_instance_v2" "login" { access_network = true } + metadata = { + environment_root = var.environment_root + } + } resource "openstack_compute_instance_v2" "compute" { @@ -49,4 +57,8 @@ resource "openstack_compute_instance_v2" "compute" { access_network = true } + metadata = { + environment_root = var.environment_root + } + } diff --git a/environments/smslabs/terraform/variables.tf b/environments/smslabs/terraform/variables.tf index b6e82e90a..3a42a8d7f 100644 --- a/environments/smslabs/terraform/variables.tf +++ b/environments/smslabs/terraform/variables.tf @@ -66,3 +66,8 @@ variable "compute_images" { default = {} description = "Mapping to override compute images from compute_types: key ->(str) node name, value -> (str) image name" } + +variable "environment_root" { + type = string + description = "Path to environment root, automatically set by activate script" +} From c88e9063045f12a82e0095e27b0fb2bd517026bf Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 22 Mar 2022 12:04:58 +0000 Subject: [PATCH 132/133] fix slurm /home/slurm-app-ci owner to permit pip upgrades on slurm-openstack-tools --- ansible/slurm.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index a60918070..642bf4854 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -32,6 +32,12 @@ name: stackhpc.openhpc tasks_from: install.yml when: groups.get('openstack_autoscale', []) | length > 0 + - name: Fix slurm directory owner + file: + path: /etc/slurm + state: directory + owner: slurm + group: slurm - name: Setup autoscaling on OpenStack hosts: openstack_autoscale From ca640ffecce0e0ddfe54f3c206d10730200d980b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 22 Mar 2022 12:21:05 +0000 Subject: [PATCH 133/133] simplify smslabs test user handling --- .github/workflows/smslabs.yml | 5 +---- .../smslabs/inventory/group_vars/basic_users/overrides.yml | 2 -- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/smslabs.yml b/.github/workflows/smslabs.yml index 977c1b65a..5ff2eda9f 100644 --- a/.github/workflows/smslabs.yml +++ b/.github/workflows/smslabs.yml @@ -84,6 +84,7 @@ jobs: . 
environments/smslabs/activate ansible all -m wait_for_connection ansible-playbook ansible/adhoc/generate-passwords.yml + echo test_user_password: "$TEST_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/basic_users/defaults.yml ansible-playbook -vv ansible/site.yml env: OS_CLOUD: openstack @@ -99,7 +100,6 @@ jobs: env: OS_CLOUD: openstack ANSIBLE_FORCE_COLOR: True - TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - name: Update cloud image and reconfigure Slurm run: | @@ -110,7 +110,6 @@ jobs: env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - name: Run MPI-based tests (triggers autoscaling) run: | @@ -120,7 +119,6 @@ jobs: env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - name: Wait for CLOUD nodes to be destroyed run: | @@ -130,7 +128,6 @@ jobs: env: OS_CLOUD: openstack ANSIBLE_FORCE_COLOR: True - TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - name: Delete infrastructure run: | diff --git a/environments/smslabs/inventory/group_vars/basic_users/overrides.yml b/environments/smslabs/inventory/group_vars/basic_users/overrides.yml index 32fdd2af7..312c3f03c 100644 --- a/environments/smslabs/inventory/group_vars/basic_users/overrides.yml +++ b/environments/smslabs/inventory/group_vars/basic_users/overrides.yml @@ -1,5 +1,3 @@ -test_user_password: "{{ lookup('env', 'TEST_USER_PASSWORD') | default(vault_testuser_password, true) }}" # CI uses env, debug can set vault_testuser_password - basic_users_users: - name: testuser # can't use rocky as $HOME isn't shared! password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent
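
The environment_root instance metadata added in the Terraform changes earlier in this series is exposed inside each instance by the OpenStack metadata service, under the "meta" key of meta_data.json. How the appliance consumes that value is not shown in these patches, so the reader below is only a sketch of one way an instance-side script could recover its environment path at boot:

    import json
    import urllib.request

    METADATA_URL = "http://169.254.169.254/openstack/latest/meta_data.json"

    def environment_root():
        with urllib.request.urlopen(METADATA_URL, timeout=5) as resp:
            meta_data = json.load(resp)
        # User-supplied server metadata (the Terraform "metadata" map) appears under "meta"
        return meta_data.get("meta", {}).get("environment_root")

    if __name__ == "__main__":
        print(environment_root())
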