diff --git a/.github/workflows/arcus.yml b/.github/workflows/arcus.yml new file mode 100644 index 000000000..582997db4 --- /dev/null +++ b/.github/workflows/arcus.yml @@ -0,0 +1,146 @@ + +name: Test on Arcus OpenStack in rcp-cloud-portal-demo +on: + push: + branches: + - main + pull_request: +concurrency: rcp-cloud-portal_demo # openstack project +jobs: + arcus: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + + - name: Setup ssh + run: | + set -x + mkdir ~/.ssh + echo "$SSH_KEY" > ~/.ssh/id_rsa + chmod 0600 ~/.ssh/id_rsa + env: + SSH_KEY: ${{ secrets.ARCUS_SSH_KEY }} + + - name: Add bastion's ssh key to known_hosts + run: cat environments/arcus/bastion_fingerprint >> ~/.ssh/known_hosts + shell: bash + + - name: Install ansible etc + run: dev/setup-env.sh + + - name: Install terraform + uses: hashicorp/setup-terraform@v1 + + - name: Initialise terraform + run: terraform init + working-directory: ${{ github.workspace }}/environments/arcus/terraform + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "$CLOUDS_YAML" > ~/.config/openstack/clouds.yaml + shell: bash + env: + CLOUDS_YAML: ${{ secrets.ARCUS_CLOUDS_YAML }} + + - name: Provision infrastructure + id: provision + run: | + . venv/bin/activate + . environments/arcus/activate + cd $APPLIANCES_ENVIRONMENT_ROOT/terraform + terraform apply -auto-approve + env: + OS_CLOUD: openstack + TF_VAR_cluster_name: ci${{ github.run_id }} + + - name: Get server provisioning failure messages + id: provision_failure + run: | + . venv/bin/activate + . environments/arcus/activate + cd $APPLIANCES_ENVIRONMENT_ROOT/terraform + echo "::set-output name=messages::$(./getfaults.py)" + env: + OS_CLOUD: openstack + TF_VAR_cluster_name: ci${{ github.run_id }} + if: always() && steps.provision.outcome == 'failure' + + - name: Delete infrastructure if failed due to lack of hosts + run: | + . venv/bin/activate + . 
environments/arcus/activate + cd $APPLIANCES_ENVIRONMENT_ROOT/terraform + terraform destroy -auto-approve + env: + OS_CLOUD: openstack + TF_VAR_cluster_name: ci${{ github.run_id }} + if: ${{ always() && steps.provision.outcome == 'failure' && contains(steps.provision_failure.outputs.messages, 'not enough hosts available') }} # contains(search, item); step outputs are read via .outputs + + - name: Directly configure cluster and build compute + login images + # see pre-hook for the image build + run: | + . venv/bin/activate + . environments/arcus/activate + ansible all -m wait_for_connection + ansible-playbook ansible/adhoc/generate-passwords.yml + ansible-playbook -vv ansible/site.yml + env: + OS_CLOUD: openstack + ANSIBLE_FORCE_COLOR: True + + - name: Test reimage of login and compute nodes + run: | + . venv/bin/activate + . environments/arcus/activate + ansible all -m wait_for_connection + ansible-playbook -vv ansible/ci/test_reimage.yml + env: + OS_CLOUD: openstack + ANSIBLE_FORCE_COLOR: True + + - name: Update cloud image and reconfigure Slurm + run: | + . venv/bin/activate + . environments/arcus/activate + ansible-playbook -vv ansible/ci/update_cloudnode_image.yml + ansible-playbook -vv ansible/slurm.yml --tags openhpc --skip-tags install + env: + ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack + + - name: Run MPI-based tests (triggers autoscaling) + run: | + . venv/bin/activate + . environments/arcus/activate + ansible-playbook -vv ansible/adhoc/hpctests.yml + env: + ANSIBLE_FORCE_COLOR: True + + - name: Wait for CLOUD nodes to be destroyed + run: | + . venv/bin/activate + . environments/arcus/activate + ansible-playbook -vv ansible/ci/wait_for_scaledown.yml + env: + OS_CLOUD: openstack + ANSIBLE_FORCE_COLOR: True + + - name: Delete infrastructure + run: | + . venv/bin/activate + . environments/arcus/activate + cd $APPLIANCES_ENVIRONMENT_ROOT/terraform + terraform destroy -auto-approve + env: + TF_VAR_cluster_name: ci${{ github.run_id }} + if: ${{ success() || cancelled() }} + + - name: Delete images + run: | + . 
venv/bin/activate + . environments/arcus/activate + ansible-playbook -vv ansible/ci/delete_images.yml + env: + OS_CLOUD: openstack + ANSIBLE_FORCE_COLOR: True diff --git a/.github/workflows/smslabs.yml b/.github/workflows/smslabs.yml index 7a2c8ed80..5ff2eda9f 100644 --- a/.github/workflows/smslabs.yml +++ b/.github/workflows/smslabs.yml @@ -1,5 +1,5 @@ -name: Test on OpenStack via smslabs +name: Test on SMS-Labs OpenStack in stackhpc-ci on: push: branches: @@ -7,7 +7,7 @@ on: pull_request: concurrency: stackhpc-ci # openstack project jobs: - openstack-example: + smslabs: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 @@ -22,7 +22,7 @@ jobs: SSH_KEY: ${{ secrets.SSH_KEY }} - name: Add bastion's ssh key to known_hosts - run: cat environments/smslabs-example/bastion_fingerprint >> ~/.ssh/known_hosts + run: cat environments/smslabs/bastion_fingerprint >> ~/.ssh/known_hosts shell: bash - name: Install ansible etc @@ -33,7 +33,7 @@ jobs: - name: Initialise terraform run: terraform init - working-directory: ${{ github.workspace }}/environments/smslabs-example/terraform + working-directory: ${{ github.workspace }}/environments/smslabs/terraform - name: Write clouds.yaml run: | @@ -47,7 +47,7 @@ jobs: id: provision run: | . venv/bin/activate - . environments/smslabs-example/activate + . environments/smslabs/activate cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform apply -auto-approve env: @@ -58,7 +58,7 @@ jobs: id: provision_failure run: | . venv/bin/activate - . environments/smslabs-example/activate + . environments/smslabs/activate cd $APPLIANCES_ENVIRONMENT_ROOT/terraform echo "::set-output name=messages::$(./getfaults.py)" env: @@ -69,7 +69,7 @@ jobs: - name: Delete infrastructure if failed due to lack of hosts run: | . venv/bin/activate - . environments/smslabs-example/activate + . 
environments/smslabs/activate cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform destroy -auto-approve env: @@ -77,63 +77,73 @@ jobs: TF_VAR_cluster_name: ci${{ github.run_id }} - if: ${{ always() && steps.provision.outcome == 'failure' && contains('not enough hosts available', steps.provision_failure.messages) }} + if: ${{ always() && steps.provision.outcome == 'failure' && contains(steps.provision_failure.outputs.messages, 'not enough hosts available') }} # contains(search, item); step outputs are read via .outputs - - name: Configure infrastructure + - name: Directly configure cluster and build compute + login images + # see pre-hook for the image build run: | . venv/bin/activate - . environments/smslabs-example/activate + . environments/smslabs/activate ansible all -m wait_for_connection ansible-playbook ansible/adhoc/generate-passwords.yml + echo test_user_password: "$TEST_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/basic_users/defaults.yml ansible-playbook -vv ansible/site.yml env: + OS_CLOUD: openstack ANSIBLE_FORCE_COLOR: True TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - - name: Run MPI-based tests + - name: Test reimage of login and compute nodes run: | . venv/bin/activate - . environments/smslabs-example/activate - ansible-playbook -vv ansible/adhoc/hpctests.yml + . environments/smslabs/activate + ansible all -m wait_for_connection + ansible-playbook -vv ansible/ci/test_reimage.yml env: + OS_CLOUD: openstack ANSIBLE_FORCE_COLOR: True - TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - - name: Build control and compute images + - name: Update cloud image and reconfigure Slurm run: | . venv/bin/activate - . environments/smslabs-example/activate - cd packer - PACKER_LOG=1 PACKER_LOG_PATH=build.log packer build -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl + . 
environments/smslabs/activate + ansible-playbook -vv ansible/ci/update_cloudnode_image.yml + ansible-playbook -vv ansible/slurm.yml --tags openhpc --skip-tags install env: + ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - - name: Reimage compute nodes via slurm and check cluster still up + - name: Run MPI-based tests (triggers autoscaling) run: | . venv/bin/activate - . environments/smslabs-example/activate - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-compute.yml - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml + . environments/smslabs/activate + ansible-playbook -vv ansible/adhoc/hpctests.yml env: + ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - - name: Reimage login nodes via openstack and check cluster still up + - name: Wait for CLOUD nodes to be destroyed run: | . venv/bin/activate - . environments/smslabs-example/activate - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/ci/reimage-login.yml - ansible-playbook -vv $APPLIANCES_ENVIRONMENT_ROOT/hooks/post.yml + . environments/smslabs/activate + ansible-playbook -vv ansible/ci/wait_for_scaledown.yml env: OS_CLOUD: openstack - TEST_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} + ANSIBLE_FORCE_COLOR: True - name: Delete infrastructure run: | . venv/bin/activate - . environments/smslabs-example/activate + . environments/smslabs/activate cd $APPLIANCES_ENVIRONMENT_ROOT/terraform terraform destroy -auto-approve env: - OS_CLOUD: openstack TF_VAR_cluster_name: ci${{ github.run_id }} if: ${{ success() || cancelled() }} + + - name: Delete images + run: | + . venv/bin/activate + . 
environments/smslabs/activate + ansible-playbook -vv ansible/ci/delete_images.yml + env: + OS_CLOUD: openstack + ANSIBLE_FORCE_COLOR: True diff --git a/ansible/adhoc/restart-slurm.yml b/ansible/adhoc/restart-slurm.yml index 41b9dcb50..cf523ddee 100644 --- a/ansible/adhoc/restart-slurm.yml +++ b/ansible/adhoc/restart-slurm.yml @@ -20,7 +20,7 @@ name: slurmctld state: restarted -- hosts: compute,login +- hosts: compute,login # FIXME: doesn't work if using `login` as combined slurmctld become: yes gather_facts: no tasks: diff --git a/environments/smslabs-example/ci/reimage-login.yml b/ansible/ci/delete_images.yml similarity index 55% rename from environments/smslabs-example/ci/reimage-login.yml rename to ansible/ci/delete_images.yml index f76f6e8d0..133e7d0a7 100644 --- a/environments/smslabs-example/ci/reimage-login.yml +++ b/ansible/ci/delete_images.yml @@ -1,7 +1,6 @@ -# Reimage login nodes via OpenStack - -- hosts: login +- hosts: login:!builder become: no + gather_facts: no tasks: - name: Read packer build manifest set_fact: @@ -9,15 +8,16 @@ vars: manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" delegate_to: localhost - - - name: Get latest login image build + + - name: Get latest image builds set_fact: login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}" + compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" - - name: Reimage node via openstack + - name: Delete images shell: - cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ login_build.artifact_id }}" + cmd: | + openstack image delete {{ login_build.artifact_id }} + openstack image delete {{ compute_build.artifact_id }} delegate_to: localhost - - - name: Wait for connection - wait_for_connection: + \ No newline at end of file diff --git a/ansible/ci/test_reimage.yml b/ansible/ci/test_reimage.yml new file mode 100644 index 
000000000..3470b132e --- /dev/null +++ b/ansible/ci/test_reimage.yml @@ -0,0 +1,65 @@ +- hosts: login:!builder + become: no + tasks: + - name: Read packer build manifest + set_fact: + manifest: "{{ lookup('file', manifest_path) | from_json }}" + vars: + manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" + delegate_to: localhost + + - name: Get latest image builds + set_fact: + login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}" + compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" + + - name: Reimage login node via openstack + shell: + cmd: "openstack server rebuild {{ instance_id | default(inventory_hostname) }} --image {{ login_build.artifact_id }}" + delegate_to: localhost + + - name: Check login node rebuild completed + shell: + cmd: openstack server show {{ instance_id | default(inventory_hostname) }} --format value -c image # query the same server identifier the rebuild task targeted + register: openstack_login + delegate_to: localhost + retries: 5 + delay: 30 + until: login_build.artifact_id in openstack_login.stdout + changed_when: false + + - name: Wait for login connection + wait_for_connection: + timeout: 800 + + - name: Check slurm up after reimaging login node + import_tasks: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hooks/check_slurm.yml" + + # TODO: This is specific to smslabs/arcus environment config - could generalise to all compute nodes + - name: Request compute node rebuild via Slurm + shell: + cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1] + become: yes + + - name: Check compute node rebuild completed + shell: + cmd: openstack server show {{ item }} --format value -c image + register: openstack_compute + delegate_to: localhost + loop: "{{ groups['compute'] }}" + retries: 5 + delay: 30 + until: compute_build.artifact_id in openstack_compute.stdout + changed_when: false + +- hosts: 
compute:!builder + become: no + gather_facts: no + tasks: + - name: Wait for compute connection + wait_for_connection: + timeout: 800 + + - name: Check slurm up after reimaging compute nodes + import_tasks: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hooks/check_slurm.yml" + run_once: true diff --git a/ansible/ci/update_cloudnode_image.yml b/ansible/ci/update_cloudnode_image.yml new file mode 100644 index 000000000..a171d4de8 --- /dev/null +++ b/ansible/ci/update_cloudnode_image.yml @@ -0,0 +1,22 @@ +- hosts: localhost + become: no + tasks: + - name: Read packer build manifest + set_fact: + manifest: "{{ lookup('file', manifest_path) | from_json }}" + vars: + manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" + delegate_to: localhost + + - name: Get latest image builds + set_fact: + login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}" + compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" + + - name: Add compute image ID to autoscale definition (for later autoscaling tests) + copy: + dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/openhpc/autoscale.yml" + content: | + openhpc_autoscale_image: {{ compute_build.artifact_id }} + delegate_to: localhost + run_once: true diff --git a/ansible/ci/wait_for_scaledown.yml b/ansible/ci/wait_for_scaledown.yml new file mode 100644 index 000000000..cb1c51a47 --- /dev/null +++ b/ansible/ci/wait_for_scaledown.yml @@ -0,0 +1,28 @@ +- hosts: login:!builder + become: no + gather_facts: no + tasks: + - name: List CLOUD-state nodes + shell: + cmd: sinfo --noheader --Node --Format NodeList -t CLOUD + register: sinfo_cloudnodes + changed_when: false + + - name: Get SuspendTime + shell: + cmd: scontrol show config | grep '^SuspendTime ' + register: suspendtime + + - name: Wait for SuspendTime + pause: + seconds: "{{ suspendtime.stdout.split()[2] }}" + + - name: Wait 
for CLOUD nodes to be destroyed + shell: + cmd: "openstack server list -f value -c Name" + changed_when: false + delegate_to: localhost + register: openstack_servers + until: "sinfo_cloudnodes.stdout_lines | map('trim') | intersect(openstack_servers.stdout_lines) | length == 0" # cloud nodes aren't found in openstack_servers + retries: 10 + delay: 30 diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 39fbd254c..642bf4854 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -9,24 +9,60 @@ - include_role: name: geerlingguy.mysql -- name: Setup slurm +- name: Setup Slurm-driven reimage on OpenStack + hosts: rebuild + become: yes + tags: + - rebuild + - openhpc + tasks: + - import_role: + name: stackhpc.slurm_openstack_tools.rebuild + +- name: Preinstall Slurm packages to create slurm user + # This is an optimisation for speed as it avoids having to do this once for `control` then again for `openhpc` nodes. hosts: openhpc become: yes tags: + - openstack_autoscale - openhpc + - install tasks: - import_role: name: stackhpc.openhpc + tasks_from: install.yml + when: groups.get('openstack_autoscale', []) | length > 0 + - name: Fix slurm directory owner + file: + path: /etc/slurm + state: directory + owner: slurm + group: slurm -- name: Setup slurm-driven reimage - hosts: rebuild +- name: Setup autoscaling on OpenStack + hosts: openstack_autoscale become: yes tags: - - rebuild + - openstack_autoscale - openhpc tasks: - import_role: - name: stackhpc.slurm_openstack_tools.rebuild + name: stackhpc.slurm_openstack_tools.autoscale + +- name: Setup slurm + hosts: openhpc + become: yes + tags: + - openhpc + tasks: + - assert: + that: "'enable_configless' in openhpc_config.SlurmctldParameters | default([])" + fail_msg: | + 'enable_configless' not found in openhpc_config.SlurmctldParameters - is variable openhpc_config overridden? + Additional slurm.conf parameters should be provided using variable openhpc_config_extra. 
+ success_msg: Checked Slurm will be configured for configless operation + - import_role: + name: stackhpc.openhpc - name: Set locked memory limits on user-facing nodes hosts: diff --git a/environments/smslabs-example/.gitignore b/environments/arcus/.gitignore similarity index 100% rename from environments/smslabs-example/.gitignore rename to environments/arcus/.gitignore diff --git a/environments/smslabs-example/activate b/environments/arcus/activate similarity index 100% rename from environments/smslabs-example/activate rename to environments/arcus/activate diff --git a/environments/smslabs-example/ansible.cfg b/environments/arcus/ansible.cfg similarity index 100% rename from environments/smslabs-example/ansible.cfg rename to environments/arcus/ansible.cfg diff --git a/environments/arcus/bastion_fingerprint b/environments/arcus/bastion_fingerprint new file mode 100644 index 000000000..713026452 --- /dev/null +++ b/environments/arcus/bastion_fingerprint @@ -0,0 +1,3 @@ +|1|BwhEZQPqvZcdf9Phmh2mTPmIivU=|bHi1Nf8dYI8z1C+qsqQFPAty1xA= ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQChxwhZggdwj55gNzfDBzah0G8IeTPQjgMZrpboxp2BO4J+o1iZSwDj+2fqyhBGTE43vCJR13uEygz49XIy+t17qBNwHz4fVVR7jdMNymtbZoOsq9oAoBdGEICHrMzQsYZmT9+Wt74ZP2PKOOn+a+f2vg7YdeSy1UhT08iJlbXwCx56fCQnMJMOnZM9MXVLd4NUFN1TeOCIBQHwRiMJyJ7S7CdUKpyUqHOG85peKiPJ07C0RZ/W5HkYKqltwtvPGQd262p5eLC9j3nhOYSG2meRV8yTxYz3lDIPDx0+189CZ5NaxFSPCgqSYA24zavhPVLQqoct7nd7fcEw9JiTs+abZC6GckCONSHDLM+iRtWC/i5u21ZZDLxM9SIqPI96cYFszGeqyZoXxS5qPaIDHbQNAEqJp9ygNXgh9vuBo7E+aWYbFDTG0RuvW02fbmFfZw2/yXIr37+cQX+GPOnkfIRuHE3Hx5eN8C04v+BMrAfK2minawhG3A2ONJs9LI6QoeE= +|1|whGSPLhKW4xt/7PWOZ1treg3PtA=|F5gwV8j0JYWDzjb6DvHHaqO+sxs= ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBCpCG881Gt3dr+nuVIC2uGEQkeVwG6WDdS1WcCoxXC7AG+Oi5bfdqtf4IfeLpWmeuEaAaSFH48ODFr76ViygSjU= +|1|0V6eQ1FKO5NMKaHZeNFbw62mrJs=|H1vuGTbbtZD2MEgZxQf1PXPk+yU= ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEnOtYByM3s2qvRT8SS1sn5z5sbwjzb1alm0B3emPcHJ \ No newline at end of file diff 
--git a/environments/arcus/builder.pkrvars.hcl b/environments/arcus/builder.pkrvars.hcl new file mode 100644 index 000000000..35c05c1e0 --- /dev/null +++ b/environments/arcus/builder.pkrvars.hcl @@ -0,0 +1,7 @@ +flavor = "vm.alaska.cpu.general.small" +networks = ["a262aabd-e6bf-4440-a155-13dbc1b5db0e"] # WCDC-iLab-60 +source_image_name = "RockyLinux-8.5-20211114.2" +ssh_keypair_name = "slurm-app-ci" +security_groups = ["default", "SSH"] +ssh_bastion_host = "128.232.222.183" +ssh_bastion_username = "slurm-app-ci" diff --git a/environments/arcus/hooks/check_slurm.yml b/environments/arcus/hooks/check_slurm.yml new file mode 100644 index 000000000..b2ae67c7b --- /dev/null +++ b/environments/arcus/hooks/check_slurm.yml @@ -0,0 +1,21 @@ +- name: Run sinfo + shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: node_name,partition,partition_state,max_jobtime,num_nodes,node_state + register: sinfo + changed_when: false + until: "'boot' not in sinfo.stdout" # substring test on the raw output; against the stdout_lines list this only matched a line that was exactly 'boot' + retries: 5 + delay: 10 +- name: Check nodes have expected slurm state + assert: + that: sinfo.stdout_lines == expected_sinfo + fail_msg: | + sinfo output not as expected: + actual: + {{ sinfo.stdout_lines }} + expected: + {{ expected_sinfo }} + + vars: + expected_sinfo: + - "{{ openhpc_cluster_name }}-compute-[0-1] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle" + - "{{ openhpc_cluster_name }}-compute-[2-3] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle~" diff --git a/environments/arcus/hooks/post.yml b/environments/arcus/hooks/post.yml new file mode 100644 index 000000000..15878b796 --- /dev/null +++ b/environments/arcus/hooks/post.yml @@ -0,0 +1,19 @@ +- hosts: login:!builder # won't have a slurm control daemon when in build + become: no + gather_facts: false + tasks: + - name: Check slurm up after direct deploy + import_tasks: check_slurm.yml + +- hosts: localhost + become: false + tags: build + tasks: + - name: Check Packer 
build finished + async_status: + jid: "{{ packer_run.ansible_job_id }}" + register: packer_result + until: packer_result.finished + retries: 30 # allow 15 mins + delay: 30 + when: packer_run is defined # allows rerunning post.yml diff --git a/environments/arcus/hooks/pre.yml b/environments/arcus/hooks/pre.yml new file mode 100644 index 000000000..583858b1f --- /dev/null +++ b/environments/arcus/hooks/pre.yml @@ -0,0 +1,39 @@ +- hosts: localhost + become: false + tags: build + tasks: + - name: Ensure secrets generated + include_role: + name: passwords + + - name: Build packer images + shell: + cmd: | + cd packer + PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl + chdir: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}" + when: "'builder' not in group_names" # avoid recursion! + register: packer_run + async: 2700 # 45 minutes + poll: 0 + +- hosts: all + become: true + tags: squid + tasks: + - name: Configure yum proxy + lineinfile: + path: /etc/yum.conf + regexp: '^proxy=http://10\.60\.102\.179:3128' + line: 'proxy=http://10.60.102.179:3128' + +- hosts: all + become: true + tags: etc_hosts + tasks: + - name: Create /etc/hosts for all nodes as DNS doesn't work + blockinfile: + path: /etc/hosts + create: yes + state: present + block: "{{ appliance_addresses | from_json | to_nice_yaml | replace(':', '') }}" diff --git a/environments/smslabs-example/inventory/group_vars/all/.gitkeep b/environments/arcus/inventory/group_vars/all/.gitkeep similarity index 100% rename from environments/smslabs-example/inventory/group_vars/all/.gitkeep rename to environments/arcus/inventory/group_vars/all/.gitkeep diff --git a/environments/arcus/inventory/group_vars/all/bastion.yml b/environments/arcus/inventory/group_vars/all/bastion.yml new file mode 100644 index 000000000..e6d5f7699 --- /dev/null +++ b/environments/arcus/inventory/group_vars/all/bastion.yml @@ -0,0 +1 @@ +ansible_ssh_common_args: '-o ProxyCommand="ssh 
slurm-app-ci@128.232.222.183 -W %h:%p"' diff --git a/environments/arcus/inventory/group_vars/openhpc/overrides.yml b/environments/arcus/inventory/group_vars/openhpc/overrides.yml new file mode 100644 index 000000000..4ebfe2d6d --- /dev/null +++ b/environments/arcus/inventory/group_vars/openhpc/overrides.yml @@ -0,0 +1,15 @@ +openhpc_config_extra: + SlurmctldDebug: debug + SlurmdDebug: debug +openhpc_slurm_partitions: +- name: small + ram_mb: "{{ (808 * 0.9) | int }}" # from free --mebi + sockets: 1 + cores_per_socket: 2 + threads_per_core: 2 + cloud_nodes: "{{ openhpc_cluster_name }}-compute-[2-3]" # see TF variable 'cloud_nodes' + cloud_instances: # TODO: can we somehow check these when templating?? + flavor: vm.alaska.cpu.general.small + image: "{{ openhpc_autoscale_image | default('IMAGE_PLACEHOLDER') }}" + keypair: slurm-app-ci + network: "{{ hostvars[groups['control'] | first]['server_networks'].keys() | first }}" # Defined in inventory, so only defined for control during Packer build diff --git a/environments/arcus/inventory/groups b/environments/arcus/inventory/groups new file mode 100644 index 000000000..b721d0e93 --- /dev/null +++ b/environments/arcus/inventory/groups @@ -0,0 +1,43 @@ +[nfs:children] +openhpc + +[hpctests:children] +# Login node to use for running mpi-based testing. 
+login + +[mysql:children] +control + +[prometheus:children] +control + +[grafana:children] +control + +[alertmanager:children] +control + +[node_exporter:children] +cluster + +[opendistro:children] +control + +[kibana:children] +control + +[slurm_stats:children] +control + +[filebeat:children] +slurm_stats + +[rebuild:children] +control +compute + +[update:children] +cluster + +[openstack_autoscale:children] +control diff --git a/environments/smslabs-example/terraform/.terraform.lock.hcl b/environments/arcus/terraform/.terraform.lock.hcl similarity index 100% rename from environments/smslabs-example/terraform/.terraform.lock.hcl rename to environments/arcus/terraform/.terraform.lock.hcl diff --git a/environments/smslabs-example/terraform/getfaults.py b/environments/arcus/terraform/getfaults.py similarity index 100% rename from environments/smslabs-example/terraform/getfaults.py rename to environments/arcus/terraform/getfaults.py diff --git a/environments/arcus/terraform/inventory.tf b/environments/arcus/terraform/inventory.tf new file mode 100644 index 000000000..b7eeeb2d8 --- /dev/null +++ b/environments/arcus/terraform/inventory.tf @@ -0,0 +1,14 @@ +resource "local_file" "hosts" { + content = templatefile("${path.module}/inventory.tpl", + { + "cluster_name": var.cluster_name + "control": openstack_compute_instance_v2.control, + "logins": openstack_compute_instance_v2.login, + "computes": openstack_compute_instance_v2.compute, + "compute_types": var.compute_types, + "compute_nodes": var.compute_nodes, + "ports": openstack_networking_port_v2.rdma + }, + ) + filename = "../inventory/hosts" +} diff --git a/environments/arcus/terraform/inventory.tpl b/environments/arcus/terraform/inventory.tpl new file mode 100644 index 000000000..ba95d568a --- /dev/null +++ b/environments/arcus/terraform/inventory.tpl @@ -0,0 +1,25 @@ +[all:vars] +ansible_user=rocky +openhpc_cluster_name=${cluster_name} +appliance_addresses='${jsonencode({for portname, port in ports: 
port.all_fixed_ips[0] => join("-", [cluster_name, portname]) })}' + +[control] +${control.name} ansible_host=${[for n in control.network: n.fixed_ip_v4 if n.access_network][0]} server_networks='${jsonencode({for net in control.network: net.name => [ net.fixed_ip_v4 ] })}' + +[login] +%{ for login in logins ~} +${login.name} ansible_host=${[for n in login.network: n.fixed_ip_v4 if n.access_network][0]} server_networks='${jsonencode({for net in login.network: net.name => [ net.fixed_ip_v4 ] })}' +%{ endfor ~} + +[compute] +%{ for compute in computes ~} +${compute.name} ansible_host=${[for n in compute.network: n.fixed_ip_v4 if n.access_network][0]} server_networks='${jsonencode({for net in compute.network: net.name => [ net.fixed_ip_v4 ] })}' +%{ endfor ~} + +# Define groups for slurm parititions: +%{~ for type_name, type_descr in compute_types} +[${cluster_name}_${type_name}] + %{~ for node_name, node_type in compute_nodes ~} + %{~ if node_type == type_name }${cluster_name}-${node_name}%{ endif } + %{~ endfor ~} +%{ endfor ~} diff --git a/environments/smslabs-example/terraform/main.tf b/environments/arcus/terraform/main.tf similarity index 76% rename from environments/smslabs-example/terraform/main.tf rename to environments/arcus/terraform/main.tf index 49a84ffce..03beb0adc 100644 --- a/environments/smslabs-example/terraform/main.tf +++ b/environments/arcus/terraform/main.tf @@ -6,3 +6,7 @@ terraform { } } } + +provider "openstack" { + cloud = "openstack" +} diff --git a/environments/arcus/terraform/network.tf b/environments/arcus/terraform/network.tf new file mode 100644 index 000000000..68f7c92a0 --- /dev/null +++ b/environments/arcus/terraform/network.tf @@ -0,0 +1,22 @@ +data "openstack_networking_network_v2" "cluster_net" { + name = var.cluster_net +} + +data "openstack_networking_subnet_v2" "cluster_subnet" { + + name = var.cluster_subnet +} + +resource "openstack_networking_port_v2" "rdma" { + + for_each = toset(concat(["control"], keys(var.login_nodes), 
keys(var.compute_nodes), var.cloud_nodes)) + + name = "${var.cluster_name}-${each.key}" + network_id = data.openstack_networking_network_v2.cluster_net.id + admin_state_up = "true" + + binding { + vnic_type = "direct" + } + +} diff --git a/environments/arcus/terraform/nodes.tf b/environments/arcus/terraform/nodes.tf new file mode 100644 index 000000000..3e4287352 --- /dev/null +++ b/environments/arcus/terraform/nodes.tf @@ -0,0 +1,64 @@ + +resource "openstack_compute_instance_v2" "control" { + + name = "${var.cluster_name}-control" + image_name = var.control_node.image + flavor_name = var.control_node.flavor + key_pair = var.key_pair + config_drive = true + security_groups = ["default", "SSH"] + + network { + port = openstack_networking_port_v2.rdma["control"].id + access_network = true + } + + metadata = { + environment_root = var.environment_root + } + +} + +resource "openstack_compute_instance_v2" "login" { + + for_each = var.login_nodes + + name = "${var.cluster_name}-${each.key}" + image_name = each.value.image + flavor_name = each.value.flavor + key_pair = var.key_pair + config_drive = true + security_groups = ["default", "SSH"] + + network { + port = openstack_networking_port_v2.rdma[each.key].id + access_network = true + } + + metadata = { + environment_root = var.environment_root + } + +} + +resource "openstack_compute_instance_v2" "compute" { + + for_each = var.compute_nodes + + name = "${var.cluster_name}-${each.key}" + image_name = lookup(var.compute_images, each.key, var.compute_types[each.value].image) + flavor_name = var.compute_types[each.value].flavor + key_pair = var.key_pair + config_drive = true + security_groups = ["default", "SSH"] + + network { + port = openstack_networking_port_v2.rdma[each.key].id + access_network = true + } + + metadata = { + environment_root = var.environment_root + } + +} diff --git a/environments/arcus/terraform/variables.tf b/environments/arcus/terraform/variables.tf new file mode 100644 index 000000000..4e5316493 --- 
/dev/null +++ b/environments/arcus/terraform/variables.tf @@ -0,0 +1,82 @@ +variable "cluster_name" { + type = string + description = "Name for cluster, used as prefix for resources" +} + +variable "cluster_net" { + type = string + description = "Name of existing cluster network" + default = "WCDC-iLab-60" +} + +variable "cluster_subnet" { + type = string + description = "Name of existing cluster subnet" + default = "WCDC-iLab-60" +} + +variable "key_pair" { + type = string + description = "Name of an existing keypair in OpenStack" + default = "slurm-app-ci" +} + +variable "control_node" { + type = map + description = "Mapping {flavor: flavor_name, image: image_name_or_id }" + default = { + flavor: "vm.alaska.cpu.general.small" + image: "RockyLinux-8.5-20211114.2" + } +} + +variable "login_nodes" { + type = map + description = "Mapping defining login nodes: key -> (str) nodename suffix, value -> mapping {flavor: flavor_name, image: image_name_or_id }" + default = { + login-0: { + flavor: "vm.alaska.cpu.general.small" + image: "RockyLinux-8.5-20211114.2" + } + } +} + +variable "compute_types" { + type = map + description = "Mapping defining types of compute nodes: key -> (str) name of type, value -> mapping {flavor: flavor_name, image: image_name_or_id }" + default = { + small: { + flavor: "vm.alaska.cpu.general.small" + image: "RockyLinux-8.5-20211114.2" + } + } +} + +variable "compute_nodes" { + type = map(string) + description = "Mapping of compute nodename suffix -> key in compute_types" + default = { + compute-0: "small" + compute-1: "small" + } +} + +variable "cloud_nodes" { + type = list(string) + description = "Cloud nodename suffixes to precreate RDMA-capable ports" + default = [ + "compute-2", + "compute-3", + ] +} + +variable "compute_images" { + type = map(string) + default = {} + description = "Mapping to override compute images from compute_types: key ->(str) node name, value -> (str) image name" +} + +variable "environment_root" { + type = string + 
description = "Path to environment root, automatically set by activate script"
+}
diff --git a/environments/common/inventory/group_vars/all/autoscale.yml b/environments/common/inventory/group_vars/all/autoscale.yml
new file mode 100644
index 000000000..b4816f571
--- /dev/null
+++ b/environments/common/inventory/group_vars/all/autoscale.yml
@@ -0,0 +1,4 @@
+autoscale_rebuild_clouds: ~/.config/openstack/clouds.yaml
+autoscale_suspend_exc_nodes_default: "{{ (groups.get('compute', []) + groups.get('login', [])) }}" # i.e. all non-CLOUD nodes; also stops login-only slurmd nodes from being powered down. A missing group contributes an empty list.
+autoscale_suspend_exc_nodes_extra: [] # per-environment additions to the default list
+autoscale_suspend_exc_nodes: "{{ autoscale_suspend_exc_nodes_default + autoscale_suspend_exc_nodes_extra }}" # default list followed by the extras
diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml
index a3f2fdc23..f757eb04e 100644
--- a/environments/common/inventory/group_vars/all/openhpc.yml
+++ b/environments/common/inventory/group_vars/all/openhpc.yml
@@ -22,5 +22,9 @@ openhpc_packages_default:
 openhpc_packages_extra: []
 openhpc_packages: "{{ openhpc_packages_default + openhpc_packages_extra }}"
 openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}"
-openhpc_slurm_configless: true
-openhpc_login_only_nodes: login
\ No newline at end of file
+openhpc_login_only_nodes: login
+openhpc_config_default: # appliance-wide settings; takes over from the removed openhpc_slurm_configless flag
+  SlurmctldParameters:
+    - enable_configless
+openhpc_config_extra: {} # per-environment settings, merged over the defaults below
+openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}" # list values (e.g. SlurmctldParameters) are appended to the defaults, not replaced
diff --git a/environments/common/inventory/group_vars/all/pytools.yml b/environments/common/inventory/group_vars/all/pytools.yml
new file mode 100644
index 000000000..0fbd2452c
--- /dev/null
+++ b/environments/common/inventory/group_vars/all/pytools.yml
@@ -0,0 +1 @@
+pytools_gitref: feature/ports # NOTE(review): looks like a branch name rather than a tag or commit - confirm it should not be pinned
diff --git a/environments/common/inventory/group_vars/all/rebuild.yml b/environments/common/inventory/group_vars/all/rebuild.yml
new file mode 100644
index 000000000..e40ffe66c
--- /dev/null
+++ b/environments/common/inventory/group_vars/all/rebuild.yml
@@ -0,0 +1 @@
+openhpc_rebuild_clouds: ~/.config/openstack/clouds.yaml
diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups
index f7ddd01fd..2aa570bd6 100644
--- a/environments/common/inventory/groups
+++ b/environments/common/inventory/groups
@@ -71,6 +71,10 @@ cluster
 [update]
 # All hosts to (optionally) run yum update on.
 
+[autoscale]
+# Add the `control` group to enable autoscaling on OpenStack. NOTE(review): environments/common/layouts/everything declares this group as [openstack_autoscale] - confirm which name the playbooks target.
+# See ansible/collections/ansible_collections/stackhpc/slurm_openstack_tools/roles/autoscale/README.md
+
 [block_devices]
 # Superset of hosts to configure filesystems on - see ansible/roles/block_devices/README.md
 
diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything
index 225a7bc2b..d57017788 100644
--- a/environments/common/layouts/everything
+++ b/environments/common/layouts/everything
@@ -39,6 +39,9 @@ cluster
 [basic_users]
 # Add `openhpc` group to add Slurm users via creation of users on each node.
 
+[openstack_autoscale]
+# Add `control` group to configure autoscaling on OpenStack clouds. NOTE(review): environments/common/inventory/groups declares this group as [autoscale] - confirm which name the playbooks target.
+ [openondemand] # Host to run Open Ondemand server on - subset of login login diff --git a/environments/smslabs-example/ci/reimage-compute.yml b/environments/smslabs-example/ci/reimage-compute.yml deleted file mode 100644 index 3efa4e47c..000000000 --- a/environments/smslabs-example/ci/reimage-compute.yml +++ /dev/null @@ -1,37 +0,0 @@ -# Reimage compute nodes via Slurm with latest packer-build images - -- hosts: login[0] - become: no - tasks: - - name: Read packer build manifest - set_fact: - manifest: "{{ lookup('file', manifest_path) | from_json }}" - vars: - manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" - delegate_to: localhost - - - name: Get latest compute image build - set_fact: - compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" - - - name: Request compute node rebuild via Slurm - shell: - cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1] - become: true - - - name: Check compute node rebuild completed - shell: - cmd: openstack server show {{ item }} --format value -c image - register: openstack_server - loop: "{{ groups['compute'] }}" - retries: 5 - delay: 30 - until: compute_build.artifact_id in openstack_server.stdout - delegate_to: localhost - -- hosts: compute - become: no - gather_facts: no - tasks: - - name: Wait for nodes to boot - wait_for_connection: diff --git a/environments/smslabs-example/hooks/post.yml b/environments/smslabs-example/hooks/post.yml deleted file mode 100644 index 660e7a47e..000000000 --- a/environments/smslabs-example/hooks/post.yml +++ /dev/null @@ -1,30 +0,0 @@ -- hosts: login - become: no - gather_facts: false - tags: checks - tasks: - - block: - - name: Run sinfo - shell: 'sinfo --noheader --format="%N %P %a %l %D %t"' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name - 
register: sinfo - changed_when: false - - name: Check nodes have expected slurm state - assert: - that: "(sinfo.stdout_lines[0] | split)[1:] == ['small*', 'up', '60-00:00:00', '2', 'idle']" # don't know what instance names are as have CI run ID in them - fail_msg: "sinfo output not as expected: {{ sinfo.stdout }}" - when: "'builder' not in group_names" # won't have a slurm control daemon when in build - -- hosts: openondemand - name: Check Open Ondemand is running - tags: - - checks - - openondemand - - openondemand_server - tasks: - - uri: - url: https://localhost - validate_certs: false # selfsigned - force_basic_auth: yes # as otherwise we get 401 - url_username: testuser - url_password: "{{ test_user_password }}" - status_code: 200 diff --git a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml deleted file mode 100644 index 3585ae073..000000000 --- a/environments/smslabs-example/inventory/group_vars/openhpc/overrides.yml +++ /dev/null @@ -1,5 +0,0 @@ -openhpc_config: - SlurmctldDebug: debug - SlurmdDebug: debug -openhpc_slurm_partitions: -- name: small diff --git a/environments/smslabs/.gitignore b/environments/smslabs/.gitignore new file mode 100644 index 000000000..12b21a20f --- /dev/null +++ b/environments/smslabs/.gitignore @@ -0,0 +1,3 @@ +secrets.yml +.vscode +hosts diff --git a/environments/smslabs/activate b/environments/smslabs/activate new file mode 100644 index 000000000..e74031095 --- /dev/null +++ b/environments/smslabs/activate @@ -0,0 +1,23 @@ +export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) +echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" + +APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) +export PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" + +export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") +echo "Setting APPLIANCES_REPO_ROOT to 
$APPLIANCES_REPO_ROOT" + +export TF_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") +echo "Setting TF_VAR_environment_root to $TF_VAR_environment_root" + +export PKR_VAR_environment_root=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT") +echo "Setting PKR_VAR_environment_root to $PKR_VAR_environment_root" + +export PKR_VAR_repo_root=$(realpath "$APPLIANCES_REPO_ROOT") +echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" + +if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then + export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg +fi + + diff --git a/environments/smslabs/ansible.cfg b/environments/smslabs/ansible.cfg new file mode 100644 index 000000000..d7a3783fa --- /dev/null +++ b/environments/smslabs/ansible.cfg @@ -0,0 +1,15 @@ +[defaults] +any_errors_fatal = True +stdout_callback = debug +stderr_callback = debug +gathering = smart +forks = 30 +host_key_checking = False +inventory = ../common/inventory,inventory +collections_path = ../../ansible/collections +roles_path = ../../ansible/roles +filter_plugins = ../../ansible/filter_plugins + +[ssh_connection] +ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +pipelining = True diff --git a/environments/smslabs-example/bastion_fingerprint b/environments/smslabs/bastion_fingerprint similarity index 100% rename from environments/smslabs-example/bastion_fingerprint rename to environments/smslabs/bastion_fingerprint diff --git a/environments/smslabs-example/builder.pkrvars.hcl b/environments/smslabs/builder.pkrvars.hcl similarity index 100% rename from environments/smslabs-example/builder.pkrvars.hcl rename to environments/smslabs/builder.pkrvars.hcl diff --git a/environments/smslabs/hooks/check_slurm.yml b/environments/smslabs/hooks/check_slurm.yml new file mode 100644 index 000000000..b2ae67c7b --- /dev/null +++ b/environments/smslabs/hooks/check_slurm.yml @@ -0,0 +1,21 @@ +- name: Run sinfo + shell: 'sinfo 
--noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: node_names,partition,partition_state,max_jobtime,num_nodes,node_state
+  register: sinfo
+  changed_when: false
+  until: "'boot' not in sinfo.stdout" # substring test on the raw output; `in` on the stdout_lines list only matches a whole line equal to 'boot', so it never triggered a retry
+  retries: 5
+  delay: 10
+- name: Check nodes have expected slurm state
+  assert:
+    that: sinfo.stdout_lines == expected_sinfo # exact match, relies on the `| sort` above for a stable line order
+    fail_msg: |
+      sinfo output not as expected:
+      actual:
+      {{ sinfo.stdout_lines }}
+      expected:
+      {{ expected_sinfo }}
+
+  vars:
+    expected_sinfo: # one line per node set, in the column order given by --format above
+      - "{{ openhpc_cluster_name }}-compute-[0-1] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle"
+      - "{{ openhpc_cluster_name }}-compute-[2-3] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle~"
diff --git a/environments/smslabs/hooks/post.yml b/environments/smslabs/hooks/post.yml
new file mode 100644
index 000000000..15878b796
--- /dev/null
+++ b/environments/smslabs/hooks/post.yml
@@ -0,0 +1,19 @@
+- hosts: login:!builder # won't have a slurm control daemon when in build
+  become: false
+  gather_facts: false
+  tasks:
+    - name: Check slurm up after direct deploy
+      import_tasks: check_slurm.yml
+
+- hosts: localhost
+  become: false
+  tags: build
+  tasks:
+    - name: Check Packer build finished
+      async_status:
+        jid: "{{ packer_run.ansible_job_id }}" # packer_run is registered by the async build task in hooks/pre.yml
+      register: packer_result
+      until: packer_result.finished
+      retries: 30 # allow 15 mins
+      delay: 30
+      when: packer_run is defined # allows rerunning post.yml
diff --git a/environments/smslabs/hooks/pre.yml b/environments/smslabs/hooks/pre.yml
new file mode 100644
index 000000000..2fb943528
--- /dev/null
+++ b/environments/smslabs/hooks/pre.yml
@@ -0,0 +1,31 @@
+- hosts: localhost
+  become: false
+  tags: build
+  tasks:
+    - name: Ensure secrets generated
+      include_role:
+        name: passwords
+
+    - name: Build packer images # NOTE(review): -on-error=ask normally prompts for input; confirm that is intended for an unattended async run
+      shell:
+        cmd: |
+          cd packer
+          PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
+        chdir: "{{ lookup('env',
'APPLIANCES_REPO_ROOT') }}"
+      when: "'builder' not in group_names" # avoid recursion!
+      register: packer_run # job handle read by the "Check Packer build finished" task in hooks/post.yml
+      async: 2700 # 45 minutes
+      poll: 0 # don't wait here; the build carries on in the background
+
+# For some reason squid shows TCP_MISS_ABORTED/200 on everything
+# - hosts: all
+#   become: yes
+#   gather_facts: no
+#   tasks:
+#     - name: Configure dnf proxy
+#       community.general.ini_file:
+#         path: /etc/dnf/dnf.conf
+#         section: main
+#         option: proxy
+#         value: "{{ squid_proxy }}"
+#         no_extra_spaces: true
diff --git a/environments/smslabs/inventory/group_vars/all/.gitkeep b/environments/smslabs/inventory/group_vars/all/.gitkeep
new file mode 100644
index 000000000..e69de29bb
diff --git a/environments/smslabs-example/inventory/group_vars/all/bastion.yml b/environments/smslabs/inventory/group_vars/all/bastion.yml
similarity index 100%
rename from environments/smslabs-example/inventory/group_vars/all/bastion.yml
rename to environments/smslabs/inventory/group_vars/all/bastion.yml
diff --git a/environments/smslabs-example/inventory/group_vars/all/openondemand.yml b/environments/smslabs/inventory/group_vars/all/openondemand.yml
similarity index 100%
rename from environments/smslabs-example/inventory/group_vars/all/openondemand.yml
rename to environments/smslabs/inventory/group_vars/all/openondemand.yml
diff --git a/environments/smslabs/inventory/group_vars/all/squid.yml b/environments/smslabs/inventory/group_vars/all/squid.yml
new file mode 100644
index 000000000..8524b5843
--- /dev/null
+++ b/environments/smslabs/inventory/group_vars/all/squid.yml
@@ -0,0 +1 @@
+squid_proxy: http://10.20.2.12:3128 # NOTE(review): within this diff only the commented-out dnf proxy play in hooks/pre.yml reads this - confirm it is still needed
diff --git a/environments/smslabs-example/inventory/group_vars/basic_users/overrides.yml b/environments/smslabs/inventory/group_vars/basic_users/overrides.yml
similarity index 58%
rename from environments/smslabs-example/inventory/group_vars/basic_users/overrides.yml
rename to environments/smslabs/inventory/group_vars/basic_users/overrides.yml
index 32fdd2af7..312c3f03c 100644
---
a/environments/smslabs-example/inventory/group_vars/basic_users/overrides.yml
+++ b/environments/smslabs/inventory/group_vars/basic_users/overrides.yml
@@ -1,5 +1,3 @@
-test_user_password: "{{ lookup('env', 'TEST_USER_PASSWORD') | default(vault_testuser_password, true) }}" # CI uses env, debug can set vault_testuser_password
-
 basic_users_users:
 - name: testuser # can't use rocky as $HOME isn't shared!
   password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent
diff --git a/environments/smslabs-example/inventory/group_vars/grafana/overrides.yml b/environments/smslabs/inventory/group_vars/grafana/overrides.yml
similarity index 100%
rename from environments/smslabs-example/inventory/group_vars/grafana/overrides.yml
rename to environments/smslabs/inventory/group_vars/grafana/overrides.yml
diff --git a/environments/smslabs/inventory/group_vars/openhpc/overrides.yml b/environments/smslabs/inventory/group_vars/openhpc/overrides.yml
new file mode 100644
index 000000000..86f14c3f4
--- /dev/null
+++ b/environments/smslabs/inventory/group_vars/openhpc/overrides.yml
@@ -0,0 +1,15 @@
+openhpc_config_extra: # merged into openhpc_config by environments/common/inventory/group_vars/all/openhpc.yml
+  SlurmctldDebug: debug
+  SlurmdDebug: debug
+openhpc_slurm_partitions:
+- name: small # hooks/check_slurm.yml reads this as openhpc_slurm_partitions[0].name
+  ram_mb: "{{ (3362 * 0.95) | int }}" # free --mebi * default openhpc_ram_multiplier
+  sockets: 1
+  cores_per_socket: 1
+  threads_per_core: 1
+  cloud_nodes: "{{ openhpc_cluster_name }}-compute-[2-3]" # the same node range hooks/check_slurm.yml expects in state idle~
+  cloud_instances:
+    flavor: general.v1.tiny
+    image: "{{ openhpc_autoscale_image | default('IMAGE_PLACEHOLDER') }}" # Gets set by CI after image build task.
+    keypair: slurm-app-ci
+    network: "{{ hostvars[groups['control'] | first]['server_networks'].keys() | first }}" # Defined in inventory, so only defined for control during Packer build.
diff --git a/environments/smslabs-example/inventory/group_vars/openondemand/overrides.yml b/environments/smslabs/inventory/group_vars/openondemand/overrides.yml similarity index 100% rename from environments/smslabs-example/inventory/group_vars/openondemand/overrides.yml rename to environments/smslabs/inventory/group_vars/openondemand/overrides.yml diff --git a/environments/smslabs-example/inventory/group_vars/podman/overrides.yml b/environments/smslabs/inventory/group_vars/podman/overrides.yml similarity index 100% rename from environments/smslabs-example/inventory/group_vars/podman/overrides.yml rename to environments/smslabs/inventory/group_vars/podman/overrides.yml diff --git a/environments/smslabs-example/inventory/groups b/environments/smslabs/inventory/groups similarity index 93% rename from environments/smslabs-example/inventory/groups rename to environments/smslabs/inventory/groups index fd2ef7336..a4b7fe702 100644 --- a/environments/smslabs-example/inventory/groups +++ b/environments/smslabs/inventory/groups @@ -36,6 +36,9 @@ compute [update:children] cluster +[openstack_autoscale:children] +control + [openondemand:children] login diff --git a/environments/smslabs/terraform/.terraform.lock.hcl b/environments/smslabs/terraform/.terraform.lock.hcl new file mode 100644 index 000000000..6f55d88a6 --- /dev/null +++ b/environments/smslabs/terraform/.terraform.lock.hcl @@ -0,0 +1,40 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/local" { + version = "2.1.0" + hashes = [ + "h1:EYZdckuGU3n6APs97nS2LxZm3dDtGqyM4qaIvsmac8o=", + "zh:0f1ec65101fa35050978d483d6e8916664b7556800348456ff3d09454ac1eae2", + "zh:36e42ac19f5d68467aacf07e6adcf83c7486f2e5b5f4339e9671f68525fc87ab", + "zh:6db9db2a1819e77b1642ec3b5e95042b202aee8151a0256d289f2e141bf3ceb3", + "zh:719dfd97bb9ddce99f7d741260b8ece2682b363735c764cac83303f02386075a", + "zh:7598bb86e0378fd97eaa04638c1a4c75f960f62f69d3662e6d80ffa5a89847fe", + "zh:ad0a188b52517fec9eca393f1e2c9daea362b33ae2eb38a857b6b09949a727c1", + "zh:c46846c8df66a13fee6eff7dc5d528a7f868ae0dcf92d79deaac73cc297ed20c", + "zh:dc1a20a2eec12095d04bf6da5321f535351a594a636912361db20eb2a707ccc4", + "zh:e57ab4771a9d999401f6badd8b018558357d3cbdf3d33cc0c4f83e818ca8e94b", + "zh:ebdcde208072b4b0f8d305ebf2bfdc62c926e0717599dcf8ec2fd8c5845031c3", + "zh:ef34c52b68933bedd0868a13ccfd59ff1c820f299760b3c02e008dc95e2ece91", + ] +} + +provider "registry.terraform.io/terraform-provider-openstack/openstack" { + version = "1.43.0" + hashes = [ + "h1:1QwVWBH4Boye5QDpB3YG/WE2grF3m9c3afX3tcGv/A8=", + "zh:08af4c5b2136a95cd16789c5cb4945ad288d02b91d06018c74ed14b97b335857", + "zh:2c99eaf2a86ae1ab8186226c1be5395d45a91d93f4e65cc8731afbc736aea4e9", + "zh:3f0226ce9737e7e47822d009419a78477d5286bf30896b85cbe3af0cf9ff7c90", + "zh:40811116da43f6cab91016150462da847413b188b3e7060759a37dcd0ebbfb8d", + "zh:447678224527eeb9c8a145ad8aaec6c0e032e2e789d68708aeb3e2b488fd7e63", + "zh:49adbdcd112edd29bb71b03e5e0060c63c2904358cd34f199dcd606b63521a0e", + "zh:51054fed551149aa2962ec4192dc8a7f3b25ef170d161a4e7f68e0ea099c4c78", + "zh:635181a35d224433a2adecdf72c01e0d1873929a51ebea8730d512ecc5b5c9e0", + "zh:71752e30bfac741e8040f52d3722d3c804e7edc022e989d7ebe47537e80a6267", + "zh:75262bc0087d0f119066d156d9e5c139db93695b551c794af711f3c2b03b2fa3", + "zh:aa640e5f357c08dffce9cfbc35251a81851c2c9696d9752f5e5201d330a84627", + "zh:bbb6164d149891b340d3293ef3a26d80738f9ef5025863e30b36c3854eea0149", + 
"zh:d2c08432fe39c8dfb3ec929e181bb8235b0073944d96811f4654ca578fb090b1",
+  ]
+}
diff --git a/environments/smslabs/terraform/getfaults.py b/environments/smslabs/terraform/getfaults.py
new file mode 100755
index 000000000..b3e9ecf35
--- /dev/null
+++ b/environments/smslabs/terraform/getfaults.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+""" Display any failure messages for openstack servers in current terraform state. """
+import json
+import subprocess
+
+def get_openstack_server(uuid):
+    """ Return json with openstack server info """
+    cmd = ['openstack', 'server', 'show', uuid, '-f', 'json']
+    server_txt = subprocess.run(cmd, stdout=subprocess.PIPE, check=True, universal_newlines=True).stdout
+    return json.loads(server_txt)
+
+def read_tf_state():
+    """ Return json from terraform state in current directory """
+
+    with open('terraform.tfstate') as statef:
+        state = json.load(statef)
+    return state
+
+def check_server_errors():
+    """ Print "name uuid message" for each server in the terraform state which reports a fault message """
+    tf_state = read_tf_state()
+    for resource in tf_state['resources']:
+        if resource['type'] == 'openstack_compute_instance_v2':
+            for instance in resource['instances']:
+                name = instance['attributes']['name']
+                uuid = instance['attributes']['id']
+
+                server = get_openstack_server(uuid)
+                # tolerate `fault` being absent or null (no fault recorded) rather than assuming it is a dict
+                failure_msg = (server.get('fault') or {}).get('message')
+                if failure_msg:
+                    print(name, uuid, failure_msg)
+
+if __name__ == '__main__':
+    check_server_errors()
diff --git a/environments/smslabs-example/terraform/inventory.tf b/environments/smslabs/terraform/inventory.tf
similarity index 100%
rename from environments/smslabs-example/terraform/inventory.tf
rename to environments/smslabs/terraform/inventory.tf
diff --git a/environments/smslabs-example/terraform/inventory.tpl b/environments/smslabs/terraform/inventory.tpl
similarity index 100%
rename from environments/smslabs-example/terraform/inventory.tpl
rename to environments/smslabs/terraform/inventory.tpl
diff --git a/environments/smslabs/terraform/main.tf
b/environments/smslabs/terraform/main.tf new file mode 100644 index 000000000..03beb0adc --- /dev/null +++ b/environments/smslabs/terraform/main.tf @@ -0,0 +1,12 @@ +terraform { + required_version = ">= 0.14" + required_providers { + openstack = { + source = "terraform-provider-openstack/openstack" + } + } +} + +provider "openstack" { + cloud = "openstack" +} diff --git a/environments/smslabs-example/terraform/network.tf b/environments/smslabs/terraform/network.tf similarity index 100% rename from environments/smslabs-example/terraform/network.tf rename to environments/smslabs/terraform/network.tf diff --git a/environments/smslabs-example/terraform/nodes.tf b/environments/smslabs/terraform/nodes.tf similarity index 79% rename from environments/smslabs-example/terraform/nodes.tf rename to environments/smslabs/terraform/nodes.tf index 3bca7fb36..832876e58 100644 --- a/environments/smslabs-example/terraform/nodes.tf +++ b/environments/smslabs/terraform/nodes.tf @@ -6,13 +6,17 @@ resource "openstack_compute_instance_v2" "control" { flavor_name = var.control_node.flavor key_pair = var.key_pair config_drive = true - security_groups = ["default", "ssh"] + security_groups = ["default", "SSH"] network { uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id # ensures nodes not created till subnet created access_network = true } + metadata = { + environment_root = var.environment_root + } + } resource "openstack_compute_instance_v2" "login" { @@ -24,13 +28,17 @@ resource "openstack_compute_instance_v2" "login" { flavor_name = each.value.flavor key_pair = var.key_pair config_drive = true - security_groups = ["default", "ssh"] + security_groups = ["default", "SSH"] network { uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id access_network = true } + metadata = { + environment_root = var.environment_root + } + } resource "openstack_compute_instance_v2" "compute" { @@ -42,11 +50,15 @@ resource "openstack_compute_instance_v2" "compute" { 
flavor_name = var.compute_types[each.value].flavor key_pair = var.key_pair config_drive = true - security_groups = ["default", "ssh"] + security_groups = ["default", "SSH"] network { uuid = data.openstack_networking_subnet_v2.cluster_subnet.network_id access_network = true } + metadata = { + environment_root = var.environment_root + } + } diff --git a/environments/smslabs-example/terraform/variables.tf b/environments/smslabs/terraform/variables.tf similarity index 93% rename from environments/smslabs-example/terraform/variables.tf rename to environments/smslabs/terraform/variables.tf index b6e82e90a..3a42a8d7f 100644 --- a/environments/smslabs-example/terraform/variables.tf +++ b/environments/smslabs/terraform/variables.tf @@ -66,3 +66,8 @@ variable "compute_images" { default = {} description = "Mapping to override compute images from compute_types: key ->(str) node name, value -> (str) image name" } + +variable "environment_root" { + type = string + description = "Path to environment root, automatically set by activate script" +} diff --git a/requirements.yml b/requirements.yml index a8e6ffa22..d31f8c490 100644 --- a/requirements.yml +++ b/requirements.yml @@ -2,7 +2,7 @@ roles: - src: stackhpc.nfs - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: v0.9.0 # supports Rocky Linux + version: v0.10.0 # supports Rocky Linux name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: support-rhel-clones @@ -24,5 +24,5 @@ collections: - name: community.grafana - name: https://github.com/stackhpc/ansible_collection_slurm_openstack_tools type: git - version: v0.1.0 + version: feature/autoscale2 ...