Skip to content

Refactor toplevel site.yml and fatimage.yml playbooks #657

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions ansible/bootstrap.yml
Original file line number Diff line number Diff line change
Expand Up @@ -311,14 +311,31 @@
- include_role:
name: ofed

- hosts: ansible_init
- hosts: doca:&builder
become: yes
gather_facts: yes
tasks:
- name: Install NVIDIA DOCA
import_role:
name: doca

- hosts: ansible_init:&builder
gather_facts: yes
become: yes
tags: linux_ansible_init
tasks:
- include_role:
- name: Install ansible-init
ansible.builtin.include_role:
name: azimuth_cloud.image_utils.linux_ansible_init

- hosts: gateway:&builder
become: yes
tags: gateway
tasks:
- name: Install ansible-init gateway playbook
ansible.builtin.include_role:
name: gateway

- hosts: k3s:&builder
become: yes
tags: k3s
Expand Down
7 changes: 0 additions & 7 deletions ansible/disable-repos.yml

This file was deleted.

11 changes: 0 additions & 11 deletions ansible/extras.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,6 @@
- import_role:
name: persist_hostkeys


- name: Setup NFS export for compute node configuration
hosts: compute_init:!builder
# NB: has to be after eessi and os-manila-mount
tags: compute_init
become: yes
tasks:
- include_role:
name: compute_init
tasks_from: export.yml

- name: Install k9s
become: yes
hosts: k9s
Expand Down
79 changes: 4 additions & 75 deletions ansible/fatimage.yml
Original file line number Diff line number Diff line change
@@ -1,15 +1,5 @@
# Builder version of site.yml just installing binaries

- hosts: builder
become: no
gather_facts: no
tasks:
- name: Report hostname (= final image name)
command: hostname
- name: Report inventory groups
debug:
var: group_names

- name: Run pre.yml hook
vars:
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
Expand All @@ -25,41 +15,22 @@
tasks_from: sync.yml
apply:
delegate_to: localhost
when: appliances_mode != 'configure'

- import_playbook: bootstrap.yml

- hosts: doca
become: yes
gather_facts: yes
tasks:
- name: Install NVIDIA DOCA
import_role:
name: doca

- name: Run post-bootstrap.yml hook
vars:
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
hook_path: "{{ appliances_environment_root }}/hooks/post-bootstrap.yml"
import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
when: hook_path | exists

- import_playbook: iam.yml

- hosts: builder
become: yes
gather_facts: yes
tasks:
# - import_playbook: iam.yml
- name: Install FreeIPA client
import_role:
name: freeipa
tasks_from: client-install.yml
when: "'freeipa_client' in group_names"
- name: Install sssd
import_role:
name: sssd
tasks_from: install.yml
when: "'sssd' in group_names"

# - import_playbook: filesystems.yml:
- name: Install nfs packages
dnf:
Expand All @@ -77,43 +48,12 @@
when: "'lustre' in group_names"

- import_playbook: extras.yml

# TODO: is this the right place?
- name: Install compute_init playbook
hosts: compute_init
tags: compute_init # tagged to allow running on cluster instances for dev
become: yes
tasks:
- include_role:
name: compute_init
tasks_from: install.yml

- name: Install gateway playbook
hosts: gateway
tags: gateway
become: yes
gather_facts: no
tasks:
- include_role:
name: gateway
- import_playbook: slurm.yml

- hosts: builder
become: yes
gather_facts: yes
tasks:
# - import_playbook: slurm.yml:
- name: Setup DB
include_role:
name: mysql
tasks_from: install.yml
when: "'mysql' in group_names"

- name: OpenHPC
import_role:
name: stackhpc.openhpc
tasks_from: install.yml
when: "'openhpc' in group_names"

# - import_playbook: portal.yml
- name: Open Ondemand server (packages)
include_role:
Expand Down Expand Up @@ -257,15 +197,4 @@
import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
when: hook_path | exists

- import_playbook: disable-repos.yml

- hosts: builder
become: yes
gather_facts: yes
tags: finalise
tasks:
- name: Cleanup image
import_tasks: cleanup.yml

- name: Shutdown Packer VM
community.general.shutdown:
- import_playbook: final.yml
38 changes: 38 additions & 0 deletions ansible/final.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Final plays: install/export compute_init, disable repos, then clean up and
# shut down image-build hosts.

# Install the compute_init playbook on hosts in the compute_init group.
- hosts: compute_init
  tags: compute_init
  become: yes
  tasks:
    - name: Install compute_init playbook
      ansible.builtin.include_role:
        name: compute_init
        tasks_from: install.yml
      # conditional used instead of compute_init!builder to make dev easier
      # NB: `when` is already evaluated as a Jinja2 expression, so it is
      # written bare rather than wrapped in "{{ }}" delimiters.
      when: appliances_mode == 'build'

# Export compute node configuration over NFS; skipped for builder hosts.
- hosts: compute_init:!builder
  tags: compute_init
  become: yes
  tasks:
    - name: Setup NFS export for compute node configuration
      ansible.builtin.include_role:
        name: compute_init
        tasks_from: export.yml

# Disable the pulp repos using the dnf_repos role's disable_repos.yml tasks.
- hosts: dnf_repos
  become: yes
  tasks:
    - name: Disable pulp repos
      ansible.builtin.include_role:
        name: dnf_repos
        tasks_from: disable_repos.yml

# Builder hosts only: run the cleanup tasks, then shut the VM down.
- hosts: builder
  become: yes
  gather_facts: yes
  tags: finalise
  tasks:
    - name: Cleanup image
      import_tasks: cleanup.yml

    - name: Shutdown Packer VM
      community.general.shutdown:
15 changes: 12 additions & 3 deletions ansible/iam.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
- hosts: freeipa_client
- hosts: freeipa_client:!builder
tags:
- freeipa
- freeipa_server # as this is only relevant if using freeipa_server
Expand All @@ -23,12 +23,20 @@
import_role:
name: freeipa
tasks_from: client-install.yml

- hosts: freeipa_client:!builder
tags:
- freeipa
- freeipa_client
gather_facts: yes
become: yes
tasks:
- name: Enrol FreeIPA client
import_role:
name: freeipa
tasks_from: enrol.yml

- hosts: freeipa_server
- hosts: freeipa_server:!builder
tags:
- freeipa
- freeipa_server
Expand All @@ -47,5 +55,6 @@
tags: sssd
tasks:
- name: Configure sssd
import_role:
ansible.builtin.include_role:
name: sssd
tasks_from: "{{ 'install.yml' if appliances_mode == 'build' else 'main.yml' }}"
27 changes: 12 additions & 15 deletions ansible/roles/compute_init/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,35 +143,32 @@ a new image:
additionally configure the control node to export compute hostvars over NFS.
Check the cluster is up.

2. Reimage the compute nodes:
2. Optionally, reimage the compute nodes to reset services etc.:

ansible-playbook --limit compute ansible/adhoc/rebuild.yml

3. Add metadata to a compute node e.g. via Horizon to turn on compute-init
playbook functionality.
3. Add metadata to a compute node (directly via Horizon or via OpenTofu) to
enable the new compute-init playbook functionality.

4. Stop ansible-init from running
4. Stop ansible-init from running:

ansible all -ba "systemctl stop ansible-init"

5. Fake an image build to deploy the compute-init playbook:
5. Fake an image build and a rerun of the `site.yml` playbook:

ansible-playbook ansible/fatimage.yml --tags compute_init
ansible-playbook ansible/final.yml --tags compute_init

NB: This will also re-export the compute hostvars, as the nodes are not
in the builder group, which conveniently means any changes made to that
play also get picked up.
This both re-installs the compute-init playbook and re-configures the NFS
share with exported compute hostvars etc.

6. Fake a reimage of compute to run ansible-init and the updated compute-init playbook:
6. Fake a reimage of compute nodes to re-run ansible-init and the updated
compute-init playbook:

ansible all -ba "rm -f /var/lib/ansible-init.done && systemctl restart ansible-init"

Use `systemctl status ansible-init` to view stdout/stderr from Ansible.

Steps 4/5/6 can be repeated with changes to the compute script. If required,
reimage the compute node(s) first as in step 2 and/or add additional metadata
as in step 3.
7. Use `systemctl status ansible-init` to view stdout/stderr from Ansible.

Steps 4-7 can be repeated with changes to the compute script until it works.

## Design notes
- Duplicating code in roles into the `compute-init` script is unfortunate, but
Expand Down
9 changes: 1 addition & 8 deletions ansible/site.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
- import_playbook: slurm.yml
- import_playbook: portal.yml
- import_playbook: monitoring.yml
- import_playbook: disable-repos.yml

- name: Run post.yml hook
vars:
Expand All @@ -37,12 +36,6 @@
import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
when: hook_path | exists

- name: Clean up and shutdown Packer VM
hosts: builder
gather_facts: no
become: yes
tasks:
- import_tasks: cleanup.yml
- community.general.shutdown:
- import_playbook: final.yml

...
21 changes: 8 additions & 13 deletions ansible/slurm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
- name: Setup DB
hosts: mysql
become: true
tags:
- mysql
tags: mysql
tasks:
- include_role:
name: mysql
tasks_from: "{{ 'install.yml' if appliances_mode == 'build' else 'main.yml' }}"

- name: Setup slurm-driven rebuild
hosts: rebuild:!builder
Expand All @@ -20,12 +20,9 @@
name: rebuild

- name: Set locked memory limits on user-facing nodes
hosts:
- compute
- login
hosts: compute:login:!builder
become: yes
tags:
- openhpc
tags: openhpc
tasks:
- name: set memory limits
lineinfile:
Expand All @@ -34,10 +31,9 @@
line: "* soft memlock unlimited"

- name: Block ssh to compute nodes for non-privileged users without running jobs
hosts: compute
hosts: compute:!builder
become: yes
tags:
- openhpc
tags: openhpc
tasks:
- name: Configure sshd pam module
blockinfile:
Expand All @@ -57,9 +53,8 @@
- name: Setup slurm
hosts: openhpc
become: yes
tags:
- openhpc
tags: openhpc
tasks:
- include_role:
name: stackhpc.openhpc
tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}"
tasks_from: "{{ {'build':'install.yml', 'configure':'runtime.yml'}[appliances_mode] | default('main.yml') }}"
10 changes: 10 additions & 0 deletions environments/.stackhpc/hooks/pre.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
# Report the builder host's hostname and inventory group membership.
- hosts: builder
  become: no
  gather_facts: no
  tasks:
    - name: Report hostname (= final image name)
      ansible.builtin.command: hostname
      # Read-only query: do not report a change on every run.
      changed_when: false
    - name: Report inventory groups
      ansible.builtin.debug:
        var: group_names

- hosts: control:!builder
become: yes
gather_facts: false
Expand Down
Loading