From 4d4f1cee8e3e2f8ce0a2c8dbfa2ea2a09abaef36 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Apr 2025 10:56:17 +0000 Subject: [PATCH 01/30] remove hostname from fatimage --- ansible/fatimage.yml | 10 ---------- environments/.stackhpc/hooks/pre.yml | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 0b4335b14..1ce46f540 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -1,15 +1,5 @@ # Builder version of site.yml just installing binaries -- hosts: builder - become: no - gather_facts: no - tasks: - - name: Report hostname (= final image name) - command: hostname - - name: Report inventory groups - debug: - var: group_names - - name: Run pre.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml index 305713a61..7ccf50b3a 100644 --- a/environments/.stackhpc/hooks/pre.yml +++ b/environments/.stackhpc/hooks/pre.yml @@ -1,3 +1,13 @@ +- hosts: builder + become: no + gather_facts: no + tasks: + - name: Report hostname (= final image name) + command: hostname + - name: Report inventory groups + debug: + var: group_names + - hosts: control:!builder become: yes gather_facts: false From a2b68882c9437ee1da414f32f9b42ded00af9b3e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Apr 2025 10:57:44 +0000 Subject: [PATCH 02/30] move doca to bootstrap --- ansible/bootstrap.yml | 8 ++++++++ ansible/fatimage.yml | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 27559952c..54cf47dc3 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -311,6 +311,14 @@ - include_role: name: ofed +- hosts: doca:&builder + become: yes + gather_facts: yes + tasks: + - name: Install NVIDIA DOCA + import_role: + name: doca + - hosts: ansible_init gather_facts: yes become: yes diff 
--git a/ansible/fatimage.yml b/ansible/fatimage.yml index 1ce46f540..4a70241cc 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -19,14 +19,6 @@ - import_playbook: bootstrap.yml -- hosts: doca - become: yes - gather_facts: yes - tasks: - - name: Install NVIDIA DOCA - import_role: - name: doca - - name: Run post-bootstrap.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" From c04fa61274e146c33a852c6d50221c7681cc2bb3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Apr 2025 11:00:51 +0000 Subject: [PATCH 03/30] move compute-init install into extras.yml --- ansible/extras.yml | 8 ++++++++ ansible/fatimage.yml | 10 ---------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/ansible/extras.yml b/ansible/extras.yml index c7cacb877..428df1850 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -58,6 +58,14 @@ - import_role: name: persist_hostkeys +- name: Install compute_init playbook + hosts: compute_init:&builder + tags: compute_init # tagged to allow running on cluster instances for dev + become: yes + tasks: + - include_role: + name: compute_init + tasks_from: install.yml - name: Setup NFS export for compute node configuration hosts: compute_init:!builder diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 4a70241cc..ebbfb5b02 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -60,16 +60,6 @@ - import_playbook: extras.yml -# TODO: is this the right place? 
-- name: Install compute_init playbook - hosts: compute_init - tags: compute_init # tagged to allow running on cluster instances for dev - become: yes - tasks: - - include_role: - name: compute_init - tasks_from: install.yml - - name: Install gateway playbook hosts: gateway tags: gateway From ca3f4ffec645e35debf4393d760f6238a04f7cff Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Apr 2025 11:08:40 +0000 Subject: [PATCH 04/30] move gateway and only run ansible-init install during builder --- ansible/bootstrap.yml | 13 +++++++++++-- ansible/fatimage.yml | 9 --------- environments/common/layouts/everything | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 54cf47dc3..b418a5a30 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -319,14 +319,23 @@ import_role: name: doca -- hosts: ansible_init +- hosts: ansible_init:&builder gather_facts: yes become: yes tags: linux_ansible_init tasks: - - include_role: + - name: Install ansible-init + ansible.builtin.include_role: name: azimuth_cloud.image_utils.linux_ansible_init +- hosts: gateway:&builder + become: yes + tags: gateway + tasks: + - name: Install ansible-init gateway playbook + ansible.builtin.include_role: + name: gateway + - hosts: k3s:&builder become: yes tags: k3s diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index ebbfb5b02..4d12c3137 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -60,15 +60,6 @@ - import_playbook: extras.yml -- name: Install gateway playbook - hosts: gateway - tags: gateway - become: yes - gather_facts: no - tasks: - - include_role: - name: gateway - - hosts: builder become: yes gather_facts: yes diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index e3c3f763d..9f2c7c706 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -84,7 +84,7 @@ openondemand # Hosts to run TuneD 
configuration [ansible_init:children] -# Hosts to run linux-anisble-init +# Hosts to run linux-ansible-init cluster [sssd] From 0b9f615cb7fa089829dee2f47868e9baea5718df Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Apr 2025 11:10:46 +0000 Subject: [PATCH 05/30] remove vm cleanup/shutdown from site.yml --- ansible/site.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/ansible/site.yml b/ansible/site.yml index d973d9cb3..ad3dda734 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -37,12 +37,4 @@ import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists -- name: Clean up and shutdown Packer VM - hosts: builder - gather_facts: no - become: yes - tasks: - - import_tasks: cleanup.yml - - community.general.shutdown: - ... \ No newline at end of file From 1a744b87221be6d93f2f3dad2b36635ccc4afb17 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Apr 2025 11:11:59 +0000 Subject: [PATCH 06/30] remove unneeded conditional from pulp repo sync --- ansible/fatimage.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 4d12c3137..357930a1a 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -15,7 +15,6 @@ tasks_from: sync.yml apply: delegate_to: localhost - when: appliances_mode != 'configure' - import_playbook: bootstrap.yml From dccd39035e2ec2bd24098e3eadc3a7d082a1bdf6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Apr 2025 11:22:02 +0000 Subject: [PATCH 07/30] use iam.yml in fatimage --- ansible/fatimage.yml | 14 ++------------ ansible/iam.yml | 13 +++++++++++-- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 357930a1a..869615806 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -25,22 +25,12 @@ import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists +- import_playbook: iam.yml + - hosts: builder become: yes 
gather_facts: yes tasks: - # - import_playbook: iam.yml - - name: Install FreeIPA client - import_role: - name: freeipa - tasks_from: client-install.yml - when: "'freeipa_client' in group_names" - - name: Install sssd - import_role: - name: sssd - tasks_from: install.yml - when: "'sssd' in group_names" - # - import_playbook: filesystems.yml: - name: Install nfs packages dnf: diff --git a/ansible/iam.yml b/ansible/iam.yml index 857b8f840..a220819c8 100644 --- a/ansible/iam.yml +++ b/ansible/iam.yml @@ -1,4 +1,4 @@ -- hosts: freeipa_client +- hosts: freeipa_client:!builder tags: - freeipa - freeipa_server # as this is only relevant if using freeipa_server @@ -23,12 +23,20 @@ import_role: name: freeipa tasks_from: client-install.yml + +- hosts: freeipa_client:!builder + tags: + - freeipa + - freeipa_client + gather_facts: yes + become: yes + tasks: - name: Enrol FreeIPA client import_role: name: freeipa tasks_from: enrol.yml -- hosts: freeipa_server +- hosts: freeipa_server:!builder tags: - freeipa - freeipa_server @@ -49,3 +57,4 @@ - name: Configure sssd import_role: name: sssd + tasks_from: "{{ 'install.yml' if appliances_mode == 'build' else 'main.yml' }}" From 910ecc32a42a9951644d78113c477d7e4a787350 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Apr 2025 11:23:30 +0000 Subject: [PATCH 08/30] disable repos after hook for site.yml - matches fatimage --- ansible/site.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ansible/site.yml b/ansible/site.yml index ad3dda734..7b64059ae 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -27,7 +27,6 @@ - import_playbook: slurm.yml - import_playbook: portal.yml - import_playbook: monitoring.yml -- import_playbook: disable-repos.yml - name: Run post.yml hook vars: @@ -37,4 +36,6 @@ import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists +- import_playbook: disable-repos.yml + ... 
\ No newline at end of file From bb4cab150d3e572781c957357b8f024752372040 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Apr 2025 11:30:34 +0000 Subject: [PATCH 09/30] use slurm.yml from fatimage --- ansible/fatimage.yml | 14 +------------- ansible/slurm.yml | 21 ++++++++------------- 2 files changed, 9 insertions(+), 26 deletions(-) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 869615806..f4df32521 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -48,24 +48,12 @@ when: "'lustre' in group_names" - import_playbook: extras.yml +- import_playbook: slurm.yml - hosts: builder become: yes gather_facts: yes tasks: - # - import_playbook: slurm.yml: - - name: Setup DB - include_role: - name: mysql - tasks_from: install.yml - when: "'mysql' in group_names" - - - name: OpenHPC - import_role: - name: stackhpc.openhpc - tasks_from: install.yml - when: "'openhpc' in group_names" - # - import_playbook: portal.yml - name: Open Ondemand server (packages) include_role: diff --git a/ansible/slurm.yml b/ansible/slurm.yml index d1bb93a9f..1bf0bcbd9 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -3,11 +3,11 @@ - name: Setup DB hosts: mysql become: true - tags: - - mysql + tags: mysql tasks: - include_role: name: mysql + tasks_from: "{{ 'install.yml' if appliances_mode == 'build' else 'main.yml' }}" - name: Setup slurm-driven rebuild hosts: rebuild:!builder @@ -20,12 +20,9 @@ name: rebuild - name: Set locked memory limits on user-facing nodes - hosts: - - compute - - login + hosts: compute:login:!builder become: yes - tags: - - openhpc + tags: openhpc tasks: - name: set memory limits lineinfile: @@ -34,10 +31,9 @@ line: "* soft memlock unlimited" - name: Block ssh to compute nodes for non-privileged users without running jobs - hosts: compute + hosts: compute:!builder become: yes - tags: - - openhpc + tags: openhpc tasks: - name: Configure sshd pam module blockinfile: @@ -57,9 +53,8 @@ - name: Setup slurm hosts: openhpc become: yes - 
tags: - - openhpc + tags: openhpc tasks: - include_role: name: stackhpc.openhpc - tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" + tasks_from: "{{ {'install':'install.yml', 'runtime.yml':'configure'}[appliances_mode] | default('main.yml') }}" From 5cc7fbc7219432d1fdec08f530cedc7e183bca30 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Apr 2025 11:37:01 +0000 Subject: [PATCH 10/30] add new final.yml for both fatimage and site --- ansible/disable-repos.yml | 7 ------- ansible/fatimage.yml | 13 +------------ ansible/final.yml | 18 ++++++++++++++++++ ansible/site.yml | 2 +- 4 files changed, 20 insertions(+), 20 deletions(-) delete mode 100644 ansible/disable-repos.yml create mode 100644 ansible/final.yml diff --git a/ansible/disable-repos.yml b/ansible/disable-repos.yml deleted file mode 100644 index 3e8022965..000000000 --- a/ansible/disable-repos.yml +++ /dev/null @@ -1,7 +0,0 @@ -- hosts: dnf_repos - become: yes - tasks: - - name: Disable pulp repos - ansible.builtin.include_role: - name: dnf_repos - tasks_from: disable_repos.yml diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index f4df32521..db7fde901 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -197,15 +197,4 @@ import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists -- import_playbook: disable-repos.yml - -- hosts: builder - become: yes - gather_facts: yes - tags: finalise - tasks: - - name: Cleanup image - import_tasks: cleanup.yml - - - name: Shutdown Packer VM - community.general.shutdown: +- import_playbook: final.yml diff --git a/ansible/final.yml b/ansible/final.yml new file mode 100644 index 000000000..1830cd965 --- /dev/null +++ b/ansible/final.yml @@ -0,0 +1,18 @@ +- hosts: dnf_repos + become: yes + tasks: + - name: Disable pulp repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: disable_repos.yml + +- hosts: builder + become: yes + gather_facts: yes + tags: finalise + 
tasks: + - name: Cleanup image + import_tasks: cleanup.yml + + - name: Shutdown Packer VM + community.general.shutdown: diff --git a/ansible/site.yml b/ansible/site.yml index 7b64059ae..faeca23fd 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -36,6 +36,6 @@ import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists -- import_playbook: disable-repos.yml +- import_playbook: final.yml ... \ No newline at end of file From cefbd9adb625be33c97b53a37d398f8b6500efe2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Apr 2025 12:53:10 +0000 Subject: [PATCH 11/30] move compute-init install and configuration all to the end --- ansible/extras.yml | 19 ------------------- ansible/final.yml | 20 ++++++++++++++++++++ ansible/roles/compute_init/README.md | 27 ++++++++++++--------------- 3 files changed, 32 insertions(+), 34 deletions(-) diff --git a/ansible/extras.yml b/ansible/extras.yml index 428df1850..8e3248d3f 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -58,25 +58,6 @@ - import_role: name: persist_hostkeys -- name: Install compute_init playbook - hosts: compute_init:&builder - tags: compute_init # tagged to allow running on cluster instances for dev - become: yes - tasks: - - include_role: - name: compute_init - tasks_from: install.yml - -- name: Setup NFS export for compute node configuration - hosts: compute_init:!builder - # NB: has to be after eeesi and os-manila-mount - tags: compute_init - become: yes - tasks: - - include_role: - name: compute_init - tasks_from: export.yml - - name: Install k9s become: yes hosts: k9s diff --git a/ansible/final.yml b/ansible/final.yml index 1830cd965..8a12d12c3 100644 --- a/ansible/final.yml +++ b/ansible/final.yml @@ -1,3 +1,23 @@ +- hosts: compute_init + tags: compute_init + become: yes + tasks: + - name: Install compute_init playbook + ansible.builtin.include_role: + name: compute_init + tasks_from: 'install.yml' + when: "{{ appliances_mode == 'build' }}" + # 
conditional used instead of compute_init:&builder to make dev easier + +- hosts: compute_init:!builder + tags: compute_init + become: yes + tasks: + - name: Setup NFS export for compute node configuration + ansible.builtin.include_role: + name: compute_init + tasks_from: export.yml + - hosts: dnf_repos become: yes tasks: diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 81a62bade..0d470ee1b 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -143,35 +143,32 @@ a new image: additionally configure the control node to export compute hostvars over NFS. Check the cluster is up. -2. Reimage the compute nodes: +2. Optionally, reimage the compute nodes to reset services etc.: ansible-playbook --limit compute ansible/adhoc/rebuild.yml -3. Add metadata to a compute node e.g. via Horizon to turn on compute-init - playbook functionality. +3. Add metadata to a compute node (directly via Horizon or via OpenTofu) to + enable the new compute-init playbook functionality. -4. Stop ansible-init from running +4. Stop ansible-init from running: ansible all -ba "systemctl stop ansible-init" -5. Fake an image build to deploy the compute-init playbook: +5. Fake an image build and a rerun of the `site.yml` playbook: - ansible-playbook ansible/fatimage.yml --tags compute_init + ansible-playbook ansible/final.yml --tags compute_init - NB: This will also re-export the compute hostvars, as the nodes are not - in the builder group, which conveniently means any changes made to that - play also get picked up. + This both re-installs the compute-init playbook and re-configures the NFS + share with exported compute hostvars etc. -6. 
Fake a reimage of compute nodes to re-run ansible-init and the updated + compute-init playbook: ansible all -ba "rm -f /var/lib/ansible-init.done && systemctl restart ansible-init" - Use `systemctl status ansible-init` to view stdout/stderr from Ansible. - -Steps 4/5/6 can be repeated with changes to the compute script. If required, -reimage the compute node(s) first as in step 2 and/or add additional metadata -as in step 3. +7. Use `systemctl status ansible-init` to view stdout/stderr from Ansible. +Steps 4-7 can be repeated with changes to the compute script until it works. ## Design notes - Duplicating code in roles into the `compute-init` script is unfortunate, but From e1ee0eec7674b83a6e0f385ccc12dedbb32d627b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Apr 2025 13:18:40 +0000 Subject: [PATCH 12/30] fix appliances_mode not defined error --- ansible/iam.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/iam.yml b/ansible/iam.yml index a220819c8..37b648cd9 100644 --- a/ansible/iam.yml +++ b/ansible/iam.yml @@ -55,6 +55,6 @@ tags: sssd tasks: - name: Configure sssd - import_role: + ansible.builtin.include_role: name: sssd tasks_from: "{{ 'install.yml' if appliances_mode == 'build' else 'main.yml' }}" From c44b9f0828b4733d17c0a4c95f1869fdb5b1b6b1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Apr 2025 14:11:16 +0000 Subject: [PATCH 13/30] fix openhpc role task selection --- ansible/slurm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 1bf0bcbd9..94f2d4904 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -57,4 +57,4 @@ tasks: - include_role: name: stackhpc.openhpc - tasks_from: "{{ {'install':'install.yml', 'runtime.yml':'configure'}[appliances_mode] | default('main.yml') }}" + tasks_from: "{{ {'build':'install.yml', 'configure':'runtime.yml'}[appliances_mode] | default('main.yml') }}" From 0f07c36f4fc7a74ed51cc19552b86a12fb3c4603 Mon Sep 
17 00:00:00 2001 From: Steve Brasier Date: Thu, 24 Apr 2025 14:57:09 +0000 Subject: [PATCH 14/30] bump CI image --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 37fa04c35..9e36de71f 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250423-1606-b61e2f1a", - "RL9": "openhpc-RL9-250423-1606-b61e2f1a" + "RL8": "openhpc-RL8-250424-1413-c44b9f08", + "RL9": "openhpc-RL9-250424-1413-c44b9f08" } } From fc5889eae4a33128d38683301a87c73531a6a059 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 25 Apr 2025 08:33:38 +0000 Subject: [PATCH 15/30] make ansible_init dependencies correct in groups --- environments/common/inventory/groups | 6 ++++-- environments/common/layouts/everything | 8 ++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 1fc2a8424..a2bcba77b 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -133,8 +133,10 @@ freeipa_client [tuned] # Hosts to run TuneD configuration -[ansible_init] +[ansible_init:children] # Hosts to run linux-anisble-init +compute_init +gateway [sssd] # Hosts to configure sssd on @@ -181,4 +183,4 @@ extra_packages # Hosts where crony configuration is applied. See docs/chrony.md for more details. 
[gateway] -# Add builder to this group to install gateway ansible-init playbook into image +# Hosts to install compute_init playbook on during image build to configure ip gateway on boot diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 9f2c7c706..08d673912 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -83,10 +83,6 @@ openondemand [tuned:children] # Hosts to run TuneD configuration -[ansible_init:children] -# Hosts to run linux-ansible-init -cluster - [sssd] # Hosts to configure sssd on @@ -123,5 +119,5 @@ builder # Hosts where crony configuration is applied. See docs/chrony.md for more details. [gateway:children] -# Add builder to this group to install gateway ansible-init playbook into image -builder +# Hosts to install compute_init playbook on during image build to configure ip gateway on boot +cluster From 3120e4961ec54605d100a4064c66be3efd859a15 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 25 Apr 2025 10:08:21 +0000 Subject: [PATCH 16/30] bump CI image --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 9e36de71f..b02639411 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250424-1413-c44b9f08", - "RL9": "openhpc-RL9-250424-1413-c44b9f08" + "RL8": "openhpc-RL8-250425-0835-fc5889ea", + "RL9": "openhpc-RL9-250425-0835-fc5889ea" } } From b398135b924121b79f044d6d349af7f877ea4698 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 25 Apr 2025 12:39:45 +0000 Subject: [PATCH 17/30] make compute_init and rebuild group descriptions clearer --- environments/common/inventory/groups | 8 ++++++-- 
environments/common/layouts/everything | 9 +++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index a2bcba77b..3102524eb 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -63,7 +63,8 @@ mysql cluster [rebuild] -# Enable rebuild of nodes on an OpenStack cloud; add 'control' group. +# Add 'control' group to enable slurm-controlled rebuild of compute nodes +# NB: Compute nodes need compute_init enabled [update] # All hosts to (optionally) run yum update on. @@ -145,7 +146,10 @@ gateway # Hosts where the OpenSSH server daemon should be configured [compute_init] -# EXPERIMENTAL: Compute hosts to enable joining cluster on boot on +# EXPERIMENTAL: Compute hosts which should rejoin the cluster after rebuild +# without running site.yml playbook. +# NB: Additional configuration is required and not all functionality is +# currently supported - see ansible/roles/compute_init/README.md [k3s:children] # Hosts to run k3s server/agent diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 08d673912..5ac270b55 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -25,7 +25,9 @@ control [filebeat:children] slurm_stats -# NB: [rebuild] not defined here as likely to need features not currently supported +[rebuild] +# Add 'control' group to enable slurm-controlled rebuild of compute nodes +# NB: Compute nodes need compute_init enabled [update:children] @@ -90,7 +92,10 @@ openondemand # Hosts where the OpenSSH server daemon should be configured [compute_init] -# EXPERIMENTAL: Compute hosts to enable joining cluster on boot on +# EXPERIMENTAL: Compute hosts which should rejoin the cluster after rebuild +# without running site.yml playbook. 
+# NB: Additional configuration is required and not all functionality is +# currently supported - see ansible/roles/compute_init/README.md [k3s_server:children] # Hosts to run k3s server (should only be single node i.e control node) From 61d0c3b683f8e2a1b1c5531dd48a7795850e8339 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 30 Apr 2025 11:03:38 +0000 Subject: [PATCH 18/30] bump zenith client to 0.14 --- ansible/roles/zenith_proxy/defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/zenith_proxy/defaults/main.yml b/ansible/roles/zenith_proxy/defaults/main.yml index 6b1a43aaa..02267cb87 100644 --- a/ansible/roles/zenith_proxy/defaults/main.yml +++ b/ansible/roles/zenith_proxy/defaults/main.yml @@ -15,7 +15,7 @@ zenith_proxy_pod_name: "{{ zenith_proxy_service_name }}" zenith_proxy_client_container_name: "{{ zenith_proxy_client_service_name }}" zenith_proxy_mitm_container_name: "{{ zenith_proxy_mitm_service_name }}" -zenith_proxy_image_tag: '0.12.0' +zenith_proxy_image_tag: '0.14.0' zenith_proxy_client_image_repository: ghcr.io/azimuth-cloud/zenith-client zenith_proxy_client_image: "{{ zenith_proxy_client_image_repository }}:{{ zenith_proxy_image_tag }}" From c55bcc5fa82c5b76dcefdb252f811688720de228 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 30 Apr 2025 13:53:38 +0000 Subject: [PATCH 19/30] try removing zenith_proxy_client_auth_params --- environments/.caas/hooks/post.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/environments/.caas/hooks/post.yml b/environments/.caas/hooks/post.yml index 309610ff9..502985431 100644 --- a/environments/.caas/hooks/post.yml +++ b/environments/.caas/hooks/post.yml @@ -11,8 +11,8 @@ zenith_proxy_upstream_host: "{{ ansible_host }}" # IP zenith_proxy_upstream_port: "{{ grafana_port }}" zenith_proxy_client_token: "{{ zenith_token_monitoring }}" - zenith_proxy_client_auth_params: - tenancy-id: "{{ openstack_project_id }}" + # 
zenith_proxy_client_auth_params: + # tenancy-id: "{{ openstack_project_id }}" zenith_proxy_mitm_enabled: yes zenith_proxy_mitm_auth_inject: basic zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}" @@ -31,8 +31,8 @@ zenith_proxy_upstream_host: "{{ ansible_host }}" # IP zenith_proxy_upstream_port: 443 zenith_proxy_client_token: "{{ zenith_token_ood }}" - zenith_proxy_client_auth_params: - tenancy-id: "{{ openstack_project_id }}" + # zenith_proxy_client_auth_params: + # tenancy-id: "{{ openstack_project_id }}" zenith_proxy_mitm_enabled: yes zenith_proxy_mitm_auth_inject: basic zenith_proxy_mitm_auth_basic_username: azimuth From aa11f439ffa6b594b8cc23af8d01a30f4e045e08 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 30 Apr 2025 15:18:34 +0000 Subject: [PATCH 20/30] try host networking for zenith --- ansible/roles/zenith_proxy/templates/pod.service.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/zenith_proxy/templates/pod.service.j2 b/ansible/roles/zenith_proxy/templates/pod.service.j2 index d46617556..c2c365155 100644 --- a/ansible/roles/zenith_proxy/templates/pod.service.j2 +++ b/ansible/roles/zenith_proxy/templates/pod.service.j2 @@ -9,7 +9,7 @@ Type=simple Restart=always User={{ zenith_proxy_podman_user }} Group={{ zenith_proxy_podman_user }} -ExecStartPre=/usr/bin/podman pod create --replace --name {{ zenith_proxy_pod_name }} +ExecStartPre=/usr/bin/podman pod create --replace --name {{ zenith_proxy_pod_name }} --network=host ExecStartPre=/usr/bin/podman pod start {{ zenith_proxy_pod_name }} ExecStart=/usr/bin/podman-pod-infra-attach.sh {{ zenith_proxy_pod_name }} ExecStop=/usr/bin/podman pod stop --ignore -t 10 {{ zenith_proxy_pod_name }} From 35ebf8bf5c6a97460537df1775a269fab902c01a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 30 Apr 2025 15:27:43 +0000 Subject: [PATCH 21/30] Revert "try host networking for zenith" This reverts commit aa11f439ffa6b594b8cc23af8d01a30f4e045e08. 
--- ansible/roles/zenith_proxy/templates/pod.service.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/zenith_proxy/templates/pod.service.j2 b/ansible/roles/zenith_proxy/templates/pod.service.j2 index c2c365155..d46617556 100644 --- a/ansible/roles/zenith_proxy/templates/pod.service.j2 +++ b/ansible/roles/zenith_proxy/templates/pod.service.j2 @@ -9,7 +9,7 @@ Type=simple Restart=always User={{ zenith_proxy_podman_user }} Group={{ zenith_proxy_podman_user }} -ExecStartPre=/usr/bin/podman pod create --replace --name {{ zenith_proxy_pod_name }} --network=host +ExecStartPre=/usr/bin/podman pod create --replace --name {{ zenith_proxy_pod_name }} ExecStartPre=/usr/bin/podman pod start {{ zenith_proxy_pod_name }} ExecStart=/usr/bin/podman-pod-infra-attach.sh {{ zenith_proxy_pod_name }} ExecStop=/usr/bin/podman pod stop --ignore -t 10 {{ zenith_proxy_pod_name }} From 8d264278ed1b2847c85472ef4788d2a612fbf5eb Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 30 Apr 2025 15:32:47 +0000 Subject: [PATCH 22/30] bind grafana to all interfaces for caas --- environments/.caas/inventory/group_vars/all/grafana.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environments/.caas/inventory/group_vars/all/grafana.yml b/environments/.caas/inventory/group_vars/all/grafana.yml index 10fdc926c..5f62958f9 100644 --- a/environments/.caas/inventory/group_vars/all/grafana.yml +++ b/environments/.caas/inventory/group_vars/all/grafana.yml @@ -1 +1,2 @@ grafana_auth_anonymous: "{{ groups['openondemand'] | count > 0 }}" +grafana_address: '0.0.0.0' From 76497fc725b9520622e1d97f6368f960a89ca47c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 30 Apr 2025 15:48:43 +0000 Subject: [PATCH 23/30] make sssd and mysql consistent with others; don't run install during site.yml, by default --- ansible/iam.yml | 2 +- ansible/slurm.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/iam.yml b/ansible/iam.yml index 37b648cd9..a0c59df0f 
100644 --- a/ansible/iam.yml +++ b/ansible/iam.yml @@ -57,4 +57,4 @@ - name: Configure sssd ansible.builtin.include_role: name: sssd - tasks_from: "{{ 'install.yml' if appliances_mode == 'build' else 'main.yml' }}" + tasks_from: "{{ {'build':'install.yml', 'configure':'configure.yml'}[appliances_mode] | default('main.yml') }}" diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 94f2d4904..9445208e0 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -7,7 +7,7 @@ tasks: - include_role: name: mysql - tasks_from: "{{ 'install.yml' if appliances_mode == 'build' else 'main.yml' }}" + tasks_from: "{{ {'build':'install.yml', 'configure':'configure.yml'}[appliances_mode] | default('main.yml') }}" - name: Setup slurm-driven rebuild hosts: rebuild:!builder From f2a6689ca4a37b612bcabcd724fc074b0b3057d7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 May 2025 09:50:15 +0000 Subject: [PATCH 24/30] revert zenith pods to slirp4netns to allow rootless pods to reach host --- ansible/roles/zenith_proxy/templates/pod.service.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/zenith_proxy/templates/pod.service.j2 b/ansible/roles/zenith_proxy/templates/pod.service.j2 index d46617556..e10df23f7 100644 --- a/ansible/roles/zenith_proxy/templates/pod.service.j2 +++ b/ansible/roles/zenith_proxy/templates/pod.service.j2 @@ -9,7 +9,7 @@ Type=simple Restart=always User={{ zenith_proxy_podman_user }} Group={{ zenith_proxy_podman_user }} -ExecStartPre=/usr/bin/podman pod create --replace --name {{ zenith_proxy_pod_name }} +ExecStartPre=/usr/bin/podman pod create --replace --name {{ zenith_proxy_pod_name }} --network=slirp4netns ExecStartPre=/usr/bin/podman pod start {{ zenith_proxy_pod_name }} ExecStart=/usr/bin/podman-pod-infra-attach.sh {{ zenith_proxy_pod_name }} ExecStop=/usr/bin/podman pod stop --ignore -t 10 {{ zenith_proxy_pod_name }} From 17a41074ce6f2cc9305d80513557215e28da64ea Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 
May 2025 10:11:20 +0000 Subject: [PATCH 25/30] Revert "bind grafana to all interfaces for caas" This reverts commit 8d264278ed1b2847c85472ef4788d2a612fbf5eb. --- environments/.caas/inventory/group_vars/all/grafana.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/environments/.caas/inventory/group_vars/all/grafana.yml b/environments/.caas/inventory/group_vars/all/grafana.yml index 5f62958f9..10fdc926c 100644 --- a/environments/.caas/inventory/group_vars/all/grafana.yml +++ b/environments/.caas/inventory/group_vars/all/grafana.yml @@ -1,2 +1 @@ grafana_auth_anonymous: "{{ groups['openondemand'] | count > 0 }}" -grafana_address: '0.0.0.0' From 33b9991e298a64fa585849b87dd602cf4298ec27 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 May 2025 10:32:32 +0000 Subject: [PATCH 26/30] remove commented-out zenith config --- environments/.caas/hooks/post.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/environments/.caas/hooks/post.yml b/environments/.caas/hooks/post.yml index 502985431..eaaeb23f9 100644 --- a/environments/.caas/hooks/post.yml +++ b/environments/.caas/hooks/post.yml @@ -11,8 +11,6 @@ zenith_proxy_upstream_host: "{{ ansible_host }}" # IP zenith_proxy_upstream_port: "{{ grafana_port }}" zenith_proxy_client_token: "{{ zenith_token_monitoring }}" - # zenith_proxy_client_auth_params: - # tenancy-id: "{{ openstack_project_id }}" zenith_proxy_mitm_enabled: yes zenith_proxy_mitm_auth_inject: basic zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}" @@ -31,8 +29,6 @@ zenith_proxy_upstream_host: "{{ ansible_host }}" # IP zenith_proxy_upstream_port: 443 zenith_proxy_client_token: "{{ zenith_token_ood }}" - # zenith_proxy_client_auth_params: - # tenancy-id: "{{ openstack_project_id }}" zenith_proxy_mitm_enabled: yes zenith_proxy_mitm_auth_inject: basic zenith_proxy_mitm_auth_basic_username: azimuth From 4d77c386ba6d985a1e382f45c0ceaccf4c027aef Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 May 2025 11:05:23 +0000 
Subject: [PATCH 27/30] fix gateway group wording --- environments/common/inventory/groups | 5 +++-- environments/common/layouts/everything | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 3102524eb..d8ac9704c 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -135,7 +135,7 @@ freeipa_client # Hosts to run TuneD configuration [ansible_init:children] -# Hosts to run linux-anisble-init +# Hosts to run linux-ansible-init compute_init gateway @@ -187,4 +187,5 @@ extra_packages # Hosts where crony configuration is applied. See docs/chrony.md for more details. [gateway] -# Hosts to install compute_init playbook on during image build to configure ip gateway on boot +# Hosts to install gateway configuration functionality on during image build. +# The actual configuration is performed on boot using ansible-init. diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 5ac270b55..2a419a05a 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -124,5 +124,7 @@ builder # Hosts where crony configuration is applied. See docs/chrony.md for more details. [gateway:children] -# Hosts to install compute_init playbook on during image build to configure ip gateway on boot +# Hosts to install gateway configuration functionality on during image build. +# The actual configuration is performed on boot using ansible-init. +# Default `cluster` means this is installed during "fat image" builds. 
cluster From 3d05772b31e7021d43b1cb780c6b36987830bdc8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 May 2025 11:49:54 +0000 Subject: [PATCH 28/30] get caas hpctests working with root-squashed nfs --- environments/.caas/inventory/group_vars/all/hpctests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/environments/.caas/inventory/group_vars/all/hpctests.yml b/environments/.caas/inventory/group_vars/all/hpctests.yml index 192c90c5a..a6a2c9174 100644 --- a/environments/.caas/inventory/group_vars/all/hpctests.yml +++ b/environments/.caas/inventory/group_vars/all/hpctests.yml @@ -8,3 +8,6 @@ hpctests_outdir: "{{ playbook_dir }}/.tmp/hpctests" # hpctests run by default in Azimuth but not trying to stress-test the nodes # just check compiler, mpi etc works hpctests_hpl_mem_frac: 0.05 # 5% node memory + +# use basic_user-defined user: +hpctests_user: azimuth From f04fae9c4368d423827699786e5b2fcf48c6e888 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 May 2025 11:59:25 +0000 Subject: [PATCH 29/30] combine caas groups files (now not symlinked from everything anyway) --- environments/.caas/inventory/extra_groups | 16 ---------------- environments/.caas/inventory/groups | 11 ++++++++++- 2 files changed, 10 insertions(+), 17 deletions(-) delete mode 100644 environments/.caas/inventory/extra_groups diff --git a/environments/.caas/inventory/extra_groups b/environments/.caas/inventory/extra_groups deleted file mode 100644 index 45a1dc7aa..000000000 --- a/environments/.caas/inventory/extra_groups +++ /dev/null @@ -1,16 +0,0 @@ -[basic_users:children] -cluster - -[etc_hosts:children] -cluster - -[zenith:children] -grafana -openondemand - -[manila:children] -login -compute - -[podman:children] -zenith diff --git a/environments/.caas/inventory/groups b/environments/.caas/inventory/groups index f5665790f..4441385af 100644 --- a/environments/.caas/inventory/groups +++ b/environments/.caas/inventory/groups @@ -69,8 +69,10 @@ openhpc [proxy] # Hosts to 
configure http/s proxies - see ansible/roles/proxy/README.md -[manila] +[manila:children] # Hosts to configure for manila fileshares +login +compute [persist_hostkeys:children] # Hosts to use common set of hostkeys which persist across reimaging. @@ -125,3 +127,10 @@ builder [gateway:children] # Add builder to this group to install gateway ansible-init playbook into image builder + +[zenith:children] +grafana +openondemand + +[podman:children] +zenith From e94abb236eb25548f068ce565187107949818dda Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 1 May 2025 12:06:31 +0000 Subject: [PATCH 30/30] caas: mount homedirs on control node too for manila for consistency and fix homedir creation --- environments/.caas/inventory/group_vars/all/basic_users.yml | 3 +++ environments/.caas/inventory/groups | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/environments/.caas/inventory/group_vars/all/basic_users.yml b/environments/.caas/inventory/group_vars/all/basic_users.yml index 2823a4862..0e381486e 100644 --- a/environments/.caas/inventory/group_vars/all/basic_users.yml +++ b/environments/.caas/inventory/group_vars/all/basic_users.yml @@ -10,3 +10,6 @@ basic_users_users: - adm - systemd-journal sudo: azimuth ALL=(ALL) NOPASSWD:ALL + +# the path *on the control node* for the home directories depends on the filesystem: +basic_users_homedir_server_path: "{{ '/home' if cluster_home_manila_share | bool else '/exports/home' }}" diff --git a/environments/.caas/inventory/groups b/environments/.caas/inventory/groups index 4441385af..dbafc523e 100644 --- a/environments/.caas/inventory/groups +++ b/environments/.caas/inventory/groups @@ -71,8 +71,7 @@ openhpc [manila:children] # Hosts to configure for manila fileshares -login -compute +cluster [persist_hostkeys:children] # Hosts to use common set of hostkeys which persist across reimaging.