Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
e716a16
use TF-based jumphost for leafcloud
sjpb Jan 6, 2026
ee96604
update packer bastion details
sjpb Jan 6, 2026
f53e2c9
don't rely on jumphost having private key
sjpb Jan 7, 2026
3e58718
try fixing quoting for ssh args
sjpb Jan 7, 2026
888513e
try using ssh agent for build instead
sjpb Jan 7, 2026
920817a
fix default bastion private keyfile
sjpb Jan 7, 2026
8bc255d
build works locally
sjpb Jan 7, 2026
bb66270
debug ansible connection info
sjpb Jan 7, 2026
efc9914
print pubkey during fatimage ssh setup
sjpb Jan 7, 2026
9e17edb
remove identitiesonly (doubled in connection string) and show ssh ver…
sjpb Jan 7, 2026
12d44f0
try without specifying key location
sjpb Jan 7, 2026
2b5c6ac
try explicit proxycommand identity file
sjpb Jan 7, 2026
1a5ed79
test bastion
sjpb Jan 7, 2026
9677e18
update bastion fingerprints
sjpb Jan 7, 2026
a990a9a
turnoff verbose ansible during build
sjpb Jan 7, 2026
6132e45
add other CI cloud bastion fingerprints back
sjpb Jan 8, 2026
791e152
provide same manual run options to extrabuild workflow as fatimage has
sjpb Jan 8, 2026
42aebaa
debug extrabuild
sjpb Jan 8, 2026
a74fa80
remove debugging
sjpb Jan 8, 2026
43d6660
extrabuild use repo CI_CLOUD
sjpb Jan 8, 2026
1a4bbdc
use PR label for CI CLOUD for automatic extrabuilds
sjpb Jan 8, 2026
f5ac850
temporarily bump CI image to test workflow_call extrabuild
sjpb Jan 8, 2026
29ee001
Revert "temporarily bump CI image to test workflow_call extrabuild"
sjpb Jan 8, 2026
80d0dc8
store bastion fingerprints in repo vars to decouple from checkout
sjpb Jan 8, 2026
1b468a7
remove unneeded ssh setup from trivy scan
sjpb Jan 8, 2026
04730c0
fixup packer on error logic for extrabuild
sjpb Jan 8, 2026
b0f2a0b
fix splatting of bastion fingerprints
sjpb Jan 8, 2026
dddcedc
revert changes to ansible_ssh_common_args to see if stackhpc workflow…
sjpb Jan 8, 2026
b90397e
make stackhpc workflow bastion definitions always use current branch
sjpb Jan 8, 2026
9656175
fix previous
sjpb Jan 8, 2026
a737c00
add retries to compute/login rebuild in CI due to Leafcloud flakiness
sjpb Jan 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 41 additions & 4 deletions .github/workflows/extra.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,23 @@ name: Test extra build
on:
workflow_call:
workflow_dispatch:
# checkov:skip=CKV_GHA_7: "The build output cannot be affected by user parameters other than the build entry point and the top-level source location. GitHub Actions workflow_dispatch inputs MUST be empty. "
inputs:
ci_cloud:
description: 'Select the CI_CLOUD'
required: true
type: choice
options:
- default
- LEAFCLOUD
- SMS
- ARCUS
default: default # Use repo CI_CLOUD setting or PR label
cleanup_on_failure:
description: Cleanup Packer resources on failure
type: boolean
required: true
default: true

permissions:
contents: read
Expand All @@ -34,9 +51,10 @@ jobs:
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings
CI_CLOUD: ${{ github.event.inputs.ci_cloud == 'default' && vars.CI_CLOUD || github.event.inputs.ci_cloud || vars.CI_CLOUD }}
ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
PACKER_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PACKER_ON_ERROR: ${{ github.event.inputs.cleanup_on_failure == 'false' && 'abort' || vars.PACKER_ON_ERROR }}

steps:
- uses: actions/checkout@v4
Expand All @@ -50,9 +68,24 @@ jobs:
echo EOF
} >> "$GITHUB_ENV"

# Let a pull-request label of the form "CI_CLOUD=<name>" choose the cloud for this run.
- name: Override CI_CLOUD if PR label is present
  if: ${{ github.event_name == 'pull_request' }}
  env:
    # Hand the label JSON to the script through the environment instead of
    # pasting it into the shell source: a quote character in a label name can
    # then no longer terminate the string and be run as shell code.
    PR_LABELS_JSON: ${{ toJSON(github.event.pull_request.labels) }}
  run: |
    # One label name per line
    labels=$(jq -r '.[].name' <<< "$PR_LABELS_JSON")
    echo "$labels"
    # Read line by line so label names containing spaces are not word-split
    while IFS= read -r label; do
      if [[ $label == CI_CLOUD=* ]]; then
        # Extract the value after 'CI_CLOUD='
        CI_CLOUD_OVERRIDE=${label#CI_CLOUD=}
        # Written to GITHUB_ENV so later steps see the overridden CI_CLOUD
        echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> "$GITHUB_ENV"
      fi
    done <<< "$labels"

# Print the effective settings into the job log for later debugging.
- name: Record settings
  run: |
    # CI_CLOUD and PACKER_ON_ERROR are job-level env entries, so they are
    # read here as quoted shell variables (same form as FAT_IMAGES) rather
    # than being expanded into the script text before the shell parses it.
    echo "CI_CLOUD: ${CI_CLOUD}"
    echo "PACKER_ON_ERROR: ${PACKER_ON_ERROR}"
    echo "FAT_IMAGES: ${FAT_IMAGES}"

- name: Setup ssh
Expand All @@ -61,10 +94,14 @@ jobs:
mkdir ~/.ssh
echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa
chmod 0600 ~/.ssh/id_rsa
ssh-keygen -f ~/.ssh/id_rsa -y # tests key format is correct
shell: bash

- name: Add bastion's ssh key to known_hosts
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
# Appends the BASTION_FINGERPRINTS configuration variable to ~/.ssh/known_hosts.
# The expression sits between the heredoc markers; the quoted 'EOF' delimiter
# makes the shell append that text as-is, without expanding anything in it.
- name: Add bastion ssh fingerprints to known_hosts
run: |
cat >> ~/.ssh/known_hosts << 'EOF'
${{ vars.BASTION_FINGERPRINTS }}
EOF
shell: bash

- name: Install ansible etc
Expand All @@ -91,7 +128,7 @@ jobs:
packer init .

PACKER_LOG=1 packer build \
-on-error=${{ vars.PACKER_ON_ERROR }} \
-on-error=${{ env.PACKER_ON_ERROR }} \
-var-file="$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl" \
-var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \
-var "image_name=${{ matrix.build.image_name }}" \
Expand Down
8 changes: 6 additions & 2 deletions .github/workflows/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,14 @@ jobs:
mkdir ~/.ssh
echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa
chmod 0600 ~/.ssh/id_rsa
ssh-keygen -f ~/.ssh/id_rsa -y # tests key format is correct
shell: bash

- name: Add bastion's ssh key to known_hosts
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
# Appends the BASTION_FINGERPRINTS configuration variable to ~/.ssh/known_hosts.
# The expression sits between the heredoc markers; the quoted 'EOF' delimiter
# makes the shell append that text as-is, without expanding anything in it.
- name: Add bastion ssh fingerprints to known_hosts
run: |
cat >> ~/.ssh/known_hosts << 'EOF'
${{ vars.BASTION_FINGERPRINTS }}
EOF
shell: bash

- name: Install ansible etc
Expand Down
21 changes: 18 additions & 3 deletions .github/workflows/stackhpc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,20 @@ jobs:
mkdir ~/.ssh
echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa
chmod 0600 ~/.ssh/id_rsa
ssh-keygen -f ~/.ssh/id_rsa -y # tests key format is correct
shell: bash

- name: Add bastion's ssh key to known_hosts
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
# Appends the BASTION_FINGERPRINTS configuration variable to ~/.ssh/known_hosts.
# The expression sits between the heredoc markers; the quoted 'EOF' delimiter
# makes the shell append that text as-is, without expanding anything in it.
- name: Add bastion ssh fingerprints to known_hosts
run: |
cat >> ~/.ssh/known_hosts << 'EOF'
${{ vars.BASTION_FINGERPRINTS }}
EOF
shell: bash

# Replace the checked-out bastion.yml with the copy from the branch that
# triggered this run, so the bastion user/IP definitions match that branch.
- name: Ensure Ansible bastion definitions are from **current** branch
  env:
    # github.head_ref is chosen by the pull-request author. Passing it via the
    # environment and quoting it keeps a crafted branch name from being parsed
    # as shell syntax. Falls back to ref_name when head_ref is empty.
    CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }}
  run: |
    git fetch origin "$CURRENT_BRANCH"
    git checkout "origin/$CURRENT_BRANCH" -- environments/.stackhpc/inventory/group_vars/all/bastion.yml
  shell: bash

- uses: actions/setup-python@v6
Expand Down Expand Up @@ -174,7 +184,12 @@ jobs:
. environments/.stackhpc/activate
cd "$STACKHPC_TF_DIR"
tofu init
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
max_retries=3
delay=30
for i in $(seq 1 $max_retries); do
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" && break
[ "$i" -lt "$max_retries" ] && sleep $delay || exit 1
done

- name: Configure cluster using current branch
run: |
Expand Down
12 changes: 0 additions & 12 deletions .github/workflows/trivyscan.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,6 @@ jobs:
run: |
echo CI_CLOUD: ${{ env.CI_CLOUD }}

- name: Setup ssh
run: |
set -x
mkdir ~/.ssh
echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa
chmod 0600 ~/.ssh/id_rsa
shell: bash

- name: Add bastion's ssh key to known_hosts
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
shell: bash

- name: setup environment
run: |
python3 -m venv venv
Expand Down
2 changes: 1 addition & 1 deletion environments/.stackhpc/LEAFCLOUD.pkrvars.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ ssh_private_key_file = "~/.ssh/id_rsa"
security_groups = ["default", "SSH"]
# see environments/.stackhpc/inventory/group_vars/all/bastion.yml:
ssh_bastion_username = "slurm-app-ci"
ssh_bastion_host = "195.114.30.222"
ssh_bastion_host = "45.135.59.32"
ssh_bastion_private_key_file = "~/.ssh/id_rsa"
8 changes: 0 additions & 8 deletions environments/.stackhpc/bastion_fingerprints

This file was deleted.

6 changes: 3 additions & 3 deletions environments/.stackhpc/inventory/group_vars/all/bastion.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@ bastion_config:
ARCUS:
user: slurm-app-ci
ip: 128.232.222.183
LEAFCLOUD:
LEAFCLOUD: # https://github.com/stackhpc/leafcloud-slurm-jumphost
user: slurm-app-ci
ip: 195.114.30.222
ip: 45.135.59.32
SMS:
user: slurm-app-ci
ip: 185.45.78.150
# NB: The bastion_{user,ip} variables are used directly in the CI workflow too
bastion_user: "{{ bastion_config[ci_cloud].user }}"
bastion_ip: "{{ bastion_config[ci_cloud].ip }}"
ansible_ssh_common_args: '-o ProxyCommand="ssh {{ bastion_user }}@{{ bastion_ip }} -W %h:%p"'
ansible_ssh_common_args: "-o ProxyCommand='ssh {{ bastion_user }}@{{ bastion_ip }} -W %h:%p'"
2 changes: 1 addition & 1 deletion packer/openstack.pkr.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ variable "ssh_bastion_username" {

variable "ssh_bastion_private_key_file" {
type = string
default = "~/.ssh/id_rsa"
default = null
}

variable "floating_ip_network" {
Expand Down