Skip to content

Commit f349fde

Browse files
authored
Merge pull request #488 from bedroge/software_rebuilds
Add functionality for rebuilding software: try it on OpenMPI 4.1.x to fix `smcuda` issue
2 parents 73905db + bde75ee commit f349fde

6 files changed

+207
-41
lines changed

EESSI-install-software.sh

+10-9
Original file line numberDiff line numberDiff line change
@@ -207,26 +207,27 @@ changed_easystacks=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z
207207
if [ -z ${changed_easystacks} ]; then
208208
echo "No missing installations, party time!" # Ensure the bot report success, as there was nothing to be build here
209209
else
210+
210211
for easystack_file in ${changed_easystacks}; do
211-
212+
212213
echo -e "Processing easystack file ${easystack_file}...\n\n"
213-
214+
214215
# determine version of EasyBuild module to load based on EasyBuild version included in name of easystack file
215216
eb_version=$(echo ${easystack_file} | sed 's/.*eb-\([0-9.]*\).*/\1/g')
216-
217+
217218
# load EasyBuild module (will be installed if it's not available yet)
218219
source ${TOPDIR}/load_easybuild_module.sh ${eb_version}
219-
220+
220221
${EB} --show-config
221-
222+
222223
echo_green "All set, let's start installing some software with EasyBuild v${eb_version} in ${EASYBUILD_INSTALLPATH}..."
223-
224+
224225
if [ -f ${easystack_file} ]; then
225226
echo_green "Feeding easystack file ${easystack_file} to EasyBuild..."
226-
227+
227228
${EB} --easystack ${TOPDIR}/${easystack_file} --robot
228229
ec=$?
229-
230+
230231
# copy EasyBuild log file if EasyBuild exited with an error
231232
if [ ${ec} -ne 0 ]; then
232233
eb_last_log=$(unset EB_VERBOSE; eb --last-log)
@@ -241,7 +242,7 @@ else
241242
else
242243
fatal_error "Easystack file ${easystack_file} not found!"
243244
fi
244-
245+
245246
done
246247
fi
247248

EESSI-remove-software.sh

+125
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
#!/bin/bash
2+
#
3+
# Script to remove part of the EESSI software stack (version set through init/eessi_defaults)
4+
5+
# see example parsing of command line arguments at
6+
# https://wiki.bash-hackers.org/scripting/posparams#using_a_while_loop
7+
# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash
8+
9+
# Print the usage summary of this script to stdout:
# the invocation line (using $0 as script name) followed by one line per supported option.
display_help() {
  printf '%s\n' \
    "usage: $0 [OPTIONS]" \
    " -g | --generic - instructs script to build for generic architecture target" \
    " -h | --help - display this usage information"
}
14+
15+
# Parse the command line.
#   -g | --generic : store "--generic" in DETECTION_PARAMETERS
#   -h | --help    : print the usage information and exit with status 0
# Any other argument starting with '-' aborts the script with status 1;
# all remaining arguments are collected and restored as positional parameters.
POSITIONAL_ARGS=()
# start from a known-empty value, so that a DETECTION_PARAMETERS variable inherited
# from the calling environment cannot leak into the detection step further down
DETECTION_PARAMETERS=""

while [[ $# -gt 0 ]]; do
  case "$1" in
    -g|--generic)
      DETECTION_PARAMETERS="--generic"
      shift
      ;;
    -h|--help)
      display_help
      # no shifting needed here, we're done.
      exit 0
      ;;
    -*)
      # '-*' also matches long options such as '--foo', a separate '--*' pattern is redundant
      echo "Error: Unknown option: $1" >&2
      exit 1
      ;;
    *)
      # no more options: save positional arg
      POSITIONAL_ARGS+=("$1")
      shift
      ;;
  esac
done

# restore the arguments that were not consumed as options
set -- "${POSITIONAL_ARGS[@]}"
40+
41+
# directory that holds this script (resolved via realpath); both command
# substitutions are quoted so that a path containing whitespace stays one word
TOPDIR=$(dirname "$(realpath "$0")")

# private temporary directory for this run; assign first and export afterwards,
# because 'export VAR=$(cmd)' would hide a failure of mktemp behind the status of 'export'
TMPDIR=$(mktemp -d /tmp/eessi-remove.XXXXXXXX) || { echo "Error: failed to create temporary directory" >&2; exit 1; }
export TMPDIR

# helper script from this repository
# NOTE(review): presumably provides fatal_error/echo_green/echo_yellow used further down - confirm
source "${TOPDIR}/scripts/utils.sh"
46+
47+
# Determine the software subdirectory for this host: a pre-defined
# $EESSI_SOFTWARE_SUBDIR_OVERRIDE is used as is, otherwise the value is taken
# from the output of eessi_software_subdir.py and exported.
echo ">> Determining software subdirectory to use for current build host..."
if [[ -z "${EESSI_SOFTWARE_SUBDIR_OVERRIDE}" ]]; then
    # $DETECTION_PARAMETERS is deliberately left unquoted: when it is empty it has to
    # expand to no argument at all rather than to an empty-string argument
    EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 "${TOPDIR}/eessi_software_subdir.py" $DETECTION_PARAMETERS)
    export EESSI_SOFTWARE_SUBDIR_OVERRIDE
    echo ">> Determined \$EESSI_SOFTWARE_SUBDIR_OVERRIDE via 'eessi_software_subdir.py $DETECTION_PARAMETERS' script"
else
    echo ">> Picking up pre-defined \$EESSI_SOFTWARE_SUBDIR_OVERRIDE: ${EESSI_SOFTWARE_SUBDIR_OVERRIDE}"
fi
54+
55+
echo ">> Setting up environment..."

# init script from this repository
# NOTE(review): presumably defines the EESSI_* variables checked below - confirm
source "${TOPDIR}/init/bash"

# The operand has to be quoted: with an empty or unset $EESSI_CVMFS_REPO an
# unquoted '[ -d $VAR ]' collapses to the one-argument test '[ -d ]', which is
# always true and would wrongly report the repository as available.
if [[ -d "${EESSI_CVMFS_REPO}" ]]; then
    echo_green "$EESSI_CVMFS_REPO available, OK!"
else
    fatal_error "$EESSI_CVMFS_REPO is not available!"
fi
64+
65+
# Sanity check on the software subdirectory: $EESSI_SOFTWARE_SUBDIR must be
# non-empty and identical to $EESSI_SOFTWARE_SUBDIR_OVERRIDE; each failure mode
# gets its own error message.
if [[ -n ${EESSI_SOFTWARE_SUBDIR} && "${EESSI_SOFTWARE_SUBDIR}" == "${EESSI_SOFTWARE_SUBDIR_OVERRIDE}" ]]; then
    echo_green ">> Using ${EESSI_SOFTWARE_SUBDIR} as software subdirectory!"
elif [[ -z ${EESSI_SOFTWARE_SUBDIR} ]]; then
    fatal_error "Failed to determine software subdirectory?!"
else
    # set, but not the same as the override
    fatal_error "Values for EESSI_SOFTWARE_SUBDIR_OVERRIDE (${EESSI_SOFTWARE_SUBDIR_OVERRIDE}) and EESSI_SOFTWARE_SUBDIR (${EESSI_SOFTWARE_SUBDIR}) differ!"
fi
72+
73+
echo ">> Configuring EasyBuild..."
# name of the EasyBuild command, set before configure_easybuild is sourced
EB="eb"
# quoted so that a $TOPDIR containing whitespace is passed as a single path
source "${TOPDIR}/configure_easybuild"

echo ">> Setting up \$MODULEPATH..."
# make sure no modules are loaded
module --force purge
# ignore current $MODULEPATH entirely
# (left unquoted on purpose: an empty $MODULEPATH then passes no argument at all)
module unuse $MODULEPATH
module use "${EASYBUILD_INSTALLPATH}/modules/all"
# 'module use' should have put the path above into $MODULEPATH; an empty value means the setup failed
if [[ -z ${MODULEPATH} ]]; then
    fatal_error "Failed to set up \$MODULEPATH?!"
else
    echo_green ">> MODULEPATH set up: ${MODULEPATH}"
fi
88+
89+
# assume there's only one diff file that corresponds to the PR patch file
90+
# pick the first regular file matching the pattern via shell globbing (no parsing of 'ls' output);
# pr_diff stays empty when there is no such file
pr_diff=""
for candidate_diff in [0-9]*.diff; do
    # an unmatched glob is left as the literal pattern, so check that the file really exists
    if [[ -f "${candidate_diff}" ]]; then
        pr_diff="${candidate_diff}"
        break
    fi
done

# if this script is run as root, use PR patch file to determine if software needs to be removed first
if [[ ${EUID} -eq 0 ]]; then
    # easystack files in a rebuilds/ subdirectory that are touched by the PR:
    # take the paths from the '+++ <prefix>/<path>' lines of the diff, strip the one-letter prefix,
    # keep easystacks/...yml paths, and drop the ones matching known-issues or missing
    changed_easystacks_rebuilds=""
    if [[ -n "${pr_diff}" ]]; then
        # only read the diff when one was found; with an empty file name the pipeline would wait on stdin
        changed_easystacks_rebuilds=$(grep '^+++' "${pr_diff}" | cut -f2 -d' ' | sed 's@^[a-z]/@@g' \
            | grep '^easystacks/.*yml$' | grep -E -v 'known-issues|missing' | grep "/rebuilds/")
    fi

    # '[[ ]]' with a quoted operand: the list may hold several file names,
    # which makes an unquoted '[ -z ... ]' fail with "too many arguments"
    if [[ -z "${changed_easystacks_rebuilds}" ]]; then
        echo "No software needs to be removed."
    else
        # word splitting is intended here: one easystack path per word
        for easystack_file in ${changed_easystacks_rebuilds}; do
            # determine version of EasyBuild module to load based on EasyBuild version included in name of easystack file
            eb_version=$(echo "${easystack_file}" | sed 's/.*eb-\([0-9.]*\).*/\1/g')

            # load EasyBuild module (will be installed if it's not available yet)
            source "${TOPDIR}/load_easybuild_module.sh" "${eb_version}"

            if [[ -f "${easystack_file}" ]]; then
                echo_green "Software rebuild(s) requested in ${easystack_file}, so determining which existing installation have to be removed..."
                # we need to remove existing installation directories first,
                # so let's figure out which modules have to be rebuilt by doing a dry-run and grepping "someapp/someversion" for the relevant lines (with [R])
                # * [R] $CFGS/s/someapp/someapp-someversion.eb (module: someapp/someversion)
                rebuild_apps=$(eb --allow-use-as-root-and-accept-consequences --dry-run-short --rebuild --easystack "${easystack_file}" \
                    | grep "^ \* \[R\]" | grep -o "module: .*[^)]" | awk '{print $2}')
                for app in ${rebuild_apps}; do
                    # ':?' aborts the script instead of composing a path directly under / when
                    # $EASYBUILD_INSTALLPATH is empty or unset
                    app_dir="${EASYBUILD_INSTALLPATH:?}/software/${app}"
                    app_module="${EASYBUILD_INSTALLPATH:?}/modules/all/${app}.lua"
                    echo_yellow "Removing ${app_dir} and ${app_module}..."
                    rm -rf -- "${app_dir}"
                    rm -rf -- "${app_module}"
                done
            else
                fatal_error "Easystack file ${easystack_file} not found!"
            fi
        done
    fi
else
    fatal_error "This script can only be run by root!"
fi

bot/build.sh

+46-11
Original file line numberDiff line numberDiff line change
@@ -168,12 +168,56 @@ COMMON_ARGS+=("--mode" "run")
168168
# make sure to use the same parent dir for storing tarballs of tmp
169169
PREVIOUS_TMP_DIR=${PWD}/previous_tmp
170170

171+
# prepare arguments to install_software_layer.sh (specific to build step)
172+
declare -a BUILD_STEP_ARGS=()
173+
declare -a INSTALL_SCRIPT_ARGS=()
174+
declare -a REMOVAL_SCRIPT_ARGS=()
175+
if [[ ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} =~ .*/generic$ ]]; then
176+
INSTALL_SCRIPT_ARGS+=("--generic")
177+
REMOVAL_SCRIPT_ARGS+=("--generic")
178+
fi
179+
[[ ! -z ${BUILD_LOGS_DIR} ]] && INSTALL_SCRIPT_ARGS+=("--build-logs-dir" "${BUILD_LOGS_DIR}")
180+
[[ ! -z ${SHARED_FS_PATH} ]] && INSTALL_SCRIPT_ARGS+=("--shared-fs-path" "${SHARED_FS_PATH}")
181+
182+
# determine if the removal step has to be run
183+
# assume there's only one diff file that corresponds to the PR patch file
184+
pr_diff=$(ls [0-9]*.diff | head -1)
185+
changed_easystacks_rebuilds=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | grep "/rebuilds/")
186+
if [[ -z ${changed_easystacks_rebuilds} ]]; then
187+
echo "This PR does not add any easystack files in a rebuilds subdirectory, so let's skip the removal step."
188+
else
189+
# prepare directory to store tarball of tmp for removal and build steps
190+
TARBALL_TMP_REMOVAL_STEP_DIR=${PREVIOUS_TMP_DIR}/removal_step
191+
mkdir -p ${TARBALL_TMP_REMOVAL_STEP_DIR}
192+
193+
# prepare arguments to eessi_container.sh specific to remove step
194+
declare -a REMOVAL_STEP_ARGS=()
195+
# save the tarball of the removal step's tmp directory in the directory prepared for it above;
# TARBALL_TMP_BUILD_STEP_DIR is only defined later on, when the build step is prepared
REMOVAL_STEP_ARGS+=("--save" "${TARBALL_TMP_REMOVAL_STEP_DIR}")
196+
REMOVAL_STEP_ARGS+=("--storage" "${STORAGE}")
197+
# add fakeroot option in order to be able to remove software, see:
198+
# https://github.com/EESSI/software-layer/issues/312
199+
REMOVAL_STEP_ARGS+=("--fakeroot")
200+
201+
# create tmp file for output of removal step
202+
removal_outerr=$(mktemp remove.outerr.XXXX)
203+
204+
echo "Executing command to remove software:"
205+
echo "./eessi_container.sh ${COMMON_ARGS[@]} ${REMOVAL_STEP_ARGS[@]}"
206+
echo " -- ./EESSI-remove-software.sh \"${REMOVAL_SCRIPT_ARGS[@]}\" \"$@\" 2>&1 | tee -a ${removal_outerr}"
207+
./eessi_container.sh "${COMMON_ARGS[@]}" "${REMOVAL_STEP_ARGS[@]}" \
208+
-- ./EESSI-remove-software.sh "${REMOVAL_SCRIPT_ARGS[@]}" "$@" 2>&1 | tee -a ${removal_outerr}
209+
210+
# make sure that the build step resumes from the same temporary directory
211+
# this is important, as otherwise the removed software will still be there
212+
REMOVAL_TMPDIR=$(grep ' as tmp directory ' ${removal_outerr} | cut -d ' ' -f 2)
213+
BUILD_STEP_ARGS+=("--resume" "${REMOVAL_TMPDIR}")
214+
fi
215+
171216
# prepare directory to store tarball of tmp for build step
172217
TARBALL_TMP_BUILD_STEP_DIR=${PREVIOUS_TMP_DIR}/build_step
173218
mkdir -p ${TARBALL_TMP_BUILD_STEP_DIR}
174219

175220
# prepare arguments to eessi_container.sh specific to build step
176-
declare -a BUILD_STEP_ARGS=()
177221
BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}")
178222
BUILD_STEP_ARGS+=("--storage" "${STORAGE}")
179223
# add options required to handle NVIDIA support
@@ -182,14 +226,6 @@ if [[ ! -z ${SHARED_FS_PATH} ]]; then
182226
BUILD_STEP_ARGS+=("--host-injections" "${SHARED_FS_PATH}/host-injections")
183227
fi
184228

185-
# prepare arguments to install_software_layer.sh (specific to build step)
186-
declare -a INSTALL_SCRIPT_ARGS=()
187-
if [[ ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} =~ .*/generic$ ]]; then
188-
INSTALL_SCRIPT_ARGS+=("--generic")
189-
fi
190-
[[ ! -z ${BUILD_LOGS_DIR} ]] && INSTALL_SCRIPT_ARGS+=("--build-logs-dir" "${BUILD_LOGS_DIR}")
191-
[[ ! -z ${SHARED_FS_PATH} ]] && INSTALL_SCRIPT_ARGS+=("--shared-fs-path" "${SHARED_FS_PATH}")
192-
193229
# create tmp file for output of build step
194230
build_outerr=$(mktemp build.outerr.XXXX)
195231

@@ -211,8 +247,7 @@ declare -a TARBALL_STEP_ARGS=()
211247
TARBALL_STEP_ARGS+=("--save" "${TARBALL_TMP_TARBALL_STEP_DIR}")
212248

213249
# determine temporary directory to resume from
214-
BUILD_TMPDIR=$(grep ' as tmp directory ' ${build_outerr} | cut -d ' ' -f 2)
215-
TARBALL_STEP_ARGS+=("--resume" "${BUILD_TMPDIR}")
250+
if [[ -z ${REMOVAL_TMPDIR} ]]; then
    # no removal step was run, so REMOVAL_TMPDIR is not set:
    # resume from the tmp directory reported in the output of the build step
    BUILD_TMPDIR=$(grep ' as tmp directory ' "${build_outerr}" | cut -d ' ' -f 2)
    TARBALL_STEP_ARGS+=("--resume" "${BUILD_TMPDIR}")
else
    # the build step was told to resume from the tmp directory of the removal step,
    # so the tarball step resumes from that same directory
    TARBALL_STEP_ARGS+=("--resume" "${REMOVAL_TMPDIR}")
fi
216251

217252
timestamp=$(date +%s)
218253
# to set EESSI_VERSION we need to source init/eessi_defaults now

create_lmodsitepackage.py

-21
Original file line numberDiff line numberDiff line change
@@ -84,31 +84,10 @@
8484
end
8585
end
8686
87-
local function eessi_openmpi_load_hook(t)
88-
-- disable smcuda BTL when loading OpenMPI module for aarch64/neoverse_v1,
89-
-- to work around hang/crash due to bug in OpenMPI;
90-
-- see https://gitlab.com/eessi/support/-/issues/41
91-
local frameStk = require("FrameStk"):singleton()
92-
local mt = frameStk:mt()
93-
local moduleName = string.match(t.modFullName, "(.-)/")
94-
local cpuTarget = os.getenv("EESSI_SOFTWARE_SUBDIR") or ""
95-
if (moduleName == "OpenMPI") and (cpuTarget == "aarch64/neoverse_v1") then
96-
local msg = "Adding '^smcuda' to $OMPI_MCA_btl to work around bug in OpenMPI"
97-
LmodMessage(msg .. " (see https://gitlab.com/eessi/support/-/issues/41)")
98-
local ompiMcaBtl = os.getenv("OMPI_MCA_btl")
99-
if ompiMcaBtl == nil then
100-
setenv("OMPI_MCA_btl", "^smcuda")
101-
else
102-
setenv("OMPI_MCA_btl", ompiMcaBtl .. ",^smcuda")
103-
end
104-
end
105-
end
106-
10787
-- Combine both functions into a single one, as we can only register one function as load hook in lmod
10888
-- Also: make it non-local, so it can be imported and extended by other lmodrc files if needed
10989
function eessi_load_hook(t)
11090
eessi_cuda_enabled_load_hook(t)
111-
eessi_openmpi_load_hook(t)
11291
end
11392
11493
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# 2024-03-01
2+
# Rebuild all OpenMPI 4.1.x versions due to an issue with smcuda:
3+
# https://github.com/open-mpi/ompi/issues/12270
4+
# https://github.com/open-mpi/ompi/pull/12344
5+
# https://github.com/easybuilders/easybuild-easyconfigs/pull/19940
6+
# each entry names an easyconfig file (with .eb suffix, used consistently for all entries)
# and is taken from easyconfigs PR 19940 via the 'from-pr' option
easyconfigs:
  - OpenMPI-4.1.4-GCC-12.2.0.eb:
      options:
        from-pr: 19940
  - OpenMPI-4.1.5-GCC-12.3.0.eb:
      options:
        from-pr: 19940
  - OpenMPI-4.1.6-GCC-13.2.0.eb:
      options:
        from-pr: 19940

eessi_container.sh

+11
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ display_help() {
7373
echo " -a | --access {ro,rw} - ro (read-only), rw (read & write) [default: ro]"
7474
echo " -c | --container IMG - image file or URL defining the container to use"
7575
echo " [default: docker://ghcr.io/eessi/build-node:debian11]"
76+
echo " -f | --fakeroot - run the container with --fakeroot [default: false]"
7677
echo " -g | --storage DIR - directory space on host machine (used for"
7778
echo " temporary data) [default: 1. TMPDIR, 2. /tmp]"
7879
echo " -h | --help - display this usage information [default: false]"
@@ -113,6 +114,7 @@ display_help() {
113114
ACCESS="ro"
114115
CONTAINER="docker://ghcr.io/eessi/build-node:debian11"
115116
#DRY_RUN=0
117+
FAKEROOT=0
116118
VERBOSE=0
117119
STORAGE=
118120
LIST_REPOS=0
@@ -140,6 +142,10 @@ while [[ $# -gt 0 ]]; do
140142
# DRY_RUN=1
141143
# shift 1
142144
# ;;
145+
-f|--fakeroot)
146+
FAKEROOT=1
147+
shift 1
148+
;;
143149
-g|--storage)
144150
STORAGE="$2"
145151
shift 2
@@ -466,6 +472,11 @@ if [[ ${SETUP_NVIDIA} -eq 1 ]]; then
466472
fi
467473
fi
468474

475+
# Configure the fakeroot setting for the container
476+
if [[ ${FAKEROOT} -eq 1 ]]; then
477+
ADDITIONAL_CONTAINER_OPTIONS+=("--fakeroot")
478+
fi
479+
469480
# set up repository config (always create directory repos_cfg and populate it with info when
470481
# arg -r|--repository is used)
471482
mkdir -p ${EESSI_TMPDIR}/repos_cfg

0 commit comments

Comments
 (0)