Skip to content

Commit dbdf9ae

Browse files
authored
Merge pull request #293 from TopRichard/EESSI-pr-473
add Lmod hook that adds ^smcuda to $OMPI_MCA_btl when loading the OpenMPI module (to work around an OpenMPI bug) + rename EESSI to NESSI
2 parents bc265dc + 296f00c commit dbdf9ae

File tree

1 file changed

+29
-8
lines changed

1 file changed

+29
-8
lines changed

create_lmodrc.py

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
local simpleName = string.match(t.modFullName, "(.-)/")
3636
-- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections.
3737
-- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse
38-
-- to load the CUDA module and print an informative message on how to set up GPU support for EESSI
38+
-- to load the CUDA module and print an informative message on how to set up GPU support for NESSI
3939
local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
4040
if simpleName == 'CUDA' then
4141
-- get the full host_injections path
@@ -44,26 +44,26 @@
4444
local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
4545
local cudaDirExists = isDir(cudaEasyBuildDir)
4646
if not cudaDirExists then
47-
local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI "
48-
advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where EESSI "
47+
local advice = "but while the module file exists, the actual software is not entirely shipped with NESSI "
48+
advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where NESSI "
4949
advice = advice .. "can find it.\\n"
5050
advice = advice .. refer_to_docs
5151
LmodError("\\nYou requested to load ", simpleName, " ", advice)
5252
end
5353
end
54-
-- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker,
54+
-- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the NESSI linker,
5555
-- otherwise, refuse to load the requested module and print error message
5656
local haveGpu = mt:haveProperty(simpleName,"arch","gpu")
5757
if haveGpu then
5858
local arch = os.getenv("EESSI_CPU_FAMILY") or ""
59-
local cudaVersionFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
60-
local cudaDriverFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
59+
local cudaVersionFile = "/cvmfs/pilot.nessi.no/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
60+
local cudaDriverFile = "/cvmfs/pilot.nessi.no/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
6161
local cudaDriverExists = isFile(cudaDriverFile)
6262
local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
6363
if not (cudaDriverExists or singularityCudaExists) then
6464
local advice = "which relies on the CUDA runtime environment and driver libraries. "
6565
advice = advice .. "In order to be able to use the module, you will need "
66-
advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system.\\n"
66+
advice = advice .. "to make sure NESSI can find the GPU driver libraries on your host system.\\n"
6767
advice = advice .. refer_to_docs
6868
LmodError("\\nYou requested to load ", simpleName, " ", advice)
6969
else
@@ -85,7 +85,7 @@
8585
if driver_libs_need_update == true then
8686
local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
8787
advice = advice .. "Please update your CUDA driver libraries and then "
88-
advice = advice .. "let EESSI know about the update.\\n"
88+
advice = advice .. "let NESSI know about the update.\\n"
8989
advice = advice .. refer_to_docs
9090
LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
9191
end
@@ -94,7 +94,28 @@
9494
end
9595
end
9696
97+
local function openmpi_load_hook(t)
    -- Lmod "load" hook: disable the smcuda BTL when an OpenMPI module is loaded
    -- on aarch64/neoverse_v1, to work around a hang/crash due to a bug in OpenMPI;
    -- see https://gitlab.com/eessi/support/-/issues/41
    --
    -- t: hook argument table; only t.modFullName ("<name>/<version>") is read.
    --
    -- $OMPI_MCA_btl is either an inclusive list ("self,tcp") or an exclusive
    -- list ("^tcp,smcuda"): a single leading '^' negates the whole list, and a
    -- '^' anywhere else is rejected by Open MPI. The value is therefore
    -- rewritten as a list instead of blindly appending ",^smcuda".
    --
    -- NOTE(review): this Lua text lives inside a Python string literal, so the
    -- body deliberately avoids backslashes and '%' pattern classes.
    local moduleName = string.match(t.modFullName, "(.-)/")
    local cpuTarget = os.getenv("EESSI_SOFTWARE_SUBDIR") or ""
    if moduleName ~= "OpenMPI" or cpuTarget ~= "aarch64/neoverse_v1" then
        return
    end

    local issue = " (see https://gitlab.com/eessi/support/-/issues/41)"
    local msg = "Adding '^smcuda' to $OMPI_MCA_btl to work around bug in OpenMPI"
    local current = os.getenv("OMPI_MCA_btl") or ""
    -- default for an unset/empty variable, or when nothing but smcuda was listed
    local updated = "^smcuda"

    if current ~= "" then
        local exclusive = string.sub(current, 1, 1) == "^"
        local names = exclusive and string.sub(current, 2) or current

        -- split the list on commas/spaces, separating smcuda from the rest
        local kept, listed = {}, false
        for name in string.gmatch(names, "[^, ]+") do
            if name == "smcuda" then
                listed = true
            else
                kept[#kept + 1] = name
            end
        end

        if exclusive then
            if listed then
                return -- smcuda is already excluded, nothing to do
            end
            if #kept > 0 then
                updated = "^" .. table.concat(kept, ",") .. ",smcuda"
            end
        else
            if not listed then
                return -- inclusive list without smcuda: it is never selected
            end
            if #kept > 0 then
                updated = table.concat(kept, ",")
                msg = "Removing 'smcuda' from $OMPI_MCA_btl to work around bug in OpenMPI"
            end
        end
    end

    LmodMessage(msg .. issue)
    setenv("OMPI_MCA_btl", updated)
end
116+
97117
-- Lmod keeps one function per hook name: without an explicit "append"/"prepend"
-- action, a second hook.register("load", ...) call replaces the first one, which
-- would silently drop the CUDA checks. Register a single hook that runs both.
local function combined_load_hook(t)
    cuda_enabled_load_hook(t)
    openmpi_load_hook(t)
end

hook.register("load", combined_load_hook)
98119
"""
99120

100121
def error(msg):

0 commit comments

Comments
 (0)