|
35 | 35 | local simpleName = string.match(t.modFullName, "(.-)/")
|
36 | 36 | -- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections.
|
37 | 37 | -- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse
|
38 |
| - -- to load the CUDA module and print an informative message on how to set up GPU support for EESSI |
| 38 | + -- to load the CUDA module and print an informative message on how to set up GPU support for NESSI |
39 | 39 | local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
|
40 | 40 | if simpleName == 'CUDA' then
|
41 | 41 | -- get the full host_injections path
|
|
44 | 44 | local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
|
45 | 45 | local cudaDirExists = isDir(cudaEasyBuildDir)
|
46 | 46 | if not cudaDirExists then
|
47 |
| - local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI " |
48 |
| - advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where EESSI " |
| 47 | + local advice = "but while the module file exists, the actual software is not entirely shipped with NESSI " |
| 48 | + advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where NESSI " |
49 | 49 | advice = advice .. "can find it.\\n"
|
50 | 50 | advice = advice .. refer_to_docs
|
51 | 51 | LmodError("\\nYou requested to load ", simpleName, " ", advice)
|
52 | 52 | end
|
53 | 53 | end
|
54 |
| - -- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker, |
| 54 | + -- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the NESSI linker, |
55 | 55 | -- otherwise, refuse to load the requested module and print error message
|
56 | 56 | local haveGpu = mt:haveProperty(simpleName,"arch","gpu")
|
57 | 57 | if haveGpu then
|
58 | 58 | local arch = os.getenv("EESSI_CPU_FAMILY") or ""
|
59 |
| - local cudaVersionFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt" |
60 |
| - local cudaDriverFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/libcuda.so" |
| 59 | + local cudaVersionFile = "/cvmfs/pilot.nessi.no/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt" |
| 60 | + local cudaDriverFile = "/cvmfs/pilot.nessi.no/host_injections/nvidia/" .. arch .. "/latest/libcuda.so" |
61 | 61 | local cudaDriverExists = isFile(cudaDriverFile)
|
62 | 62 | local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
|
63 | 63 | if not (cudaDriverExists or singularityCudaExists) then
|
64 | 64 | local advice = "which relies on the CUDA runtime environment and driver libraries. "
|
65 | 65 | advice = advice .. "In order to be able to use the module, you will need "
|
66 |
| - advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system.\\n" |
| 66 | + advice = advice .. "to make sure NESSI can find the GPU driver libraries on your host system.\\n" |
67 | 67 | advice = advice .. refer_to_docs
|
68 | 68 | LmodError("\\nYou requested to load ", simpleName, " ", advice)
|
69 | 69 | else
|
|
85 | 85 | if driver_libs_need_update == true then
|
86 | 86 | local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
|
87 | 87 | advice = advice .. "Please update your CUDA driver libraries and then "
|
88 |
| - advice = advice .. "let EESSI know about the update.\\n" |
| 88 | + advice = advice .. "let NESSI know about the update.\\n" |
89 | 89 | advice = advice .. refer_to_docs
|
90 | 90 | LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
|
91 | 91 | end
|
|
94 | 94 | end
|
95 | 95 | end
|
96 | 96 |
|
-- Build a value for $OMPI_MCA_btl, derived from `current`, in which the
-- smcuda BTL is disabled.
--
-- Open MPI only honours '^' as the very first character of the value, where it
-- negates the *whole* comma-separated list; a '^' anywhere else (for example
-- "tcp,^smcuda" or "^tcp,^smcuda") is not valid. So the result is always either
-- a pure exclusion list ("^a,b,smcuda") or a pure inclusion list without smcuda.
--
-- @param current  existing value of $OMPI_MCA_btl (string), or nil when unset
-- @return string  new value to export
local function ompi_btl_without_smcuda(current)
    if current == nil or current == "" then
        return "^smcuda"
    end

    local negated = string.sub(current, 1, 1) == "^"
    local list = current
    if negated then
        list = string.sub(current, 2)
    end

    -- collect every listed component except smcuda (also drops empty entries)
    local kept = {}
    for name in string.gmatch(list, "[^,]+") do
        if name ~= "smcuda" then
            kept[#kept + 1] = name
        end
    end

    if negated then
        -- exclusion list: smcuda goes in exactly once, so repeated module
        -- loads do not keep growing the value
        kept[#kept + 1] = "smcuda"
        return "^" .. table.concat(kept, ",")
    end

    if #kept == 0 then
        -- the inclusion list named only smcuda; an empty value would fall
        -- back to Open MPI's default selection, so exclude smcuda explicitly
        return "^smcuda"
    end
    -- inclusion list: a BTL that is not listed is not used
    return table.concat(kept, ",")
end

-- Lmod "load" hook: disable the smcuda BTL when an OpenMPI module is loaded
-- for aarch64/neoverse_v1, to work around hang/crash due to bug in OpenMPI;
-- see https://gitlab.com/eessi/support/-/issues/41
--
-- @param t  hook argument table; only t.modFullName ("<name>/<version>") is read
local function openmpi_load_hook(t)
    local moduleName = string.match(t.modFullName, "(.-)/")
    local cpuTarget = os.getenv("EESSI_SOFTWARE_SUBDIR") or ""
    if (moduleName == "OpenMPI") and (cpuTarget == "aarch64/neoverse_v1") then
        local msg = "Adding '^smcuda' to $OMPI_MCA_btl to work around bug in OpenMPI"
        LmodMessage(msg .. " (see https://gitlab.com/eessi/support/-/issues/41)")
        -- merge with whatever the user already configured instead of blindly
        -- appending ",^smcuda", which would produce an invalid mixed list
        setenv("OMPI_MCA_btl", ompi_btl_without_smcuda(os.getenv("OMPI_MCA_btl")))
    end
end
| 116 | +
|
97 | 117 | hook.register("load", cuda_enabled_load_hook)
|
| 118 | +hook.register("load", openmpi_load_hook) |
98 | 119 | """
|
99 | 120 |
|
100 | 121 | def error(msg):
|
|
0 commit comments