diff --git a/create_lmodrc.py b/create_lmodrc.py index 0e738a530e..ca702346b4 100755 --- a/create_lmodrc.py +++ b/create_lmodrc.py @@ -94,7 +94,28 @@ end end +local function openmpi_load_hook(t) + -- disable smcuda BTL when loading OpenMPI module for aarch64/neoverse_v1, + -- to work around hang/crash due to bug in OpenMPI; + -- see https://gitlab.com/eessi/support/-/issues/41 + local frameStk = require("FrameStk"):singleton() + local mt = frameStk:mt() + local moduleName = string.match(t.modFullName, "(.-)/") + local cpuTarget = os.getenv("EESSI_SOFTWARE_SUBDIR") or "" + if (moduleName == "OpenMPI") and (cpuTarget == "x86_64/intel/skylake_avx512") then --(cpuTarget == "aarch64/neoverse_v1") then + local msg = "Adding '^smcuda' to $OMPI_MCA_btl to work around bug in OpenMPI" + LmodMessage(msg .. " (see https://gitlab.com/eessi/support/-/issues/41)") + local ompiMcaBtl = os.getenv("OMPI_MCA_btl") + if ompiMcaBtl == nil then + setenv("OMPI_MCA_btl", "^smcuda") + else + setenv("OMPI_MCA_btl", ompiMcaBtl .. ",^smcuda") + end + end +end + hook.register("load", cuda_enabled_load_hook) +hook.register("load", openmpi_load_hook) """ def error(msg):