From 4eb103c4c373ff54edace96dfe6cb127deea2868 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 18 Nov 2024 15:25:36 -0700 Subject: [PATCH] Provide a warning about Slurm-related envar Instead of silently ignoring the Slurm envar controlling internal PRRTE cmd line arguments used by `srun` to start the PRRTE daemons, let's output a warning message and error out if the envar is found. Signed-off-by: Ralph Castain --- .../show-help-files/help-prte-runtime.txt | 40 +++++++++++++++++ src/runtime/prte_init.c | 43 +++++++++++-------- 2 files changed, 65 insertions(+), 18 deletions(-) diff --git a/src/docs/show-help-files/help-prte-runtime.txt b/src/docs/show-help-files/help-prte-runtime.txt index 24b22bde2e..912ae7607a 100644 --- a/src/docs/show-help-files/help-prte-runtime.txt +++ b/src/docs/show-help-files/help-prte-runtime.txt @@ -144,3 +144,43 @@ other nodes using the "prte_remote_tmpdir_base" param. This is only a warning advisory and your job will continue. You can disable this warning in the future by setting the "prte_silence_shared_fs" MCA param to "1". + +[prte:slurm:envar] +PRTE detected the presence of an MCA parameter in the environment that +assigns custom command line arguments to the `srun` command used to +start PRTE's daemons on remote nodes: + + Parameter value: %s + +This warning is provided to alert you (the user) that this parameter +value will be ignored. + +Background: Starting with Slurm version 23.11, a command line argument +(`--external-launcher`) was added to `srun` to indicate that the +command was being initiated from within a third-party launcher (e.g., +`prte` or `prterun`). This allows Slurm to essentially freely modify +the `srun` command line while retaining a backward compatibility +capability when explicitly told to use it. Notably, Slurm +did this by automatically setting the PRTE_MCA_plm_slurm_args environment +variable to pass in its own command line arguments. 
+ +Unfortunately, this had the side effect of overriding most user- or +system-level settings. In addition, arguments passed on the +PRTE command line overrode any Slurm setting of the environment +variable, but with potentially undesirable side effects if newer +versions of `srun` misinterpret or fail to understand the user-specified +arguments. + +PRRTE now directly determines the `srun` version and sets its `srun` cmd +line arguments accordingly. These arguments are set in addition to any +provided by the user, and therefore the user no longer needs to concern +themselves with changes induced by Slurm. + +If the setting of the MCA parameter was intentional, or if the +parameter value looks acceptable to you, then please set the MCA parameter +either in the default MCA param file (either the system or user-level file), +or on the cmd line: + + Cmd line: --prtemca plm_slurm_args %s + Default MCA param file: plm_slurm_args = %s + diff --git a/src/runtime/prte_init.c b/src/runtime/prte_init.c index e76e3e49cc..638a1b4002 100644 --- a/src/runtime/prte_init.c +++ b/src/runtime/prte_init.c @@ -172,24 +172,6 @@ int prte_init_minimum(void) return PRTE_ERR_SILENT; } - /* Protect against the envar version of the Slurm - * custom args MCA param. This is an unfortunate - * hack that hopefully will eventually go away. 
- * See both of the following for detailed - * explanations and discussion: - * - * https://github.com/openpmix/prrte/issues/1974 - * https://github.com/open-mpi/ompi/issues/12471 - * - * Orgs/users wanting to add custom args to the - * internal "srun" command used to spawn the - * PRRTE daemons must do so via the default MCA - * param files (system or user), or via the - * prterun (or its proxy) cmd line - */ - unsetenv("PRTE_MCA_plm_slurm_args"); - unsetenv("OMPI_MCA_plm_slurm_args"); - /* carry across the toolname */ pmix_tool_basename = prte_tool_basename; @@ -239,6 +221,31 @@ int prte_init_minimum(void) return ret; } + /* Protect against the envar version of the Slurm + * custom args MCA param. This is an unfortunate + * hack that hopefully will eventually go away. + * See both of the following for detailed + * explanations and discussion: + * + * https://github.com/openpmix/prrte/issues/1974 + * https://github.com/open-mpi/ompi/issues/12471 + * + * Orgs/users wanting to add custom args to the + * internal "srun" command used to spawn the + * PRRTE daemons must do so via the default MCA + * param files (system or user), or via the + * prterun (or its proxy) cmd line + */ + if (NULL != (evar = getenv("PRTE_MCA_plm_slurm_args"))) { + pmix_show_help("help-prte-runtime.txt", "prte:slurm:envar", true, + evar, evar, evar); + return PRTE_ERR_SILENT; + } else if (NULL != (evar = getenv("OMPI_MCA_plm_slurm_args"))) { + pmix_show_help("help-prte-runtime.txt", "prte:slurm:envar", true, + evar, evar, evar); + return PRTE_ERR_SILENT; + } + /* pre-load any default mca param files */ prte_preload_default_mca_params();