diff --git a/configs/components/oifs/oifs43.env.yaml b/configs/components/oifs/oifs43.env.yaml index 2f1527eef..4628e13f2 100644 --- a/configs/components/oifs/oifs43.env.yaml +++ b/configs/components/oifs/oifs43.env.yaml @@ -139,8 +139,8 @@ computer: OIFS_FFTW_INCLUDE: "'-I$OIFS_FFTW_DIR/include/'" OIFS_FFTW_LIB: "'-L$OIFS_FFTW_DIR/lib/ -lmkl_intel_lp64 -lmkl_core -qmkl=sequential'" # TODO: figure out whether those two are still needed - ESM_NETCDF_C_DIR: "$NETCDFROOT" - ESM_NETCDF_F_DIR: "$NETCDFFROOT" + ESM_NETCDF_C_DIR: '"$NETCDFROOT"' + ESM_NETCDF_F_DIR: '"$NETCDFFROOT"' # grib api / eccodes OIFS_GRIB_API_INCLUDE: '"-I$ECCODESROOT/include"' OIFS_GRIB_API_LIB: '"-L$ECCODESROOT/lib -L$ECCODESROOT/lib64 -leccodes_f90 -leccodes"' @@ -149,20 +149,20 @@ computer: OIFS_GRIB_API_BIN: '"$ECCODESROOT/bin"' # oasis OIFS_OASIS_BASE: "$(pwd)/oasis" - OIFS_OASIS_INCLUDE: "-I$OIFS_OASIS_BASE/build/lib/psmile -I$OIFS_OASIS_BASE/build/lib/psmile/scrip -I$OIFS_OASIS_BASE/build/lib/psmile/mct -I$OIFS_OASIS_BASE/build/lib/psmile/mct/mpeu" - OIFS_OASIS_LIB: "-L$OIFS_OASIS_BASE/build/lib/psmile -L$OIFS_OASIS_BASE/build/lib/psmile/scrip -L$OIFS_OASIS_BASE/build/lib/psmile/mct -L$OIFS_OASIS_BASE/build/lib/psmile/mct/mpeu -lpsmile -lmct -lmpeu -lscrip" + OIFS_OASIS_INCLUDE: '"-I$OIFS_OASIS_BASE/build/lib/psmile -I$OIFS_OASIS_BASE/build/lib/psmile/scrip -I$OIFS_OASIS_BASE/build/lib/psmile/mct -I$OIFS_OASIS_BASE/build/lib/psmile/mct/mpeu"' + OIFS_OASIS_LIB: '"-L$OIFS_OASIS_BASE/build/lib/psmile -L$OIFS_OASIS_BASE/build/lib/psmile/scrip -L$OIFS_OASIS_BASE/build/lib/psmile/mct -L$OIFS_OASIS_BASE/build/lib/psmile/mct/mpeu -lpsmile -lmct -lmpeu -lscrip"' # netcdf - OIFS_NETCDF_INCLUDE: "-I$NETCDFROOT/include" - OIFS_NETCDF_LIB: "-L$NETCDFROOT/lib -lnetcdf" - OIFS_NETCDFF_INCLUDE: "-I$NETCDFFROOT/include" - OIFS_NETCDFF_LIB: "-L$NETCDFFROOT/lib -lnetcdff" + OIFS_NETCDF_INCLUDE: '"-I$NETCDFROOT/include"' + OIFS_NETCDF_LIB: '"-L$NETCDFROOT/lib -lnetcdf"' + OIFS_NETCDFF_INCLUDE: '"-I$NETCDFFROOT/include"' + OIFS_NETCDFF_LIB: '"-L$NETCDFFROOT/lib -lnetcdff"' # compilers and compile switches - OIFS_FC: "$FC" + OIFS_FC: '"$FC"' OIFS_FFLAGS: '"-qopenmp -r8 -fp-model precise -align array32byte -O1 -xCORE_AVX2 -g -traceback -convert big_endian -fpe0"' - OIFS_FFIXED: "" + OIFS_FFIXED: '""' OIFS_FCDEFS: '"BLAS LITTLE LINUX INTEGER_IS_INT"' OIFS_LFLAGS: '"$OIFS_MPI_LIB -qopenmp"' - OIFS_CC: "$CC" + OIFS_CC: '"$CC"' # Intel icx (C compiler) no longer supports the -fpe0 flag. # Also, the C standard is raised, so we need to use -std=gnu89 to enforce C89 # since implicit declarations were removed in C99 and onward. diff --git a/configs/components/xios/xios.yaml b/configs/components/xios/xios.yaml index 234cc6858..92aa5bd11 100644 --- a/configs/components/xios/xios.yaml +++ b/configs/components/xios/xios.yaml @@ -99,6 +99,10 @@ xios: archfile: ESMTOOLS_generic_oasis_gcc "*": archfile: ESMTOOLS_generic_oasis_intel + blogin: + choose_computer.compiler_mpi: + intel2024_impi2021: + archfile: ESMTOOLS_levante_oasis_intel use_oasis: --use_oasis oasis3_mct branch: master comp_command: export XIOS_TOPLEVEL=${model_dir}; ./make_xios --arch ${archfile} --netcdf_lib netcdf4_par ${use_oasis} --job 24 --prod; cp bin/xios_server.exe bin/xios.x diff --git a/configs/other_software/batch_system/slurm.yaml b/configs/other_software/batch_system/slurm.yaml index 441c69b22..2915920da 100644 --- a/configs/other_software/batch_system/slurm.yaml +++ b/configs/other_software/batch_system/slurm.yaml @@ -47,20 +47,30 @@ computer: output_flags: "--output=${thisrun_logfile} --error=${thisrun_logfile}" name_flag: "--job-name=${expid}" - taskset: false + hetjob_strategy: hetjob mt_launcher_flag: "" add_choose_heterogeneous_parallelization: true: cpu_bind: "none" - choose_taskset: + choose_hetjob_strategy: # Support for old taskset heterogeneous parallelization approach - true: + taskset: srun_execution_command: "${debugger_flags_prelauncher} ${launcher} ${launcher_flags} ${mt_launcher_flag}--multi-prog ${config_files.hostfile}" - # Support for new packjob heterogeneous parallelization approach - false: - - # kh 24.06.22 --hint=nomultithread is needed (in many cases) on levante to avoid hyperthreading + # Support for hetjob parallelization approach (heterogeneity handled with heterogeneous slurm job + srun steps, allows for heterogeneous resources) + hetjob: + launcher_flags_per_component: " + ${mt_launcher_flag} + --nodes=@nnodes@ + --ntasks=@nproc@ + --ntasks-per-node=@nproc_per_node@ + --cpus-per-task=@cpus_per_proc@ + --export=ALL,OMP_NUM_THREADS=@omp_num_threads@" + srun_execution_command: "${debugger_flags_prelauncher} ${launcher} ${launcher_flags} \\\n@components@" + launcher_comp_sep: " \\\n:" + # Support for srun steps approach (heterogeneity handled in the srun command) + srunsteps: + # kh 24.06.22 --hint=nomultithread is needed (in many cases) on levante to avoid hyperthreading launcher_flags_per_component: " ${mt_launcher_flag} --nodes=@nnodes@ diff --git a/configs/setups/awicm3/awicm3.yaml b/configs/setups/awicm3/awicm3.yaml index 7b2c0e075..e74a460c9 100644 --- a/configs/setups/awicm3/awicm3.yaml +++ b/configs/setups/awicm3/awicm3.yaml @@ -996,8 +996,9 @@ computer: iolibraries: system_libs compiler_mpi: intel2025_psmpi blogin: + taskset: false iolibraries: geomar_libs - compiler_mpi: intel2019_ompi + compiler_mpi: intel2024_impi2021 glogin: iolibraries: geomar_libs compiler_mpi: intel2019_ompi diff --git a/src/esm_runscripts/batch_system.py b/src/esm_runscripts/batch_system.py index 8c0321b8a..a4e52a13a 100644 --- a/src/esm_runscripts/batch_system.py +++ b/src/esm_runscripts/batch_system.py @@ -50,7 +50,6 @@ def job_is_still_running(self, jobid): def add_pre_launcher_lines(self, config, cluster, runfile): return self.bs.add_pre_launcher_lines(config, cluster, runfile) - # TODO: remove it once it's not needed anymore (substituted by packjob) def write_het_par_wrappers(self, config): return self.bs.write_het_par_wrappers(config) @@ -375,26 +374,35 @@ def get_run_commands(config, subjob, batch_or_shell): # here or in compute.py? commands = [] if subjob.startswith("compute"): - if config["general"].get("submit_to_batch_system", True): - batch_system = config["computer"] - if "execution_command" in batch_system: - commands.append( - "time " - + batch_system["execution_command"] - + f" 2>&1{config['computer'].get('write_execution_log', '')} &" - ) - if config["general"].get("multi_srun"): - return self.bs.get_run_commands_multisrun(config, commands) + submit_to_batch_system = config["general"].get( + "submit_to_batch_system", True + ) + write_execution_log = config["computer"].get( + "write_execution_log", "" + ) + + if submit_to_batch_system: + execution_command_list = config["computer"].get( + "execution_command_list", + [config["computer"].get("execution_command")] + ) else: + execution_command_list = [] for model in config: if model == "computer": continue if "execution_command" in config[model]: - commands.append( - "time ./" - + config[model]["execution_command"] - + f" 2>&1{config['computer'].get('write_execution_log', '')} &" + execution_command_list.append( + config[model]["execution_command"] ) + + for execution_command in execution_command_list: + commands.append( + f"time {execution_command} 2>&1{write_execution_log} &" + ) + + if submit_to_batch_system and config["general"].get("multi_srun"): + return self.bs.get_run_commands_multisrun(config, commands) else: subjob_tasks = dataprocess.subjob_tasks(config, subjob, batch_or_shell) for task in subjob_tasks: @@ -460,10 +468,9 @@ def write_simple_runscript(config, cluster, batch_or_shell="batch"): if batch_or_shell == "batch": config = batch_system.calculate_requirements(config, cluster) - # TODO: remove it once it's not needed anymore (substituted by packjob) if cluster in reserved_jobtypes and config["computer"].get( - "taskset", False - ): + "hetjob_strategy", "hetjob" + ) == "taskset": config = config["general"]["batch"].write_het_par_wrappers(config) # Prepare launcher config = config["general"]["batch"].prepare_launcher(config, cluster) @@ -703,7 +710,7 @@ def find_openmp(config): config[model]["nproc"] = 1 return config - def het_par_launcher_lines(self, config, cluster): + def hetjob_single_launcher_command(self, config, cluster): """ Loops through the components to generate job launcher flags and execution commands, to be appended in substitution to the ``@components@`` tag, in @@ -747,6 +754,54 @@ def het_par_launcher_lines(self, config, cluster): .replace("@jobtype@", cluster) ) + def hetjob_concurrent_launcher_commands(self, config, cluster): + """ + Loops through the components to generate job launcher flags and execution + commands, to be appended in substitution to the ``@components@`` tag, in + ``computer.execution_command``, that would later be used in the writing of the + ``.run`` file, in ``batch_system.py``. + + Parameters + ---------- + config : dict + Configuration dictionary containing information about the experiment and + experiment directory. + cluster : str + Type of job cluster. + """ + component_lines = [] + # Read in the separator to be used in between component calls in the job + # launcher + sep = config["computer"].get("launcher_comp_sep", "\\\n ") + " " + # Loop through the components + for model in config["general"]["valid_model_names"]: + command = None + # Read in execution command + if "execution_command" in config[model]: + command = config[model]["execution_command"] + elif "executable" in config[model]: + command = config[model]["executable"] + # Prepare the MPMD commands + + # kh 24.06.22 workaround: filter hdmodel + if command and (command != "NONE"): + launcher = config["computer"].get("launcher") + launcher_flags = self.calc_launcher_flags(config, model, cluster) + component_lines.append(f"{launcher_flags} ./{command} ") + + execution_command = config["computer"]["execution_command"] + execution_command_list = [] + for component in component_lines: + # Replace the ``@components@`` tag with the component command + # and add the ``&`` at the end to run it in background + execution_command_list.append( + execution_command + .replace("@components@", component) + .replace("@jobtype@", cluster) + ) + + config["computer"]["execution_command_list"] = execution_command_list + @staticmethod def calc_launcher_flags(config, model, cluster): """ diff --git a/src/esm_runscripts/slurm.py b/src/esm_runscripts/slurm.py index 48a744e60..fa2e147f9 100644 --- a/src/esm_runscripts/slurm.py +++ b/src/esm_runscripts/slurm.py @@ -10,7 +10,7 @@ from loguru import logger import esm_parser -from esm_tools import user_note +from esm_tools import user_note, user_error class Slurm: @@ -69,12 +69,11 @@ def prepare_launcher(self, config, cluster): current_hostfile = self.path + "_" + run_type write_one_hostfile(current_hostfile, config) - if config["computer"].get( + heterogeneous_parallelization = config["computer"].get( "heterogeneous_parallelization", False - ) and not config["computer"].get("taskset", False): - # Prepare heterogeneous parallelization call - config["general"]["batch"].het_par_launcher_lines(config, cluster) - else: + ) + hetjob_strategy = config["computer"].get("hetjob_strategy", "hetjob") + if not heterogeneous_parallelization or hetjob_strategy == "taskset": # Standard/old way of running jobs with slurm self.write_one_hostfile(self.path, config) @@ -82,6 +81,16 @@ def prepare_launcher(self, config, cluster): config["general"]["work_dir"] + "/" + os.path.basename(self.path) ) shutil.copyfile(self.path, hostfile_in_work) + elif hetjob_strategy == "hetjob" or hetjob_strategy == "srunsteps": + # Prepare heterogeneous parallelization call for hetjob (one srun command + # per binary) + config["general"]["batch"].hetjob_single_launcher_command(config, cluster) + else: + user_error( + "hetjob strategy", + f"``{hetjob_strategy}`` is not a valid one. Choose one among " + f"``hetjob`` (default), ``srunsteps`` or ``taskset``.", + ) return config @@ -157,9 +166,8 @@ def add_pre_launcher_lines(self, config, cluster, runfile): (``runfile.write("")``). """ - # TODO: remove it once it's not needed anymore (substituted by packjob) if config["computer"].get("heterogeneous_parallelization", False): - if config["computer"].get("taskset", False): + if config["computer"].get("hetjob_strategy") == "taskset": self.add_hostlist_file_gen_lines(config, runfile) @staticmethod @@ -185,9 +193,10 @@ def het_par_headers(config, cluster, headers): for heterogeneous parallelization in SLURM. """ # Only modify the headers if ``heterogeneous_parallelization`` is ``True`` + hetjob_strategy = config["computer"].get("hetjob_strategy", "hetjob") if config["computer"].get( "heterogeneous_parallelization", False - ) and not config["computer"].get("taskset", False): + ) and hetjob_strategy not in ["taskset", "srunsteps"]: this_batch_system = config["computer"] # Get the variables to be modified for the headers nodes_flag = this_batch_system["nodes_flag"].split("=")[0] @@ -223,7 +232,6 @@ def het_par_headers(config, cluster, headers): return headers - # TODO: remove it once it's not needed anymore (substituted by packjob) @staticmethod def write_het_par_wrappers(config): cores_per_node = config["computer"]["partitions"]["compute"]["cores_per_node"] @@ -300,7 +308,6 @@ def write_het_par_wrappers(config): config[model]["execution_command_het_par"] = execution_command_het_par return config - # TODO: remove it once it's not needed anymore (substituted by packjob) @staticmethod def add_hostlist_file_gen_lines(config, runfile): cores_per_node = config["computer"]["partitions"]["compute"]["cores_per_node"]