diff --git a/src/esm_environment/esm_environment.py b/src/esm_environment/esm_environment.py index adf8b96a1..b48e2101c 100644 --- a/src/esm_environment/esm_environment.py +++ b/src/esm_environment/esm_environment.py @@ -548,8 +548,11 @@ def write_dummy_script(self, include_set_e=True): print('WARNING: "sh_interpreter" not defined in the machine yaml') with open("dummy_script.sh", "w") as script_file: # Write the file headings + #script_file.write( + # f'#!{self.config.get("sh_interpreter", "/bin/bash")} -l\n' + #) script_file.write( - f'#!{self.config.get("sh_interpreter", "/bin/bash")} -l\n' + f'#!{self.config.get("sh_interpreter", "/bin/bash")}\n' ) script_file.write( "# Dummy script generated by esm-tools, to be removed later: \n" diff --git a/src/esm_runscripts/oasis.py b/src/esm_runscripts/oasis.py index fb78609c7..a4e89b17a 100644 --- a/src/esm_runscripts/oasis.py +++ b/src/esm_runscripts/oasis.py @@ -45,8 +45,11 @@ def __init__( self.namcouple += [" $NBMODEL", " " + str(exec_entry), " $END"] self.namcouple += [" $RUNTIME", " " + str(runtime), " $END"] if lucia: - if mct_version >= (5, 0): - self.namcouple += [" $NLOGPRT", " " + str(debug_level) + " 0 1", " $END"] + # LUCIA (load balancing) is done differently in MCT 5.0 + if mct_version >= (5,0): + # In MCT5 you set X Y Z, where X refers to verbosity, Y to timing info and Z to load balancing + # Here: Set X = debug_level, Y = 0 (no info), Z = 1 (activate load balancing) + self.namcouple += [" $NLOGPRT", " " + str(debug_level) + " 0 1 ", " $END"] else: self.namcouple += [" $NLOGPRT", " " + "1 -1", " $END"] else: @@ -540,12 +543,9 @@ def add_restart_files(self, restart_file_label, fconfig): config["restart_in_in_work"][restart_file_label] = restart_file # In case of a branch-off experiment -> use the correct oasis restart files: - # Not the soft link to the last, but the actual one for the branch-off date - if ( - gconfig["run_number"] == 1 - and config["lresume"] - and gconfig["jobtype"] == "prepcompute" - 
): + # Not the rstas.nc soft link to the last, but the actual one for the + # branch-off date + if gconfig["run_number"] == 1 and config["lresume"] and gconfig["jobtype"] == "prepcompute" and config.get("norestart", "F") == "F": # If they do not exist, define ``ini_restart_date`` and ``ini_restart_dir`` # based on ``ini_parent_date`` and ``ini_parent_dir`` if "ini_parent_date" in config and "ini_restart_date" not in config: @@ -561,7 +561,7 @@ def add_restart_files(self, restart_file_label, fconfig): # check if restart file with ini_restart_date in filename is in the restart # folder of the parent experiment to be branched off from: glob_search_file = ( - f"{restart_file_path}*" + f"{config['ini_restart_dir']}{restart_file}*" f"{config['ini_restart_date'].year}" f"{config['ini_restart_date'].month:02}" f"{config['ini_restart_date'].day:02}" @@ -577,7 +577,13 @@ def add_restart_files(self, restart_file_label, fconfig): restart_file = os.path.basename(glob_restart_file[0]) elif len(glob_restart_file) == 0: restart_file = restart_file_path + # in case config["restart_in_sources"] are given explicitly + # AND are not absolute paths as e.g. in FOCI + # ini_parent_dir: "${general.ini_parent_dir}/oasis3mct/" + # restart_in_sources: sstocean_${parent_expid}_... + # we need to check for the full path as well + # btw it was a nightmare to track this down + if not os.path.isfile(restart_file) and not os.path.isfile(f"{config['ini_restart_dir']}/{restart_file}"): user_error( "Restart file missing", f"No OASIS restart file for ``{restart_file_label}`` found " diff --git a/src/esm_runscripts/slurm.py b/src/esm_runscripts/slurm.py index 48a744e60..5833be43e 100644 --- a/src/esm_runscripts/slurm.py +++ b/src/esm_runscripts/slurm.py @@ -63,11 +63,21 @@ def get_jobid(): return os.environ.get("SLURM_JOB_ID") def prepare_launcher(self, config, cluster): + # which launcher are we using? 
+ launcher = config["computer"].get("launcher",None) + # friendly check that you are using a launcher that we support + if launcher not in ["srun", "mpirun"]: + print(" The launcher %s is not compatible with ESM-Tools in SLURM " % (launcher,)) + print(" Supported launchers for SLURM are srun and mpirun ") + # MA: not sure how this will play with heterogeneous parallelization if "multi_srun" in config["general"]: for run_type in list(config["general"]["multi_srun"]): current_hostfile = self.path + "_" + run_type - write_one_hostfile(current_hostfile, config) + if launcher == "srun": + write_one_hostfile_srun(current_hostfile, config) + elif launcher == "mpirun": + write_one_hostfile_mpirun(current_hostfile, config) if config["computer"].get( "heterogeneous_parallelization", False @@ -76,7 +86,11 @@ def prepare_launcher(self, config, cluster): config["general"]["batch"].het_par_launcher_lines(config, cluster) else: # Standard/old way of running jobs with slurm - self.write_one_hostfile(self.path, config) + if launcher == "srun": + self.write_one_hostfile_srun(self.path, config) + elif launcher == "mpirun": + # JK: Need to think about how to handle heterogeneous parallelisation here... + self.write_one_hostfile_mpirun(self.path, config) hostfile_in_work = ( config["general"]["work_dir"] + "/" + os.path.basename(self.path) @@ -85,10 +99,11 @@ def prepare_launcher(self, config, cluster): return config - def write_one_hostfile(self, hostfile, config): + def write_one_hostfile_srun(self, hostfile, config): """ Gathers previously prepared requirements (batch_system.calculate_requirements) and writes them to ``self.path``. 
+ Suitable for srun """ with open(hostfile, "w") as hostfile: @@ -112,7 +127,50 @@ def write_one_hostfile(self, hostfile, config): hostfile.write( str(start_proc) + "-" + str(end_proc) + " " + command + "\n" ) + + def write_one_hostfile_mpirun(self, hostfile, config): + """ + Gathers previously prepared requirements + (batch_system.calculate_requirements) and writes them to ``self.path``. + Suitable for mpirun launcher + """ + + # make an empty string which we will append commands to + mpirun_options = "" + for model in config["general"]["valid_model_names"]: + end_proc = config[model].get("end_proc", None) + start_proc = config[model].get("start_proc", None) + + # a model component like oasis3mct does not need cores + # since it's technically a library + # So start_proc and end_proc will be None. Skip it + if start_proc == None or end_proc == None: + continue + + # number of cores needed + no_cpus = end_proc - start_proc + 1 + + # check if execution_command or executable exist + if "execution_command" in config[model]: + command = "./" + config[model]["execution_command"] + elif "executable" in config[model]: + command = "./" + config[model]["executable"] + else: + print('warning: the executable or execution_command could not be determined for %s' % (model,)) + continue + + # the mpirun command is set here. + mpirun_options += ( + " -np %d %s :" % (no_cpus, command) + ) + + mpirun_options = mpirun_options[:-1] # remove trailing ":" + + with open(hostfile, "w") as hostfile: + hostfile.write(mpirun_options) + + @staticmethod def get_job_state(jobid): """