Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/esm_environment/esm_environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,8 +548,11 @@ def write_dummy_script(self, include_set_e=True):
print('WARNING: "sh_interpreter" not defined in the machine yaml')
with open("dummy_script.sh", "w") as script_file:
# Write the file headings
#script_file.write(
# f'#!{self.config.get("sh_interpreter", "/bin/bash")} -l\n'
#)
script_file.write(
f'#!{self.config.get("sh_interpreter", "/bin/bash")} -l\n'
f'#!{self.config.get("sh_interpreter", "/bin/bash")}\n'
)
script_file.write(
"# Dummy script generated by esm-tools, to be removed later: \n"
Expand Down
26 changes: 16 additions & 10 deletions src/esm_runscripts/oasis.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,11 @@ def __init__(
self.namcouple += [" $NBMODEL", " " + str(exec_entry), " $END"]
self.namcouple += [" $RUNTIME", " " + str(runtime), " $END"]
if lucia:
if mct_version >= (5, 0):
self.namcouple += [" $NLOGPRT", " " + str(debug_level) + " 0 1", " $END"]
# LUCIA (load balancing) is done differently in MCT 5.0
if mct_version >= (5,0):
# In MCT5 you set X Y Z, where X refers to verbosity, Y to timing info and Z to load balancing
# Here: Set X = debug_level, Y = 0 (no info), Z = 1 (activate load balancing)
self.namcouple += [" $NLOGPRT", " " + str(debug_level) + " 0 1 ", " $END"]
else:
self.namcouple += [" $NLOGPRT", " " + "1 -1", " $END"]
else:
Expand Down Expand Up @@ -540,12 +543,9 @@ def add_restart_files(self, restart_file_label, fconfig):
config["restart_in_in_work"][restart_file_label] = restart_file

# In case of a branch-off experiment -> use the correct oasis restart files:
# Not the soft link to the last, but the actual one for the branch-off date
if (
gconfig["run_number"] == 1
and config["lresume"]
and gconfig["jobtype"] == "prepcompute"
):
# Not the rstas.nc soft link to the last, but the actual one for the
# branch-off date
if gconfig["run_number"] == 1 and config["lresume"] and gconfig["jobtype"] == "prepcompute" and config.get("norestart", "F") == "F":
# If they do not exist, define ``ini_restart_date`` and ``ini_restart_dir``
# based on ``ini_parent_date`` and ``ini_parent_dir``
if "ini_parent_date" in config and "ini_restart_date" not in config:
Expand All @@ -561,7 +561,7 @@ def add_restart_files(self, restart_file_label, fconfig):
# check if restart file with ini_restart_date in filename is in the restart
# folder of the parent experiment to be branched off from:
glob_search_file = (
f"{restart_file_path}*"
f"{config['ini_restart_dir']}{restart_file}*"
f"{config['ini_restart_date'].year}"
f"{config['ini_restart_date'].month:02}"
f"{config['ini_restart_date'].day:02}"
Expand All @@ -577,7 +577,13 @@ def add_restart_files(self, restart_file_label, fconfig):
restart_file = os.path.basename(glob_restart_file[0])
elif len(glob_restart_file) == 0:
restart_file = restart_file_path
if not os.path.isfile(restart_file):
# in case config["restart_in_sources"] are given explicitly
# AND are not absolute paths as e.g. in FOCI
# ini_parent_dir: "${general.ini_parent_dir}/oasis3mct/"
# restart_in_sources: sstocean_${parent_expid}_...
# we need to check for the full path as well
# btw it was a nightmare to track this down
if not os.path.isfile(restart_file) and not os.path.isfile(f"{config['ini_restart_dir']}/{restart_file}"):
user_error(
"Restart file missing",
f"No OASIS restart file for ``{restart_file_label}`` found "
Expand Down
64 changes: 61 additions & 3 deletions src/esm_runscripts/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,21 @@ def get_jobid():
return os.environ.get("SLURM_JOB_ID")

def prepare_launcher(self, config, cluster):
# which launcher are we using?
launcher = config["computer"].get("launcher",None)
# friendly check that you are using a launcher that we support
if launcher not in ["srun", "mpirun"]:
print(" The launcher %s is not compatible with ESM-Tools in SLURM " % (launcher,))
print(" Supported launchers for SLURM are srun and mpirun ")

# MA: not sure how this will play with heterogeneous parallelization
if "multi_srun" in config["general"]:
for run_type in list(config["general"]["multi_srun"]):
current_hostfile = self.path + "_" + run_type
write_one_hostfile(current_hostfile, config)
if launcher == "srun":
write_one_hostfile_srun(current_hostfile, config)
elif launcher == "mpirun":
write_one_hostfile_mpirun(current_hostfile, config)

if config["computer"].get(
"heterogeneous_parallelization", False
Expand All @@ -76,7 +86,11 @@ def prepare_launcher(self, config, cluster):
config["general"]["batch"].het_par_launcher_lines(config, cluster)
else:
# Standard/old way of running jobs with slurm
self.write_one_hostfile(self.path, config)
if launcher == "srun":
self.write_one_hostfile_srun(self.path, config)
elif launcher == "mpirun":
# JK: Need to think about how to handle heterogeneous parallelisation here...
self.write_one_hostfile_mpirun(self.path, config)

hostfile_in_work = (
config["general"]["work_dir"] + "/" + os.path.basename(self.path)
Expand All @@ -85,10 +99,11 @@ def prepare_launcher(self, config, cluster):

return config

def write_one_hostfile(self, hostfile, config):
def write_one_hostfile_srun(self, hostfile, config):
"""
Gathers previously prepared requirements
(batch_system.calculate_requirements) and writes them to ``self.path``.
Suitable for srun
"""

with open(hostfile, "w") as hostfile:
Expand All @@ -112,7 +127,50 @@ def write_one_hostfile(self, hostfile, config):
hostfile.write(
str(start_proc) + "-" + str(end_proc) + " " + command + "\n"
)

def write_one_hostfile_mpirun(self, hostfile, config):
    """
    Gathers previously prepared requirements
    (batch_system.calculate_requirements) and writes them to ``hostfile``.
    Suitable for the mpirun launcher (MPMD ``-np N ./cmd : -np M ./cmd2``
    syntax).

    Parameters
    ----------
    hostfile : str
        Path of the hostfile to write.
    config : dict
        Run configuration. Each entry under a name from
        ``config["general"]["valid_model_names"]`` is expected to carry
        ``start_proc``/``end_proc`` (inclusive processor range) and either
        an ``execution_command`` or an ``executable``.
    """
    # Collect one " -np N ./cmd " chunk per model; joined with ":" below.
    per_model_options = []

    for model in config["general"]["valid_model_names"]:
        start_proc = config[model].get("start_proc", None)
        end_proc = config[model].get("end_proc", None)

        # A model component like oasis3mct does not need cores since it
        # is technically a library, so start_proc/end_proc are None: skip it.
        if start_proc is None or end_proc is None:
            continue

        # Number of cores needed (the processor range is inclusive)
        no_cpus = end_proc - start_proc + 1

        # Determine what to run: execution_command takes precedence over
        # the bare executable name.
        if "execution_command" in config[model]:
            command = "./" + config[model]["execution_command"]
        elif "executable" in config[model]:
            command = "./" + config[model]["executable"]
        else:
            print(
                "warning: the executable or execution_command could not "
                "be determined for %s" % (model,)
            )
            continue

        per_model_options.append(" -np %d %s " % (no_cpus, command))

    # ":" separates the per-model commands in mpirun's MPMD syntax; using
    # join avoids the manual trailing-separator strip.
    mpirun_options = ":".join(per_model_options)

    with open(hostfile, "w") as hostfile_handle:
        hostfile_handle.write(mpirun_options)


@staticmethod
def get_job_state(jobid):
"""
Expand Down
Loading