Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/esm_environment/esm_environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,8 +548,11 @@ def write_dummy_script(self, include_set_e=True):
print('WARNING: "sh_interpreter" not defined in the machine yaml')
with open("dummy_script.sh", "w") as script_file:
# Write the file headings
#script_file.write(
# f'#!{self.config.get("sh_interpreter", "/bin/bash")} -l\n'
#)
script_file.write(
f'#!{self.config.get("sh_interpreter", "/bin/bash")} -l\n'
f'#!{self.config.get("sh_interpreter", "/bin/bash")}\n'
)
script_file.write(
"# Dummy script generated by esm-tools, to be removed later: \n"
Expand Down
26 changes: 16 additions & 10 deletions src/esm_runscripts/oasis.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,11 @@ def __init__(
self.namcouple += [" $NBMODEL", " " + str(exec_entry), " $END"]
self.namcouple += [" $RUNTIME", " " + str(runtime), " $END"]
if lucia:
if mct_version >= (5, 0):
self.namcouple += [" $NLOGPRT", " " + str(debug_level) + " 0 1", " $END"]
# LUCIA (load balancing) is done differently in MCT 5.0
if mct_version >= (5,0):
# In MCT5 you set X Y Z, where X refers to verbosity, Y to timing info and Z to load balancing
# Here: Set X = debug_level, Y = 0 (no info), Z = 1 (activate load balancing)
self.namcouple += [" $NLOGPRT", " " + str(debug_level) + " 0 1 ", " $END"]
else:
self.namcouple += [" $NLOGPRT", " " + "1 -1", " $END"]
else:
Expand Down Expand Up @@ -540,12 +543,9 @@ def add_restart_files(self, restart_file_label, fconfig):
config["restart_in_in_work"][restart_file_label] = restart_file

# In case of a branch-off experiment -> use the correct oasis restart files:
# Not the soft link to the last, but the actual one for the branch-off date
if (
gconfig["run_number"] == 1
and config["lresume"]
and gconfig["jobtype"] == "prepcompute"
):
# Not the rstas.nc soft link to the last, but the actual one for the
# branch-off date
if gconfig["run_number"] == 1 and config["lresume"] and gconfig["jobtype"] == "prepcompute" and config.get("norestart", "F") == "F":
# If they do not exist, define ``ini_restart_date`` and ``ini_restart_dir``
# based on ``ini_parent_date`` and ``ini_parent_dir``
if "ini_parent_date" in config and "ini_restart_date" not in config:
Expand All @@ -561,7 +561,7 @@ def add_restart_files(self, restart_file_label, fconfig):
# check if restart file with ini_restart_date in filename is in the restart
# folder of the parent experiment to be branched off from:
glob_search_file = (
f"{restart_file_path}*"
f"{config['ini_restart_dir']}{restart_file}*"
f"{config['ini_restart_date'].year}"
f"{config['ini_restart_date'].month:02}"
f"{config['ini_restart_date'].day:02}"
Expand All @@ -577,7 +577,13 @@ def add_restart_files(self, restart_file_label, fconfig):
restart_file = os.path.basename(glob_restart_file[0])
elif len(glob_restart_file) == 0:
restart_file = restart_file_path
if not os.path.isfile(restart_file):
# in case config["restart_in_sources"] are given explicitly
# AND are not absolute paths as e.g. in FOCI
# ini_parent_dir: "${general.ini_parent_dir}/oasis3mct/"
# restart_in_sources: sstocean_${parent_expid}_...
# we need to check for the full path as well
# btw it was a nightmare to track this down
if not os.path.isfile(restart_file) and not os.path.isfile(f"{config['ini_restart_dir']}/{restart_file}"):
user_error(
"Restart file missing",
f"No OASIS restart file for ``{restart_file_label}`` found "
Expand Down
64 changes: 61 additions & 3 deletions src/esm_runscripts/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,21 @@ def get_jobid():
return os.environ.get("SLURM_JOB_ID")

def prepare_launcher(self, config, cluster):
# which launcher are we using?
launcher = config["computer"].get("launcher",None)
# friendly check that you are using a launcher that we support
if launcher not in ["srun", "mpirun"]:
print(" The launcher %s is not compatible with ESM-Tools in SLURM " % (launcher,))
print(" Supported launchers for SLURM are srun and mpirun ")

# MA: not sure how this will play with heterogeneous parallelization
if "multi_srun" in config["general"]:
for run_type in list(config["general"]["multi_srun"]):
current_hostfile = self.path + "_" + run_type
write_one_hostfile(current_hostfile, config)
if launcher == "srun":
write_one_hostfile_srun(current_hostfile, config)
elif launcher == "mpirun":
write_one_hostfile_mpirun(current_hostfile, config)

if config["computer"].get(
"heterogeneous_parallelization", False
Expand All @@ -76,7 +86,11 @@ def prepare_launcher(self, config, cluster):
config["general"]["batch"].het_par_launcher_lines(config, cluster)
else:
# Standard/old way of running jobs with slurm
self.write_one_hostfile(self.path, config)
if launcher == "srun":
self.write_one_hostfile_srun(self.path, config)
elif launcher == "mpirun":
# JK: Need to think about how to handle heterogeneous parallelisation here...
self.write_one_hostfile_mpirun(self.path, config)

hostfile_in_work = (
config["general"]["work_dir"] + "/" + os.path.basename(self.path)
Expand All @@ -85,10 +99,11 @@ def prepare_launcher(self, config, cluster):

return config

def write_one_hostfile(self, hostfile, config):
def write_one_hostfile_srun(self, hostfile, config):
"""
Gathers previously prepared requirements
(batch_system.calculate_requirements) and writes them to ``self.path``.
Suitable for srun
"""

with open(hostfile, "w") as hostfile:
Expand All @@ -112,7 +127,50 @@ def write_one_hostfile(self, hostfile, config):
hostfile.write(
str(start_proc) + "-" + str(end_proc) + " " + command + "\n"
)

def write_one_hostfile_mpirun(self, hostfile, config):
    """
    Gathers previously prepared requirements
    (batch_system.calculate_requirements) and writes them to ``hostfile``.
    Suitable for the mpirun launcher (MPMD ``-np N ./cmd : -np M ./cmd2``
    syntax).

    Parameters
    ----------
    hostfile : str
        Path of the hostfile to write.
    config : dict
        Run configuration. Each entry under a name from
        ``config["general"]["valid_model_names"]`` is expected to carry
        ``start_proc``/``end_proc`` (inclusive processor range) and either
        an ``execution_command`` or an ``executable``.
    """
    # Collect one " -np N ./cmd " chunk per model; joined with ":" below.
    per_model_options = []

    for model in config["general"]["valid_model_names"]:
        start_proc = config[model].get("start_proc", None)
        end_proc = config[model].get("end_proc", None)

        # A model component like oasis3mct does not need cores since it
        # is technically a library, so start_proc/end_proc are None: skip it.
        if start_proc is None or end_proc is None:
            continue

        # Number of cores needed (the processor range is inclusive)
        no_cpus = end_proc - start_proc + 1

        # Determine what to run: execution_command takes precedence over
        # the bare executable name.
        if "execution_command" in config[model]:
            command = "./" + config[model]["execution_command"]
        elif "executable" in config[model]:
            command = "./" + config[model]["executable"]
        else:
            print(
                "warning: the executable or execution_command could not "
                "be determined for %s" % (model,)
            )
            continue

        per_model_options.append(" -np %d %s " % (no_cpus, command))

    # ":" separates the per-model commands in mpirun's MPMD syntax; using
    # join avoids the manual trailing-separator strip.
    mpirun_options = ":".join(per_model_options)

    with open(hostfile, "w") as hostfile_handle:
        hostfile_handle.write(mpirun_options)


@staticmethod
def get_job_state(jobid):
"""
Expand Down
Loading