esm-tools · mandresm · Jun 7, 2023 · Jun 7, 2023 · Jun 21, 2023 · pgierz
diff --git a/src/esm_runscripts/filedicts.py b/src/esm_runscripts/filedicts.py
@@ -12,6 +12,7 @@
 import functools
 import glob
 import inspect
+import hashlib
 import os
 import pathlib
 import shutil
@@ -1207,17 +1208,18 @@ def from_config(cls, config: dict):
                 sim_files[file_key] = SimulationFile.from_config(
                     config, f"{config_address}.{file_key}"
                 )
+                sim_files[file_key]["component"] = component
         return sim_files
 
     def _gather_file_movements(self) -> None:
         """Puts the methods for each file movement into the dictionary as callable values behind the `_filesystem_op` key"""
         for sim_file_id, sim_file_obj in self.items():
-            movement_type = sim_file_obj.get("movement_type", "cp")
-            if movement_type == "mv":
+            movement_type = sim_file_obj.get("movement_type", "copy")
+            if movement_type == "move":
                 self[sim_file_id]["_filesystem_op"] = getattr(sim_file_obj, "mv")
-            elif movement_type == "cp":
+            elif movement_type == "copy":
                 self[sim_file_id]["_filesystem_op"] = getattr(sim_file_obj, "cp")
-            elif movement_type == "ln":
+            elif movement_type == "link":
                 self[sim_file_id]["_filesystem_op"] = getattr(sim_file_obj, "ln")
             else:
                 raise ValueError(
@@ -1231,12 +1233,19 @@ def execute_filesystem_operation(
         for sim_file_id, sim_file_obj in self.items():
             logger.info(f"Processing {sim_file_id}")
             if config["general"]["jobtype"] == "prepcompute":
-                src, dest = "pool", "work"
+                src, dest = "computer", "work"
             elif config["general"]["jobtype"] == "tidy":
                 src, dest = "work", "exp_tree"
             else:
                 raise ValueError(f"Incorrect jobtype specified for {sim_file_obj}")
             sim_file_obj["_filesystem_op"](src, dest)
+            config[sim_file_obj["component"]]["files"][sim_file_id]["src"] = (
+                sim_file_obj.paths[src]
+            )
+            config[sim_file_obj["component"]]["files"][sim_file_id]["intermediate"] = None
+            config[sim_file_obj["component"]]["files"][sim_file_id]["dest"] = (
+                sim_file_obj.paths[dest]
+            )
         return config
 
 
@@ -1258,3 +1267,66 @@ def resolve_file_movements(config: ConfigSetup) -> ConfigSetup:
     sim_file_collection = SimulationFileCollection.from_config(config)
     config = sim_file_collection.execute_filesystem_operation(config)
     return config
+
+
+def log_used_files(config: ConfigSetup) -> ConfigSetup:
+    """
+    Writes a YAML log of the files moved on this current phase.
+
+    Each ``files`` entry is logged with its source, intermediate and target, plus
+    the MD5 checksum of the target (``None`` if the target does not exist).
+
+    Parameters
+    ----------
+    config : ConfigSetup
+        The complete simulation configuration.
+
+    Returns
+    -------
+    config : ConfigSetup
+        The same configuration object that was passed in.
+    """
+    general = config["general"]
+    verbose = general.get("verbose", False)
+    if verbose:
+        logger.info("\n::: Logging used files")
+
+    it_coupled_model_name = general.get("iterative_coupled_model", "")
+    flist_file = (
+        f"{general['thisrun_log_dir']}/{general['expid']}_"
+        f"{it_coupled_model_name}filelist_{general['run_datestamp']}.yaml"
+    )
+    all_files = {}
+    for model in general["valid_model_names"] + ["general"]:
+        model_files = {}
+        for filetype in general["relevant_filetypes"]:
+            for file_key, file_obj in config[model].get("files", {}).items():
+                try:
+                    md5 = hashlib.md5()
+                    with open(file_obj["dest"], "rb") as target:
+                        # Hash in chunks so large files never sit fully in RAM
+                        for chunk in iter(lambda: target.read(1 << 20), b""):
+                            md5.update(chunk)
+                    checksum = md5.hexdigest()
+                except FileNotFoundError:
+                    checksum = None
+                # NOTE(review): logs the loop's filetype, not the file's own kind
+                model_files[file_key] = {
+                    "source": str(file_obj["src"]),
+                    "intermediate": file_obj["intermediate"],
+                    "target": str(file_obj["dest"]),
+                    "checksum": checksum,
+                    "kind": filetype,
+                }
+                if verbose:
+                    logger.info(f"::: logging file category: {filetype}")
+                    logger.info(f"- source: {file_obj['src']}")
+                    logger.info(f"- target: {file_obj['dest']}")
+                    helpers.print_datetime(config)
+        if model_files:
+            all_files[model] = model_files
+
+    with open(flist_file, "w") as flist:
+        yaml.dump(all_files, flist)
+
+    return config
diff --git a/tests/test_esm_runscripts/test_filedicts.py b/tests/test_esm_runscripts/test_filedicts.py
@@ -1115,3 +1115,63 @@ def test_globbing_ln(fs):
 
     for nf in expected_new_paths:
         assert os.path.exists(nf)
+
+
+def test_file_log(fs):
+    """Checks that log_used_files writes the expected YAML file list"""
+    dummy_config = """
+    general:
+        expid: expid
+        base_dir: /some/dummy/location/
+        thisrun_work_dir: "/work/ollie/pgierz/some_exp/run_20010101-20011231/work"
+        thisrun_log_dir: "/work/ollie/pgierz/some_exp/run_20010101-20011231/log"
+        exp_dir: "/work/ollie/pgierz/some_exp"
+        thisrun_dir: "/work/ollie/pgierz/some_exp/run_20010101-20011231"
+        all_model_filetypes: [analysis, bin, config, forcing, input, couple, log, mon, outdata, restart, viz, ignore]
+        jobtype: "prepcompute"
+        valid_model_names: ["echam"]
+        relevant_filetypes: ["input"]
+        run_datestamp: "20010101-20011231"
+    computer:
+        pool_dir: "/work/ollie/pool"
+    echam:
+        experiment_input_dir: /work/ollie/pgierz/some_exp/input/echam
+        thisrun_input_dir: /work/ollie/pgierz/some_exp/run_20010101-20011231/input/echam
+        files:
+            human_readable_tag_001:
+                kind: input
+                allowed_to_be_missing: True
+                name_in_computer: foo
+                path_in_computer: /work/data/pool
+                name_in_work: foo
+                path_in_work: .
+                movement_type: move
+    """
+    # Expected log content; the checksum is the MD5 of an empty file (foo is empty)
+    check_log_file = """ \
+echam:
+    human_readable_tag_001:
+        checksum: d41d8cd98f00b204e9800998ecf8427e
+        intermediate: null
+        source: /work/data/pool/foo
+        target: /work/ollie/pgierz/some_exp/run_20010101-20011231/work/foo
+        kind: input
+    """
+    date = esm_calendar.Date("2000-01-01T00:00:00")
+    config = yaml.safe_load(dummy_config)
+    config["general"]["current_date"] = date
+    fs.create_dir("/work/data/pool")
+    fs.create_file("/work/data/pool/foo")
+    fs.create_dir("/work/ollie/pgierz/some_exp/run_20010101-20011231/work")
+    fs.create_dir("/work/ollie/pgierz/some_exp/run_20010101-20011231/log")
+
+    config = esm_runscripts.filedicts.SimulationFileCollection.from_config(
+        config
+    ).execute_filesystem_operation(config)
+    esm_runscripts.filedicts.log_used_files(config)
+
+    log_path = "/work/ollie/pgierz/some_exp/run_20010101-20011231/log/expid_filelist_20010101-20011231.yaml"
+    with open(log_path, "r") as log_handle:
+        log_file = log_handle.read()
+
+    assert log_file == yaml.dump(yaml.safe_load(check_log_file))