From 73b4a48af617fd8c7b599ec9ec41a34cb5fde3be Mon Sep 17 00:00:00 2001 From: jiongwalai Date: Thu, 24 Apr 2025 14:47:16 +0800 Subject: [PATCH 01/49] support NvNMD-train and NvNMD-explore --- dpgen2/constants.py | 2 + dpgen2/entrypoint/args.py | 2 + dpgen2/entrypoint/submit.py | 42 +- dpgen2/exploration/task/lmp/lmp_input.py | 18 +- .../task/lmp_template_task_group.py | 65 +- .../task/make_task_group_from_config.py | 29 + dpgen2/exploration/task/npt_task_group.py | 3 + dpgen2/flow/dpgen_loop.py | 39 + dpgen2/op/__init__.py | 10 + dpgen2/op/prep_nvnmd_train.py | 119 +++ dpgen2/op/run_nvnmd.py | 404 ++++++++++ dpgen2/op/run_nvnmd_train.py | 693 ++++++++++++++++++ dpgen2/superop/__init__.py | 3 + dpgen2/superop/block.py | 20 +- dpgen2/superop/prep_run_nvnmd_train.py | 255 +++++++ dpgen2/utils/download_dpgen2_artifacts.py | 6 + 16 files changed, 1693 insertions(+), 17 deletions(-) create mode 100644 dpgen2/op/prep_nvnmd_train.py create mode 100644 dpgen2/op/run_nvnmd.py create mode 100644 dpgen2/op/run_nvnmd_train.py create mode 100644 dpgen2/superop/prep_run_nvnmd_train.py diff --git a/dpgen2/constants.py b/dpgen2/constants.py index 6d5d0197..3f9c1e69 100644 --- a/dpgen2/constants.py +++ b/dpgen2/constants.py @@ -1,6 +1,8 @@ train_index_pattern = "%04d" train_task_pattern = "task." 
+ train_index_pattern train_script_name = "input.json" +train_cnn_script_name = "input_cnn.json" +train_qnn_script_name = "input_qnn.json" train_log_name = "train.log" model_name_pattern = "model.%03d.pb" pytorch_model_name_pattern = "model.%03d.pth" diff --git a/dpgen2/entrypoint/args.py b/dpgen2/entrypoint/args.py index df11ff7f..b4553157 100644 --- a/dpgen2/entrypoint/args.py +++ b/dpgen2/entrypoint/args.py @@ -133,6 +133,7 @@ def variant_train(): "type", [ Argument("dp", dict, dp_train_args()), + Argument("dp-nvnmd", dict, dp_train_args()), Argument("dp-dist", dict, dp_dist_train_args()), ], doc=doc, @@ -454,6 +455,7 @@ def variant_explore(): "type", [ Argument("lmp", dict, lmp_args(), doc=doc_lmp), + Argument("nvnmd", dict, lmp_args(), doc=doc_lmp), Argument("calypso", dict, caly_args(), doc=doc_calypso), Argument("calypso:default", dict, caly_args(), doc=doc_calypso), Argument("calypso:merge", dict, caly_args(), doc=doc_calypso), diff --git a/dpgen2/entrypoint/submit.py b/dpgen2/entrypoint/submit.py index b069e2e5..f3f00d3e 100644 --- a/dpgen2/entrypoint/submit.py +++ b/dpgen2/entrypoint/submit.py @@ -104,13 +104,16 @@ PrepCalyInput, PrepCalyModelDevi, PrepDPTrain, + PrepNvNMDTrain, PrepLmp, PrepRelax, RunCalyDPOptim, RunCalyModelDevi, RunDPTrain, + RunNvNMDTrain, RunLmp, RunLmpHDF5, + RunNvNMD, RunRelax, RunRelaxHDF5, SelectConfs, @@ -123,6 +126,7 @@ PrepRunCaly, PrepRunDiffCSP, PrepRunDPTrain, + PrepRunNvNMDTrain, PrepRunFp, PrepRunLmp, ) @@ -182,6 +186,17 @@ def make_concurrent_learning_op( valid_data=valid_data, optional_files=train_optional_files, ) + elif train_style == "dp-nvnmd": + prep_run_train_op = PrepRunNvNMDTrain( + "prep-run-nvnmd-train", + PrepNvNMDTrain, + RunNvNMDTrain, + prep_config=prep_train_config, + run_config=run_train_config, + upload_python_packages=upload_python_packages, + valid_data=valid_data, + optional_files=train_optional_files, + ) else: raise RuntimeError(f"unknown train_style {train_style}") if explore_style == "lmp": @@ 
-193,6 +208,15 @@ def make_concurrent_learning_op( run_config=run_explore_config, upload_python_packages=upload_python_packages, ) + elif "nvnmd" in explore_style: + prep_run_explore_op = PrepRunLmp( + "prep-run-nvnmd", + PrepLmp, + RunNvNMD, + prep_config=prep_explore_config, + run_config=run_explore_config, + upload_python_packages=upload_python_packages, + ) elif "calypso" in explore_style: expl_mode = explore_style.split(":")[-1] if ":" in explore_style else "default" if expl_mode == "merge": @@ -286,7 +310,7 @@ def make_naive_exploration_scheduler( # use npt task group explore_style = config["explore"]["type"] - if explore_style == "lmp": + if explore_style in ("lmp", "nvnmd"): return make_lmp_naive_exploration_scheduler(config) elif "calypso" in explore_style or explore_style == "diffcsp": return make_naive_exploration_scheduler_without_conf(config, explore_style) @@ -374,6 +398,7 @@ def make_lmp_naive_exploration_scheduler(config): output_nopbc = config["explore"]["output_nopbc"] conf_filters = get_conf_filters(config["explore"]["filters"]) use_ele_temp = config["inputs"]["use_ele_temp"] + config["explore"]["type"] scheduler = ExplorationScheduler() # report conv_style = convergence.pop("type") @@ -506,6 +531,16 @@ def workflow_concurrent_learning( else None ) config["train"]["numb_models"] = 1 + + elif train_style == "dp-nvnmd": + init_models_paths = config["train"].get("init_models_paths", None) + numb_models = config["train"]["numb_models"] + if init_models_paths is not None and len(init_models_paths) != numb_models: + raise RuntimeError( + f"{len(init_models_paths)} init models provided, which does " + f"not match numb_models={numb_models}" + ) + else: raise RuntimeError(f"unknown params, train_style: {train_style}") @@ -625,6 +660,8 @@ def workflow_concurrent_learning( init_models = get_artifact_from_uri(config["train"]["init_models_uri"]) elif train_style == "dp-dist" and config["train"]["student_model_uri"] is not None: init_models = 
get_artifact_from_uri(config["train"]["student_model_uri"]) + elif train_style == "dp-nvnmd" and config["train"]["init_models_uri"] is not None: + init_models = get_artifact_from_uri(config["train"]["init_models_uri"]) elif init_models_paths is not None: init_models = upload_artifact_and_print_uri(init_models_paths, "init_models") else: @@ -662,6 +699,9 @@ def workflow_concurrent_learning( }, artifacts={ "init_models": init_models, + "init_models_ckpt_meta": None, + "init_models_ckpt_index": None, + "init_models_ckpt_data": None, "init_data": init_data, "iter_data": iter_data, }, diff --git a/dpgen2/exploration/task/lmp/lmp_input.py b/dpgen2/exploration/task/lmp/lmp_input.py index c2a22b60..cb42f214 100644 --- a/dpgen2/exploration/task/lmp/lmp_input.py +++ b/dpgen2/exploration/task/lmp/lmp_input.py @@ -50,6 +50,7 @@ def make_lmp_input( nopbc: bool = False, max_seed: int = 1000000, deepmd_version="2.0", + nvnmd_version=None, trj_seperate_files=True, pimd_bead: Optional[str] = None, ): @@ -69,9 +70,9 @@ def make_lmp_input( ret += "variable THERMO_FREQ equal %d\n" % trj_freq ret += "variable DUMP_FREQ equal %d\n" % trj_freq ret += "variable TEMP equal %f\n" % temp - if ele_temp_f is not None: + if ele_temp_f is not None and nvnmd_version is None: ret += "variable ELE_TEMP equal %f\n" % ele_temp_f - if ele_temp_a is not None: + if ele_temp_a is not None and nvnmd_version is None: ret += "variable ELE_TEMP equal %f\n" % ele_temp_a if pres is not None: ret += "variable PRES equal %f\n" % pres @@ -106,12 +107,16 @@ def make_lmp_input( if pimd_bead is not None else lmp_model_devi_name ) - if Version(deepmd_version) < Version("1"): + if Version(deepmd_version) < Version("1") and nvnmd_version is None: # 0.x ret += "pair_style deepmd %s ${THERMO_FREQ} %s\n" % ( graph_list, model_devi_file_name, ) + elif nvnmd_version is not None: + ret += "pair_style nvnmd %s\n" % ( + "model.pb" + ) else: # 1.x keywords = "" @@ -146,6 +151,8 @@ def make_lmp_input( ) ret += "restart 10000 
dpgen.restart\n" ret += "\n" + if(nvnmd_version is not None): + ret += 'if "${rerun} > 0" then "jump SELF rerun"\n' if pka_e is None: ret += 'if "${restart} == 0" then "velocity all create ${TEMP} %d"' % ( random.randrange(max_seed - 1) + 1 @@ -193,4 +200,9 @@ def make_lmp_input( ret += "\n" ret += "timestep %f\n" % dt ret += "run ${NSTEPS} upto\n" + if(nvnmd_version is not None): + ret += 'jump SELF end\n' + ret += 'label rerun\n' + ret += 'rerun %s.0 dump x y z fx fy fz\n' % lmp_traj_file_name + ret += 'label end\n' return ret diff --git a/dpgen2/exploration/task/lmp_template_task_group.py b/dpgen2/exploration/task/lmp_template_task_group.py index 1a44cb8e..de97b05f 100644 --- a/dpgen2/exploration/task/lmp_template_task_group.py +++ b/dpgen2/exploration/task/lmp_template_task_group.py @@ -47,12 +47,14 @@ def set_lmp( revisions: dict = {}, traj_freq: int = 10, extra_pair_style_args: str = "", + nvnmd_version: Optional[str] = None, pimd_bead: Optional[str] = None, ) -> None: self.lmp_template = Path(lmp_template_fname).read_text().split("\n") self.revisions = revisions self.traj_freq = traj_freq self.extra_pair_style_args = extra_pair_style_args + self.nvnmd_version = nvnmd_version self.pimd_bead = pimd_bead self.lmp_set = True self.model_list = sorted([model_name_pattern % ii for ii in range(numb_models)]) @@ -62,10 +64,16 @@ def set_lmp( self.traj_freq, self.extra_pair_style_args, self.pimd_bead, + nvnmd_version=self.nvnmd_version, ) self.lmp_template = revise_lmp_input_dump( - self.lmp_template, self.traj_freq, self.pimd_bead + self.lmp_template, + self.traj_freq, + self.pimd_bead, + nvnmd_version=self.nvnmd_version, ) + if(nvnmd_version is not None): + self.lmp_template = revise_lmp_input_rerun(self.lmp_template) if plm_template_fname is not None: self.plm_template = Path(plm_template_fname).read_text().split("\n") self.plm_set = True @@ -158,8 +166,8 @@ def revise_lmp_input_model( extra_pair_style_args="", pimd_bead=None, deepmd_version="1", + 
nvnmd_version=None, ): - idx = find_only_one_key(lmp_lines, ["pair_style", "deepmd"]) if extra_pair_style_args: extra_pair_style_args = " " + extra_pair_style_args graph_list = " ".join(task_model_list) @@ -168,23 +176,41 @@ if pimd_bead is not None else lmp_model_devi_name ) - lmp_lines[idx] = "pair_style deepmd %s out_freq %d out_file %s%s" % ( - graph_list, - trj_freq, - model_devi_file_name, - extra_pair_style_args, - ) + if(nvnmd_version is None): + idx = find_only_one_key(lmp_lines, ["pair_style", "deepmd"]) + lmp_lines[idx] = "pair_style deepmd %s out_freq %d out_file %s%s" % ( + graph_list, + trj_freq, + model_devi_file_name, + extra_pair_style_args, + ) + else: + idx = find_only_one_key(lmp_lines, ["pair_style", "nvnmd"]) + lmp_lines[idx] = "pair_style nvnmd %s %s" % ( + "model.pb", + extra_pair_style_args + ) + return lmp_lines -def revise_lmp_input_dump(lmp_lines, trj_freq, pimd_bead=None): +def revise_lmp_input_dump(lmp_lines, trj_freq, pimd_bead=None,nvnmd_version=None): idx = find_only_one_key(lmp_lines, ["dump", "dpgen_dump"]) lmp_traj_file_name = ( lmp_pimd_traj_name % pimd_bead if pimd_bead is not None else lmp_traj_name ) - lmp_lines[ - idx - ] = f"dump dpgen_dump all custom {trj_freq} {lmp_traj_file_name} id type x y z" + if(nvnmd_version is None): + lmp_lines[ + idx + ] = f"dump dpgen_dump all custom {trj_freq} {lmp_traj_file_name} id type x y z" + else: + lmp_lines[ + idx + ] = f"dump dpgen_dump all custom {trj_freq} {lmp_traj_file_name} id type x y z fx fy fz" + lmp_lines.insert( + idx+1, + 'if \"${rerun} > 0\" then \"jump SELF rerun\"' + ) return lmp_lines @@ -196,6 +222,21 @@ def revise_lmp_input_plm(lmp_lines, in_plm, out_plm="output.plumed"): ) return lmp_lines +def revise_lmp_input_rerun(lmp_lines): + lmp_lines.append( + 'jump SELF end' + ) + lmp_lines.append( + 'label rerun' + ) + lmp_lines.append( + f'rerun {lmp_traj_name}.0 dump x y z fx fy fz' + ) + lmp_lines.append( + 'label end' + ) + return 
lmp_lines + def revise_by_keys(lmp_lines, keys, values): for kk, vv in zip(keys, values): # type: ignore diff --git a/dpgen2/exploration/task/make_task_group_from_config.py b/dpgen2/exploration/task/make_task_group_from_config.py index 3b793c58..05bfac30 100644 --- a/dpgen2/exploration/task/make_task_group_from_config.py +++ b/dpgen2/exploration/task/make_task_group_from_config.py @@ -297,12 +297,21 @@ def variant_task_group(): Argument( "lmp-md", dict, npt_task_group_args(), alias=["lmp-npt"], doc=doc_lmp_md ), + Argument( + "lmp-nvnmd", dict, npt_task_group_args(), alias=["lmp-nvnmd-npt"], doc=doc_lmp_md + ), Argument( "lmp-template", dict, lmp_template_task_group_args(), doc=doc_lmp_template, ), + Argument( + "lmp-nvnmd-template", + dict, + lmp_template_task_group_args(), + doc=doc_lmp_template, + ), Argument( "customized-lmp-template", dict, @@ -618,6 +627,7 @@ def make_lmp_task_group_from_config( config["conf_idx"] = [] if "conf_idx" not in config else None config = lmp_normalize(config) config = config_strip_confidx(config) + if config["type"] == "lmp-md": tgroup = NPTTaskGroup() config.pop("type") @@ -626,6 +636,15 @@ def make_lmp_task_group_from_config( mass_map, **config, ) + elif config["type"] == "lmp-nvnmd": + tgroup = NPTTaskGroup() + config.pop("type") + config["nvnmd_version"] = "0.0" + tgroup.set_md( + numb_models, + mass_map, + **config, + ) elif config["type"] == "lmp-template": tgroup = LmpTemplateTaskGroup() config.pop("type") @@ -635,6 +654,16 @@ def make_lmp_task_group_from_config( lmp_template, **config, ) + elif config["type"] == "lmp-nvnmd-template": + tgroup = LmpTemplateTaskGroup() + config.pop("type") + config["nvnmd_version"] = "0.0" + lmp_template = config.pop("lmp_template_fname") + tgroup.set_lmp( + numb_models, + lmp_template, + **config, + ) elif config["type"] == "customized-lmp-template": tgroup = CustomizedLmpTemplateTaskGroup() config.pop("type") diff --git a/dpgen2/exploration/task/npt_task_group.py 
b/dpgen2/exploration/task/npt_task_group.py index 27c1e001..e597071b 100644 --- a/dpgen2/exploration/task/npt_task_group.py +++ b/dpgen2/exploration/task/npt_task_group.py @@ -49,6 +49,7 @@ def set_md( relative_v_epsilon: Optional[float] = None, ele_temp_f: Optional[float] = None, ele_temp_a: Optional[float] = None, + nvnmd_version: Optional[str] = None, pimd_bead: Optional[str] = None, ): """ @@ -73,6 +74,7 @@ def set_md( self.ele_temp_f = ele_temp_f self.ele_temp_a = ele_temp_a self.md_set = True + self.nvnmd_version = nvnmd_version self.pimd_bead = pimd_bead def make_task( @@ -132,6 +134,7 @@ def _make_lmp_task( self.ele_temp_f, self.ele_temp_a, self.no_pbc, + nvnmd_version = self.nvnmd_version, trj_seperate_files=False, pimd_bead=self.pimd_bead, ), diff --git a/dpgen2/flow/dpgen_loop.py b/dpgen2/flow/dpgen_loop.py index 190a1090..66949e20 100644 --- a/dpgen2/flow/dpgen_loop.py +++ b/dpgen2/flow/dpgen_loop.py @@ -186,6 +186,9 @@ def __init__( } self._input_artifacts = { "init_models": InputArtifact(optional=True), + "init_models_ckpt_meta": InputArtifact(optional=True), + "init_models_ckpt_data": InputArtifact(optional=True), + "init_models_ckpt_index": InputArtifact(optional=True), "init_data": InputArtifact(), "iter_data": InputArtifact(), } @@ -194,6 +197,9 @@ def __init__( } self._output_artifacts = { "models": OutputArtifact(), + "models_ckpt_meta": OutputArtifact(), + "models_ckpt_data": OutputArtifact(), + "models_ckpt_index": OutputArtifact(), "iter_data": OutputArtifact(), } @@ -277,6 +283,9 @@ def __init__( self._input_artifacts = { "init_models": InputArtifact(optional=True), + "init_models_ckpt_meta": InputArtifact(optional=True), + "init_models_ckpt_data": InputArtifact(optional=True), + "init_models_ckpt_index": InputArtifact(optional=True), "init_data": InputArtifact(), "iter_data": InputArtifact(), } @@ -285,6 +294,9 @@ def __init__( } self._output_artifacts = { "models": OutputArtifact(), + "models_ckpt_meta": OutputArtifact(), + 
"models_ckpt_data": OutputArtifact(), + "models_ckpt_index": OutputArtifact(), "iter_data": OutputArtifact(), } @@ -374,6 +386,9 @@ def _loop( parameters=block_common_parameters, artifacts={ "init_models": steps.inputs.artifacts["init_models"], + "init_models_ckpt_meta": steps.inputs.artifacts["init_models_ckpt_meta"], + "init_models_ckpt_index": steps.inputs.artifacts["init_models_ckpt_index"], + "init_models_ckpt_data": steps.inputs.artifacts["init_models_ckpt_data"], "init_data": steps.inputs.artifacts["init_data"], "iter_data": steps.inputs.artifacts["iter_data"], }, @@ -446,6 +461,9 @@ def _loop( parameters=next_common_parameters, artifacts={ "init_models": block_step.outputs.artifacts["models"], + "init_models_ckpt_meta": block_step.outputs.artifacts["models_ckpt_meta"], + "init_models_ckpt_data": block_step.outputs.artifacts["models_ckpt_data"], + "init_models_ckpt_index": block_step.outputs.artifacts["models_ckpt_index"], "init_data": steps.inputs.artifacts["init_data"], "iter_data": block_step.outputs.artifacts["iter_data"], }, @@ -465,6 +483,21 @@ def _loop( _then=block_step.outputs.artifacts["models"], _else=next_step.outputs.artifacts["models"], ) + steps.outputs.artifacts["models_ckpt_meta"].from_expression = if_expression( + _if=(scheduler_step.outputs.parameters["converged"] == True), + _then=block_step.outputs.artifacts["models_ckpt_meta"], + _else=next_step.outputs.artifacts["models_ckpt_meta"], + ) + steps.outputs.artifacts["models_ckpt_data"].from_expression = if_expression( + _if=(scheduler_step.outputs.parameters["converged"] == True), + _then=block_step.outputs.artifacts["models_ckpt_data"], + _else=next_step.outputs.artifacts["models_ckpt_data"], + ) + steps.outputs.artifacts["models_ckpt_index"].from_expression = if_expression( + _if=(scheduler_step.outputs.parameters["converged"] == True), + _then=block_step.outputs.artifacts["models_ckpt_index"], + _else=next_step.outputs.artifacts["models_ckpt_index"], + ) 
steps.outputs.artifacts["iter_data"].from_expression = if_expression( _if=(scheduler_step.outputs.parameters["converged"] == True), _then=block_step.outputs.artifacts["iter_data"], @@ -550,6 +583,9 @@ def _dpgen( parameters=common_parameters, artifacts={ "init_models": steps.inputs.artifacts["init_models"], + "init_models_ckpt_meta": steps.inputs.artifacts["init_models_ckpt_meta"], + "init_models_ckpt_data": steps.inputs.artifacts["init_models_ckpt_data"], + "init_models_ckpt_index": steps.inputs.artifacts["init_models_ckpt_index"], "init_data": steps.inputs.artifacts["init_data"], "iter_data": steps.inputs.artifacts["iter_data"], }, @@ -561,6 +597,9 @@ def _dpgen( "exploration_scheduler" ].value_from_parameter = loop_step.outputs.parameters["exploration_scheduler"] steps.outputs.artifacts["models"]._from = loop_step.outputs.artifacts["models"] + steps.outputs.artifacts["models_ckpt_meta"]._from = loop_step.outputs.artifacts["models_ckpt_meta"] + steps.outputs.artifacts["models_ckpt_data"]._from = loop_step.outputs.artifacts["models_ckpt_data"] + steps.outputs.artifacts["models_ckpt_index"]._from = loop_step.outputs.artifacts["models_ckpt_index"] steps.outputs.artifacts["iter_data"]._from = loop_step.outputs.artifacts[ "iter_data" ] diff --git a/dpgen2/op/__init__.py b/dpgen2/op/__init__.py index f4fec3a2..95f7f9c2 100644 --- a/dpgen2/op/__init__.py +++ b/dpgen2/op/__init__.py @@ -19,6 +19,9 @@ from .prep_dp_train import ( PrepDPTrain, ) +from .prep_nvnmd_train import ( + PrepNvNMDTrain, +) from .prep_lmp import ( PrepLmp, ) @@ -34,10 +37,17 @@ from .run_dp_train import ( RunDPTrain, ) +from .run_nvnmd_train import ( + RunNvNMDTrain, +) from .run_lmp import ( RunLmp, RunLmpHDF5, ) +from .run_nvnmd import ( + RunNvNMD, +) + from .run_relax import ( RunRelax, RunRelaxHDF5, diff --git a/dpgen2/op/prep_nvnmd_train.py b/dpgen2/op/prep_nvnmd_train.py new file mode 100644 index 00000000..a1600635 --- /dev/null +++ b/dpgen2/op/prep_nvnmd_train.py @@ -0,0 +1,119 @@ +import 
json +import random +import sys +from pathlib import ( + Path, +) +from typing import ( + List, + Tuple, + Union, +) + +from dflow.python import ( + OP, + OPIO, + Artifact, + BigParameter, + OPIOSign, +) + +from dpgen2.constants import ( + train_script_name, + train_task_pattern, +) + + +class PrepNvNMDTrain(OP): + r"""Prepares the working directories for DP training tasks. + + A list of (`numb_models`) working directories containing all files + needed to start training tasks will be created. The paths of the + directories will be returned as `op["task_paths"]`. The identities + of the tasks are returned as `op["task_names"]`. + + """ + + @classmethod + def get_input_sign(cls): + return OPIOSign( + { + "template_script": BigParameter(Union[dict, List[dict]]), + "numb_models": int, + } + ) + + @classmethod + def get_output_sign(cls): + return OPIOSign( + { + "task_names": BigParameter(List[str]), + "task_paths": Artifact(List[Path]), + } + ) + + @OP.exec_sign_check + def execute( + self, + ip: OPIO, + ) -> OPIO: + r"""Execute the OP. + + Parameters + ---------- + ip : dict + Input dict with components: + + - `template_script`: (`str` or `List[str]`) A template of the training script. Can be a `str` or `List[str]`. In the case of `str`, all training tasks share the same training input template, the only difference is the random number used to initialize the network parameters. In the case of `List[str]`, one training task uses one template from the list. The random numbers used to initialize the network parameters are differnt. The length of the list should be the same as `numb_models`. + - `numb_models`: (`int`) Number of DP models to train. + + Returns + ------- + op : dict + Output dict with components: + + - `task_names`: (`List[str]`) The name of tasks. Will be used as the identities of the tasks. The names of different tasks are different. + - `task_paths`: (`Artifact(List[Path])`) The parepared working paths of the tasks. 
The order fo the Paths should be consistent with `op["task_names"]` + + """ + template = ip["template_script"] + numb_models = ip["numb_models"] + osubdirs = [] + if type(template) != list: + template = [template for ii in range(numb_models)] + else: + if not (len(template) == numb_models): + raise RuntimeError( + f"length of the template list should be equal to {numb_models}" + ) + + for ii in range(numb_models): + # mkdir + subdir = Path(train_task_pattern % ii) + subdir.mkdir(exist_ok=True, parents=True) + osubdirs.append(str(subdir)) + # change random seed in template + idict = self._script_rand_seed(template[ii]) + # write input script + fname = subdir / train_script_name + with open(fname, "w") as fp: + json.dump(idict, fp, indent=4) + + op = OPIO( + { + "task_names": osubdirs, + "task_paths": [Path(ii) for ii in osubdirs], + } + ) + return op + + def _script_rand_seed( + self, + input_dict, + ): + jtmp = input_dict.copy() + + # the key "seed" in "nvnmd" is used to set the random seed for the network parameters, it is developing. 
+ #jtmp["nvnmd"]["seed"] = random.randrange(sys.maxsize) % (2**32) + jtmp["training"]["seed"] = random.randrange(sys.maxsize) % (2**32) + return jtmp diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py new file mode 100644 index 00000000..326ef35d --- /dev/null +++ b/dpgen2/op/run_nvnmd.py @@ -0,0 +1,404 @@ +import glob +import json +import logging +import os +import random +import re +from pathlib import ( + Path, +) +from typing import ( + List, + Optional, + Set, + Tuple, +) + +import numpy as np +from dargs import ( + Argument, + ArgumentEncoder, + Variant, + dargs, +) +from dflow.python import ( + OP, + OPIO, + Artifact, + BigParameter, + FatalError, + HDF5Datasets, + OPIOSign, + TransientError, +) + +from dpgen2.constants import ( + lmp_conf_name, + lmp_input_name, + lmp_log_name, + lmp_model_devi_name, + lmp_traj_name, + model_name_match_pattern, + model_name_pattern, + plm_output_name, + pytorch_model_name_pattern, +) +from dpgen2.utils import ( + BinaryFileInput, + set_directory, +) +from dpgen2.utils.run_command import ( + run_command, +) + + +class RunNvNMD(OP): + r"""Execute a LAMMPS task. + + A working directory named `task_name` is created. All input files + are copied or symbol linked to directory `task_name`. The LAMMPS + command is exectuted from directory `task_name`. The trajectory + and the model deviation will be stored in files `op["traj"]` and + `op["model_devi"]`, respectively. + + """ + + @classmethod + def get_input_sign(cls): + return OPIOSign( + { + "config": BigParameter(dict), + "task_name": BigParameter(str), + "task_path": Artifact(Path), + "models": Artifact(List[Path]), + } + ) + + @classmethod + def get_output_sign(cls): + return OPIOSign( + { + "log": Artifact(Path), + "traj": Artifact(Path), + "model_devi": Artifact(Path), + "plm_output": Artifact(Path, optional=True), + "optional_output": Artifact(Path, optional=True), + } + ) + + @OP.exec_sign_check + def execute( + self, + ip: OPIO, + ) -> OPIO: + r"""Execute the OP. 
+ + Parameters + ---------- + ip : dict + Input dict with components: + + - `config`: (`dict`) The config of lmp task. Check `RunNvNMD.lmp_args` for definitions. + - `task_name`: (`str`) The name of the task. + - `task_path`: (`Artifact(Path)`) The path that contains all input files prepareed by `PrepLmp`. + - `models`: (`Artifact(List[Path])`) The frozen model to estimate the model deviation. The first model with be used to drive molecular dynamics simulation. + + Returns + ------- + Any + Output dict with components: + - `log`: (`Artifact(Path)`) The log file of LAMMPS. + - `traj`: (`Artifact(Path)`) The output trajectory. + - `model_devi`: (`Artifact(Path)`) The model deviation. The order of recorded model deviations should be consistent with the order of frames in `traj`. + + Raises + ------ + TransientError + On the failure of LAMMPS execution. Handle different failure cases? e.g. loss atoms. + """ + config = ip["config"] if ip["config"] is not None else {} + config = RunNvNMD.normalize_config(config) + command = config["command"] + teacher_model: Optional[BinaryFileInput] = config["teacher_model_path"] + shuffle_models: Optional[bool] = config["shuffle_models"] + task_name = ip["task_name"] + task_path = ip["task_path"] + models = ip["models"] + # input_files = [lmp_conf_name, lmp_input_name] + # input_files = [(Path(task_path) / ii).resolve() for ii in input_files] + input_files = [ii.resolve() for ii in Path(task_path).iterdir()] + model_files = [Path(ii).resolve() for ii in models] + work_dir = Path(task_name) + + if teacher_model is not None: + assert ( + len(model_files) == 1 + ), "One model is enough in knowledge distillation" + ext = os.path.splitext(teacher_model.file_name)[-1] + teacher_model_file = "teacher_model" + ext + teacher_model.save_as_file(teacher_model_file) + model_files = [Path(teacher_model_file).resolve()] + model_files + + with set_directory(work_dir): + # link input files + for ii in input_files: + iname = ii.name + try: + 
Path(iname).symlink_to(ii) + except: + logging.warning( + "failed to link %s, maybe already linked" % iname + ) + pass + # link models + model_names = [] + for idx, mm in enumerate(model_files): + ext = os.path.splitext(mm)[-1] + if ext == ".pb": + mname = model_name_pattern % (idx) + try: + Path(mname).symlink_to(mm) + except: + logging.warning( + "failed to link %s, maybe already linked" % mname + ) + pass + + elif ext == ".pt": + # freeze model + mname = pytorch_model_name_pattern % (idx) + freeze_model(mm, mname, config.get("model_frozen_head")) + else: + raise RuntimeError( + "Model file with extension '%s' is not supported" % ext + ) + model_names.append(mname) + + if shuffle_models: + random.shuffle(model_names) + + set_models(lmp_input_name, model_names) + + # run lmp + calc_model_devi_command = ["python /mnt/nvnmd/input/ljh/calc_model_devi.py", "cp %s.0 %s"%(lmp_traj_name, lmp_traj_name)]  # FIXME: hard-coded developer-local script path; ship calc_model_devi with the package + commands = " ; ".join([" ".join( + ["cp", model_name, "model.pb", "&&", command, "-i", lmp_input_name, "-log", lmp_log_name, "-v", "rerun", "%d"%i, "&&", "cp", lmp_traj_name, lmp_traj_name+".%d"%i]) + for i, model_name in enumerate(model_names)] + calc_model_devi_command) + ret, out, err = run_command(commands, shell=True) + if ret != 0: + logging.error( + "".join( + ( + "lmp failed\n", + "command was: ", + commands, + "out msg: ", + out, + "\n", + "err msg: ", + err, + "\n", + ) + ) + ) + raise TransientError("lmp failed") + + ele_temp = None + if config.get("use_ele_temp", 0): + ele_temp = get_ele_temp(lmp_log_name) + if ele_temp is not None: + data = { + "ele_temp": ele_temp, + } + with open("job.json", "w") as f: + json.dump(data, f, indent=4) + merge_pimd_files() + + ret_dict = { + "log": work_dir / lmp_log_name, + "traj": work_dir / lmp_traj_name, + "model_devi": self.get_model_devi(work_dir / lmp_model_devi_name), + } + plm_output = ( + {"plm_output": work_dir / plm_output_name} + if (work_dir / plm_output_name).is_file() + else {} + ) + 
ret_dict.update(plm_output) + if ele_temp is not None: + ret_dict["optional_output"] = work_dir / "job.json" + + return OPIO(ret_dict) + + def get_model_devi(self, model_devi_file): + return model_devi_file + + @staticmethod + def lmp_args(): + doc_lmp_cmd = "The command of LAMMPS" + doc_teacher_model = "The teacher model in `Knowledge Distillation`" + doc_shuffle_models = "Randomly pick a model from the group of models to drive theexploration MD simulation" + doc_head = "Select a head from multitask" + doc_use_ele_temp = "Whether to use electronic temperature, 0 for no, 1 for frame temperature, and 2 for atomic temperature" + doc_use_hdf5 = "Use HDF5 to store trajs and model_devis" + return [ + Argument("command", str, optional=True, default="lmp", doc=doc_lmp_cmd), + Argument( + "teacher_model_path", + [BinaryFileInput, str], + optional=True, + default=None, + doc=doc_teacher_model, + ), + Argument( + "shuffle_models", + bool, + optional=True, + default=False, + doc=doc_shuffle_models, + ), + Argument("head", str, optional=True, default=None, doc=doc_head), + Argument( + "use_ele_temp", int, optional=True, default=0, doc=doc_use_ele_temp + ), + Argument( + "model_frozen_head", str, optional=True, default=None, doc=doc_head + ), + Argument( + "use_hdf5", + bool, + optional=True, + default=False, + doc=doc_use_hdf5, + ), + ] + + @staticmethod + def normalize_config(data={}): + ta = RunNvNMD.lmp_args() + base = Argument("base", dict, ta) + data = base.normalize_value(data, trim_pattern="_*") + base.check_value(data, strict=True) + return data + + +config_args = RunNvNMD.lmp_args + + +def set_models(lmp_input_name: str, model_names: List[str]): + with open(lmp_input_name, encoding="utf8") as f: + lmp_input_lines = f.readlines() + + idx = find_only_one_key( + lmp_input_lines, ["pair_style", "deepmd"], raise_not_found=False + ) + if idx is None: + return + new_line_split = lmp_input_lines[idx].split() + match_first = -1 + match_last = -1 + pattern = 
model_name_match_pattern + for sidx, ii in enumerate(new_line_split): + if re.fullmatch(pattern, ii) is not None: + if match_first == -1: + match_first = sidx + else: + if match_first != -1: + match_last = sidx + break + if match_first == -1: + raise RuntimeError( + f"cannot file model pattern {pattern} in line " f" {lmp_input_lines[idx]}" + ) + if match_last == -1: + raise RuntimeError(f"last matching index should not be -1, terribly wrong ") + for ii in range(match_last, len(new_line_split)): + if re.fullmatch(pattern, new_line_split[ii]) is not None: + raise RuntimeError( + f"unexpected matching of model pattern {pattern} " + f"in line {lmp_input_lines[idx]}" + ) + new_line_split[match_first:match_last] = model_names + lmp_input_lines[idx] = " ".join(new_line_split) + "\n" + + with open(lmp_input_name, "w", encoding="utf8") as f: + f.write("".join(lmp_input_lines)) + + +def find_only_one_key(lmp_lines, key, raise_not_found=True): + found = [] + for idx in range(len(lmp_lines)): + words = lmp_lines[idx].split() + nkey = len(key) + if len(words) >= nkey and words[:nkey] == key: + found.append(idx) + if len(found) > 1: + raise RuntimeError("found %d keywords %s" % (len(found), key)) + if len(found) == 0: + if raise_not_found: + raise RuntimeError("failed to find keyword %s" % (key)) + else: + return None + return found[0] + + +def get_ele_temp(lmp_log_name): + with open(lmp_log_name, encoding="utf8") as f: + lmp_log_lines = f.readlines() + + for line in lmp_log_lines: + fields = line.split() + if fields[:2] == ["pair_style", "deepmd"]: + if "fparam" in fields: + # for rendering variables + try: + return float(fields[fields.index("fparam") + 1]) + except Exception: + pass + if "aparam" in fields: + try: + return float(fields[fields.index("aparam") + 1]) + except Exception: + pass + + return None + + +def freeze_model(input_model, frozen_model, head=None): + freeze_args = "-o %s" % frozen_model + if head is not None: + freeze_args += " --head %s" % head + freeze_cmd 
= "dp --pt freeze -c %s %s" % (input_model, freeze_args) + ret, out, err = run_command(freeze_cmd, shell=True) + if ret != 0: + logging.error( + "".join( + ( + "freeze failed\n", + "command was", + freeze_cmd, + "out msg", + out, + "\n", + "err msg", + err, + "\n", + ) + ) + ) + raise TransientError("freeze failed") + + +def merge_pimd_files(): + traj_files = glob.glob("traj.*.dump") + if len(traj_files) > 0: + with open(lmp_traj_name, "w") as f: + for traj_file in sorted(traj_files): + with open(traj_file, "r") as f2: + f.write(f2.read()) + model_devi_files = glob.glob("model_devi.*.out") + if len(model_devi_files) > 0: + with open(lmp_model_devi_name, "w") as f: + for model_devi_file in sorted(model_devi_files): + with open(model_devi_file, "r") as f2: + f.write(f2.read()) diff --git a/dpgen2/op/run_nvnmd_train.py b/dpgen2/op/run_nvnmd_train.py new file mode 100644 index 00000000..af0f395c --- /dev/null +++ b/dpgen2/op/run_nvnmd_train.py @@ -0,0 +1,693 @@ +import glob +import json +import logging +import os +import shutil +import copy +from pathlib import ( + Path, +) +from typing import ( + Dict, + List, + Optional, + Tuple, + Union, +) + +import dpdata +from dargs import ( + Argument, + ArgumentEncoder, + Variant, + dargs, +) +from dflow.python import ( + OP, + OPIO, + Artifact, + BigParameter, + FatalError, + NestedDict, + OPIOSign, + Parameter, + TransientError, +) + +from dpgen2.constants import ( + train_script_name, + train_cnn_script_name, + train_qnn_script_name, + train_task_pattern, +) +from dpgen2.utils.chdir import ( + set_directory, +) +from dpgen2.utils.run_command import ( + run_command, +) + + +def _make_train_command( + dp_command, + train_script_name, + impl, + do_init_model, + init_model, + finetune_mode, + finetune_args, + init_model_with_finetune, + train_args="", +): + + # find checkpoint + if impl == "tensorflow" and os.path.isfile("nvnmd_cnn/checkpoint") and not os.path.isfile("nvnmd_cnn/frozen_model.pb"): + checkpoint = 
"nvnmd_cnn/model.ckpt" + else: + checkpoint = None + # case of restart + if checkpoint is not None: + command = dp_command + ["train-nvnmd", "--restart", checkpoint, train_script_name] + return command + # case of init model and finetune + assert checkpoint is None + case_init_model = do_init_model # and (not init_model_with_finetune) + # nvnmd-train do not support initial model + if case_init_model: + # not support initial frozen model in nvnmd + #init_flag = "--init-frz-model" if impl == "tensorflow" else "--init-model" + init_flag = "--init-model" + for i in init_model: + shutil.copy(i, "./") + + command = dp_command + [ + "train-nvnmd", + init_flag, + "model.ckpt", + train_script_name, + ] + else: + command = dp_command + ["train-nvnmd", train_script_name] + + command += train_args.split() + print(command) + return command + + +class RunNvNMDTrain(OP): + r"""Execute a DP training task. Train and freeze a DP model. + + A working directory named `task_name` is created. All input files + are copied or symbol linked to directory `task_name`. The + DeePMD-kit training and freezing commands are exectuted from + directory `task_name`. 
+ + """ + + default_optional_parameter = { + "mixed_type": False, + "finetune_mode": "no", + } + + @classmethod + def get_input_sign(cls): + return OPIOSign( + { + "config": dict, + "task_name": BigParameter(str), + "optional_parameter": Parameter( + dict, + default=RunNvNMDTrain.default_optional_parameter, + ), + "task_path": Artifact(Path), + "init_model": Artifact(Path, optional=True), + "init_model_ckpt_meta": Artifact(Path, optional=True), + "init_model_ckpt_data": Artifact(Path, optional=True), + "init_model_ckpt_index": Artifact(Path, optional=True), + "init_data": Artifact(NestedDict[Path]), + "iter_data": Artifact(List[Path]), + "valid_data": Artifact(NestedDict[Path], optional=True), + "optional_files": Artifact(List[Path], optional=True), + } + ) + + @classmethod + def get_output_sign(cls): + return OPIOSign( + { + "script": Artifact(Path), + "cnn_model": Artifact(Path), + "qnn_model": Artifact(Path), + "model_ckpt_data": Artifact(Path), + "model_ckpt_meta": Artifact(Path), + "model_ckpt_index": Artifact(Path), + "lcurve": Artifact(Path), + "log": Artifact(Path), + } + ) + + @OP.exec_sign_check + def execute( + self, + ip: OPIO, + ) -> OPIO: + r"""Execute the OP. + + Parameters + ---------- + ip : dict + Input dict with components: + + - `config`: (`dict`) The config of training task. Check `RunNvNMDTrain.training_args` for definitions. + - `task_name`: (`str`) The name of training task. + - `task_path`: (`Artifact(Path)`) The path that contains all input files prepareed by `PrepDPTrain`. + - `init_model`: (`Artifact(Path)`) A frozen model to initialize the training. + - `init_data`: (`Artifact(NestedDict[Path])`) Initial training data. + - `iter_data`: (`Artifact(List[Path])`) Training data generated in the DPGEN iterations. + + Returns + ------- + Any + Output dict with components: + - `script`: (`Artifact(Path)`) The training script. + - `model`: (`Artifact(Path)`) The trained frozen model. + - `lcurve`: (`Artifact(Path)`) The learning curve file. 
+ - `log`: (`Artifact(Path)`) The log file of training. + + Raises + ------ + FatalError + On the failure of training or freezing. Human intervention needed. + """ + mixed_type = ip["optional_parameter"]["mixed_type"] + finetune_mode = ip["optional_parameter"]["finetune_mode"] + config = ip["config"] if ip["config"] is not None else {} + impl = ip["config"].get("impl", "tensorflow") + dp_command = ip["config"].get("command", "dp").split() + assert impl in ["tensorflow"] + finetune_args = config.get("finetune_args", "") + train_args = config.get("train_args", "") + config = RunNvNMDTrain.normalize_config(config) + task_name = ip["task_name"] + task_path = ip["task_path"] + init_model = ip["init_model"] + init_model_ckpt_data = ip["init_model_ckpt_data"] + init_model_ckpt_meta = ip["init_model_ckpt_meta"] + init_model_ckpt_index = ip["init_model_ckpt_index"] + init_data = ip["init_data"] + iter_data = ip["iter_data"] + valid_data = ip["valid_data"] + iter_data_old_exp = _expand_all_multi_sys_to_sys(iter_data[:-1]) + iter_data_new_exp = _expand_all_multi_sys_to_sys(iter_data[-1:]) + iter_data_exp = iter_data_old_exp + iter_data_new_exp + work_dir = Path(task_name) + init_model_with_finetune = config["init_model_with_finetune"] + + # update the input script + input_script = Path(task_path) / train_script_name + with open(input_script) as fp: + train_dict = json.load(fp) + if "systems" in train_dict["training"]: + major_version = "1" + else: + major_version = "2" + + # auto prob style + init_model = [init_model_ckpt_meta, init_model_ckpt_data, init_model_ckpt_index] if init_model is not None else init_model + do_init_model = RunNvNMDTrain.decide_init_model( + config, + init_model, + init_data, + iter_data, + mixed_type=mixed_type, + ) + auto_prob_str = "prob_sys_size" + if do_init_model: + old_ratio = config["init_model_old_ratio"] + len_init = len(init_data) + numb_old = len_init + len(iter_data_old_exp) + numb_new = numb_old + len(iter_data_new_exp) + auto_prob_str = 
f"prob_sys_size; 0:{numb_old}:{old_ratio}; {numb_old}:{numb_new}:{1.-old_ratio:g}" + + # update the input dict + train_dict = RunNvNMDTrain.write_data_to_input_script( + train_dict, + config, + init_data, + iter_data_exp, + auto_prob_str, + "2", + valid_data, + ) + train_cnn_dict = RunNvNMDTrain.write_other_to_input_script( + train_dict, config, do_init_model, False, "2", + ) + train_qnn_dict = RunNvNMDTrain.write_other_to_input_script( + train_dict, config, do_init_model, True, "2", + ) + + with set_directory(work_dir): + # open log + fplog = open("train.log", "w") + + def clean_before_quit(): + fplog.close() + + # dump train script + with open(train_cnn_script_name, "w") as fp: + json.dump(train_cnn_dict, fp, indent=4) + + with open(train_qnn_script_name, "w") as fp: + json.dump(train_qnn_dict, fp, indent=4) + + if ip["optional_files"] is not None: + for f in ip["optional_files"]: + Path(f.name).symlink_to(f) + + # train cnn model + command = _make_train_command( + dp_command, + train_cnn_script_name, + impl, + do_init_model, + init_model, + finetune_mode, + finetune_args, + init_model_with_finetune, + train_args = "-s s1", + ) + + ret, out, err = run_command(command) + if ret != 0: + clean_before_quit() + logging.error( + "".join( + ( + "dp train failed\n", + "out msg: ", + out, + "\n", + "err msg: ", + err, + "\n", + ) + ) + ) + raise FatalError("dp train failed") + fplog.write("#=================== train std out ===================\n") + fplog.write(out) + fplog.write("#=================== train std err ===================\n") + fplog.write(err) + + #if RunNvNMDTrain.skip_training( + #work_dir, train_dict, init_model, iter_data, finetune_mode + #): + + # train model + command = _make_train_command( + dp_command, + train_qnn_script_name, + impl, + do_init_model, + init_model, + finetune_mode, + finetune_args, + init_model_with_finetune, + train_args = "-s s2", + ) + + ret, out, err = run_command(command) + if ret != 0: + clean_before_quit() + logging.error( + 
"".join( + ( + "nvnmd train (cnn) failed\n", + "out msg: ", + out, + "\n", + "err msg: ", + err, + "\n", + ) + ) + ) + raise FatalError("nvnmd train (cnn) failed") + fplog.write("#=================== train std out ===================\n") + fplog.write(out) + fplog.write("#=================== train std err ===================\n") + fplog.write(err) + + if finetune_mode == "finetune" and os.path.exists("input_v2_compat.json"): + shutil.copy2("input_v2_compat.json", train_script_name) + + # freeze model + cnn_model_file = "nvnmd_cnn/frozen_model.pb" + model_ckpt_data_file = "nvnmd_cnn/model.ckpt.data-00000-of-00001" + model_ckpt_index_file = "nvnmd_cnn/model.ckpt.index" + model_ckpt_meta_file = "nvnmd_cnn/model.ckpt.meta" + qnn_model_file = "nvnmd_qnn/model.pb" + lcurve_file = "nvnmd_cnn/lcurve.out" + + clean_before_quit() + + return OPIO( + { + "script": work_dir / train_script_name, + "cnn_model": work_dir / cnn_model_file, + "model_ckpt_data": work_dir / model_ckpt_data_file, + "model_ckpt_meta": work_dir / model_ckpt_meta_file, + "model_ckpt_index": work_dir / model_ckpt_index_file, + "qnn_model": work_dir / qnn_model_file, + "lcurve": work_dir / lcurve_file, + "log": work_dir / "train.log", + } + ) + + @staticmethod + def write_data_to_input_script( + idict: dict, + config, + init_data: Union[List[Path], Dict[str, List[Path]]], + iter_data: List[Path], + auto_prob_str: str = "prob_sys_size", + major_version: str = "2", + valid_data: Optional[Union[List[Path], Dict[str, List[Path]]]] = None, + ): + odict = idict.copy() + + data_list = [str(ii) for ii in init_data] + [str(ii) for ii in iter_data] + if major_version == "1": + # v1 behavior + odict["training"]["systems"] = data_list + odict["training"].setdefault("batch_size", "auto") + odict["training"]["auto_prob_style"] = auto_prob_str + if valid_data is not None: + odict["training"]["validation_data"] = { + "systems": [str(ii) for ii in valid_data], + "batch_size": 1, + } + elif major_version == "2": + # v2 
behavior + odict["training"]["training_data"]["systems"] = data_list + odict["training"]["training_data"].setdefault("batch_size", "auto") + odict["training"]["training_data"]["auto_prob"] = auto_prob_str + if valid_data is None: + odict["training"].pop("validation_data", None) + else: + odict["training"]["validation_data"] = { + "systems": [str(ii) for ii in valid_data], + "batch_size": 1, + } + else: + raise RuntimeError("unsupported DeePMD-kit major version", major_version) + return odict + + @staticmethod + def write_other_to_input_script( + idict, + config, + do_init_model, + train_qnn_model: bool = False, + major_version: str = "1", + ): + odict = copy.deepcopy(idict) + odict["training"]["disp_file"] = "lcurve.out" + odict["training"]["save_ckpt"] = "model.ckpt" + if do_init_model: + odict["learning_rate"]["start_lr"] = config["init_model_start_lr"] + if "loss_dict" in odict: + for v in odict["loss_dict"].values(): + if isinstance(v, dict): + v["start_pref_e"] = config["init_model_start_pref_e"] + v["start_pref_f"] = config["init_model_start_pref_f"] + v["start_pref_v"] = config["init_model_start_pref_v"] + else: + odict["loss"]["start_pref_e"] = config["init_model_start_pref_e"] + odict["loss"]["start_pref_f"] = config["init_model_start_pref_f"] + odict["loss"]["start_pref_v"] = config["init_model_start_pref_v"] + if major_version == "1": + odict["training"]["stop_batch"] = config["init_model_numb_steps"] + elif major_version == "2": + odict["training"]["numb_steps"] = config["init_model_numb_steps"] + else: + raise RuntimeError( + "unsupported DeePMD-kit major version", major_version + ) + if train_qnn_model: + odict["learning_rate"]["start_lr"] = config["init_model_start_lr"] + if "loss_dict" in odict: + for v in odict["loss_dict"].values(): + if isinstance(v, dict): + v["start_pref_e"] = 1 + v["start_pref_f"] = 1 + v["start_pref_v"] = 1 + odict["learning_rate"]["start_lr"] = odict["learning_rate"]["stop_lr"] + if major_version == "1": + 
odict["training"]["stop_batch"] = 0 + elif major_version == "2": + odict["training"]["numb_steps"] = 0 + return odict + + @staticmethod + def skip_training( + work_dir, + train_dict, + init_model, + iter_data, + finetune_mode, + ): + # do not skip if we do finetuning + #if finetune_mode is not None and finetune_mode == "finetune": + # return False + # we have init model and no iter data, skip training + if (init_model is not None) and (iter_data is None or len(iter_data) == 0): + with set_directory(work_dir): + with open(train_script_name, "w") as fp: + json.dump(train_dict, fp, indent=4) + Path("train.log").write_text( + f"We have init model {init_model} and " + f"no iteration training data. " + f"The training is skipped.\n" + ) + Path("lcurve.out").touch() + return True + else: + return False + + @staticmethod + def decide_init_model( + config, + init_model, + init_data, + iter_data, + mixed_type=False, + ): + do_init_model = False + # decide if we do init-model + ## cases we do definitely not + if init_model is None or iter_data is None or len(iter_data) == 0: + do_init_model = False + ## cases controlled by the policy + else: + if config["init_model_policy"] == "no": + do_init_model = False + elif config["init_model_policy"] == "yes": + do_init_model = True + elif "old_data_larger_than" in config["init_model_policy"]: + old_data_size_level = int(config["init_model_policy"].split(":")[-1]) + if isinstance(init_data, dict): + init_data_size = _get_data_size_of_all_systems( + sum(init_data.values(), []) + ) + else: + init_data_size = _get_data_size_of_all_systems(init_data) + iter_data_old_size = _get_data_size_of_all_mult_sys( + iter_data[:-1], mixed_type=mixed_type + ) + old_data_size = init_data_size + iter_data_old_size + if old_data_size > old_data_size_level: + do_init_model = True + return do_init_model + + @staticmethod + def training_args(): + doc_command = "The command for DP, 'dp' for default" + doc_impl = "The implementation/backend of DP. 
It can be 'tensorflow' or 'pytorch'. 'tensorflow' for default." + doc_init_model_policy = "The policy of init-model training. It can be\n\n\ + - 'no': No init-model training. Traing from scratch.\n\n\ + - 'yes': Do init-model training.\n\n\ + - 'old_data_larger_than:XXX': Do init-model if the training data size of the previous model is larger than XXX. XXX is an int number." + doc_init_model_old_ratio = "The frequency ratio of old data over new data" + doc_init_model_numb_steps = "The number of training steps when init-model" + doc_init_model_start_lr = "The start learning rate when init-model" + doc_init_model_start_pref_e = ( + "The start energy prefactor in loss when init-model" + ) + doc_init_model_start_pref_f = ( + "The start force prefactor in loss when init-model" + ) + doc_init_model_start_pref_v = ( + "The start virial prefactor in loss when init-model" + ) + doc_finetune_args = "Extra arguments for finetuning" + doc_multitask = "Do multitask training" + doc_head = "Head to use in the multitask training" + doc_init_model_with_finetune = "Use finetune for init model" + doc_train_args = "Extra arguments for dp train" + return [ + Argument( + "command", + str, + optional=True, + default="dp", + doc=doc_command, + ), + Argument( + "impl", + str, + optional=True, + default="tensorflow", + doc=doc_impl, + alias=["backend"], + ), + Argument( + "init_model_policy", + str, + optional=True, + default="no", + doc=doc_init_model_policy, + ), + Argument( + "init_model_old_ratio", + float, + optional=True, + default=0.9, + doc=doc_init_model_old_ratio, + ), + Argument( + "init_model_numb_steps", + int, + optional=True, + default=400000, + doc=doc_init_model_numb_steps, + alias=["init_model_stop_batch"], + ), + Argument( + "init_model_start_lr", + float, + optional=True, + default=1e-4, + doc=doc_init_model_start_lr, + ), + Argument( + "init_model_start_pref_e", + float, + optional=True, + default=0.1, + doc=doc_init_model_start_pref_e, + ), + Argument( + 
"init_model_start_pref_f", + float, + optional=True, + default=100, + doc=doc_init_model_start_pref_f, + ), + Argument( + "init_model_start_pref_v", + float, + optional=True, + default=0.0, + doc=doc_init_model_start_pref_v, + ), + Argument( + "init_model_with_finetune", + bool, + optional=True, + default=False, + doc=doc_init_model_with_finetune, + ), + Argument( + "finetune_args", + str, + optional=True, + default="", + doc=doc_finetune_args, + ), + Argument( + "multitask", + bool, + optional=True, + default=False, + doc=doc_multitask, + ), + Argument( + "head", + str, + optional=True, + default=None, + doc=doc_head, + ), + Argument( + "train_args", + str, + optional=True, + default="", + doc=doc_train_args, + ), + ] + + @staticmethod + def normalize_config(data={}): + ta = RunNvNMDTrain.training_args() + + base = Argument("base", dict, ta) + data = base.normalize_value(data, trim_pattern="_*") + base.check_value(data, strict=True) + + return data + + +def _get_data_size_of_system(data_dir): + ss = dpdata.System(data_dir, fmt="deepmd/npy") + return ss.get_nframes() + + +def _get_data_size_of_all_systems(data_dirs): + count = 0 + for ii in data_dirs: + count += _get_data_size_of_system(ii) + return count + + +def _get_data_size_of_mult_sys(data_dir, mixed_type=False): + ms = dpdata.MultiSystems() + if mixed_type: + ms.from_deepmd_npy_mixed(data_dir) # type: ignore + else: + ms.from_deepmd_npy(data_dir) # type: ignore + return ms.get_nframes() + + +def _get_data_size_of_all_mult_sys(data_dirs, mixed_type=False): + count = 0 + for ii in data_dirs: + count += _get_data_size_of_mult_sys(ii, mixed_type) + return count + + +def _expand_multi_sys_to_sys(multi_sys_dir): + all_type_raws = sorted(glob.glob(os.path.join(multi_sys_dir, "*", "type.raw"))) + all_sys_dirs = [str(Path(ii).parent) for ii in all_type_raws] + return all_sys_dirs + + +def _expand_all_multi_sys_to_sys(list_multi_sys): + all_sys_dirs = [] + for ii in list_multi_sys: + all_sys_dirs = all_sys_dirs + 
_expand_multi_sys_to_sys(ii) + return all_sys_dirs + + +config_args = RunNvNMDTrain.training_args diff --git a/dpgen2/superop/__init__.py b/dpgen2/superop/__init__.py index 0223605f..cfddabdd 100644 --- a/dpgen2/superop/__init__.py +++ b/dpgen2/superop/__init__.py @@ -10,6 +10,9 @@ from .prep_run_dp_train import ( PrepRunDPTrain, ) +from .prep_run_nvnmd_train import ( + PrepRunNvNMDTrain, +) from .prep_run_fp import ( PrepRunFp, ) diff --git a/dpgen2/superop/block.py b/dpgen2/superop/block.py index 0e39ab38..d9d12932 100644 --- a/dpgen2/superop/block.py +++ b/dpgen2/superop/block.py @@ -113,6 +113,9 @@ def __init__( } self._input_artifacts = { "init_models": InputArtifact(optional=True), + "init_models_ckpt_index": InputArtifact(optional=True), + "init_models_ckpt_data": InputArtifact(optional=True), + "init_models_ckpt_meta": InputArtifact(optional=True), "init_data": InputArtifact(), "iter_data": InputArtifact(), } @@ -121,6 +124,9 @@ def __init__( } self._output_artifacts = { "models": OutputArtifact(), + "models_ckpt_index": OutputArtifact(), + "models_ckpt_data": OutputArtifact(), + "models_ckpt_meta": OutputArtifact(), "iter_data": OutputArtifact(), "trajs": OutputArtifact(), } @@ -224,6 +230,9 @@ def _block_cl( }, artifacts={ "init_models": block_steps.inputs.artifacts["init_models"], + "init_models_ckpt_data": block_steps.inputs.artifacts["init_models_ckpt_data"], + "init_models_ckpt_index": block_steps.inputs.artifacts["init_models_ckpt_index"], + "init_models_ckpt_meta": block_steps.inputs.artifacts["init_models_ckpt_meta"], "init_data": block_steps.inputs.artifacts["init_data"], "iter_data": block_steps.inputs.artifacts["iter_data"], }, @@ -243,7 +252,7 @@ def _block_cl( "type_map": block_steps.inputs.parameters["type_map"], }, artifacts={ - "models": prep_run_dp_train.outputs.artifacts["models"], + "models": prep_run_dp_train.outputs.artifacts["nvnmodels"], }, key="--".join( ["%s" % block_steps.inputs.parameters["block_id"], "prep-run-explore"] @@ 
-322,6 +331,15 @@ def _block_cl( block_steps.outputs.artifacts["models"]._from = prep_run_dp_train.outputs.artifacts[ "models" ] + block_steps.outputs.artifacts["models_ckpt_meta"]._from = prep_run_dp_train.outputs.artifacts[ + "models_ckpt_meta" + ] + block_steps.outputs.artifacts["models_ckpt_data"]._from = prep_run_dp_train.outputs.artifacts[ + "models_ckpt_data" + ] + block_steps.outputs.artifacts["models_ckpt_index"]._from = prep_run_dp_train.outputs.artifacts[ + "models_ckpt_index" + ] block_steps.outputs.artifacts["iter_data"]._from = collect_data.outputs.artifacts[ "iter_data" ] diff --git a/dpgen2/superop/prep_run_nvnmd_train.py b/dpgen2/superop/prep_run_nvnmd_train.py new file mode 100644 index 00000000..9fc8ea70 --- /dev/null +++ b/dpgen2/superop/prep_run_nvnmd_train.py @@ -0,0 +1,255 @@ +import json +import os +from copy import ( + deepcopy, +) +from pathlib import ( + Path, +) +from typing import ( + List, + Optional, + Set, + Type, +) + +from dflow import ( + InputArtifact, + InputParameter, + Inputs, + OutputArtifact, + OutputParameter, + Outputs, + S3Artifact, + Step, + Steps, + Workflow, + argo_len, + argo_range, + argo_sequence, + download_artifact, + upload_artifact, +) +from dflow.python import ( + OP, + OPIO, + Artifact, + BigParameter, + OPIOSign, + PythonOPTemplate, + Slices, +) + +from dpgen2.constants import ( + train_index_pattern, + train_script_name, + train_task_pattern, +) +from dpgen2.op import ( + RunNvNMDTrain, +) +from dpgen2.utils.step_config import ( + init_executor, +) +from dpgen2.utils.step_config import normalize as normalize_step_dict + + +class PrepRunNvNMDTrain(Steps): + def __init__( + self, + name: str, + prep_train_op: Type[OP], + run_train_op: Type[RunNvNMDTrain], + prep_config: Optional[dict] = None, + run_config: Optional[dict] = None, + upload_python_packages: Optional[List[os.PathLike]] = None, + valid_data: Optional[S3Artifact] = None, + optional_files: Optional[List[str]] = None, + ): + prep_config = 
normalize_step_dict({}) if prep_config is None else prep_config + run_config = normalize_step_dict({}) if run_config is None else run_config + self._input_parameters = { + "block_id": InputParameter(type=str, value=""), + "numb_models": InputParameter(type=int), + "template_script": InputParameter(), + "train_config": InputParameter(), + "run_optional_parameter": InputParameter( + type=dict, value=run_train_op.default_optional_parameter + ), + } + self._input_artifacts = { + "init_models": InputArtifact(optional=True), + "init_models_ckpt_data": InputArtifact(optional=True), + "init_models_ckpt_index": InputArtifact(optional=True), + "init_models_ckpt_meta": InputArtifact(optional=True), + "init_data": InputArtifact(), + "iter_data": InputArtifact(), + } + self._output_parameters = { + "template_script": OutputParameter(), + } + self._output_artifacts = { + "scripts": OutputArtifact(), + "models": OutputArtifact(), + "nvnmodels": OutputArtifact(), + "models_ckpt_meta": OutputArtifact(), + "models_ckpt_data": OutputArtifact(), + "models_ckpt_index": OutputArtifact(), + "logs": OutputArtifact(), + "lcurves": OutputArtifact(), + } + + super().__init__( + name=name, + inputs=Inputs( + parameters=self._input_parameters, + artifacts=self._input_artifacts, + ), + outputs=Outputs( + parameters=self._output_parameters, + artifacts=self._output_artifacts, + ), + ) + + self._keys = ["prep-train", "run-train"] + self.step_keys = {} + ii = "prep-train" + self.step_keys[ii] = "--".join(["%s" % self.inputs.parameters["block_id"], ii]) + ii = "run-train" + self.step_keys[ii] = "--".join( + ["%s" % self.inputs.parameters["block_id"], ii + "-{{item}}"] + ) + + self = _prep_run_nvnmd_train( + self, + self.step_keys, + prep_train_op, + run_train_op, + prep_config=prep_config, + run_config=run_config, + upload_python_packages=upload_python_packages, + valid_data=valid_data, + optional_files=optional_files, + ) + + @property + def input_parameters(self): + return self._input_parameters 
+ + @property + def input_artifacts(self): + return self._input_artifacts + + @property + def output_parameters(self): + return self._output_parameters + + @property + def output_artifacts(self): + return self._output_artifacts + + @property + def keys(self): + return self._keys + + +def _prep_run_nvnmd_train( + train_steps, + step_keys, + prep_train_op: Type[OP], + run_train_op: Type[RunNvNMDTrain], + prep_config: dict = normalize_step_dict({}), + run_config: dict = normalize_step_dict({}), + upload_python_packages: Optional[List[os.PathLike]] = None, + valid_data: Optional[S3Artifact] = None, + optional_files: Optional[List[str]] = None, +): + prep_config = deepcopy(prep_config) + run_config = deepcopy(run_config) + prep_template_config = prep_config.pop("template_config") + run_template_config = run_config.pop("template_config") + prep_executor = init_executor(prep_config.pop("executor")) + run_executor = init_executor(run_config.pop("executor")) + template_slice_config = run_config.pop("template_slice_config", {}) + + prep_train = Step( + "prep-train", + template=PythonOPTemplate( + prep_train_op, + output_artifact_archive={"task_paths": None}, + python_packages=upload_python_packages, + **prep_template_config, + ), + parameters={ + "numb_models": train_steps.inputs.parameters["numb_models"], + "template_script": train_steps.inputs.parameters["template_script"], + }, + artifacts={}, + key=step_keys["prep-train"], + executor=prep_executor, + **prep_config, + ) + train_steps.add(prep_train) + + run_train = Step( + "run-train", + template=PythonOPTemplate( + run_train_op, + slices=Slices( + "int('{{item}}')", + input_parameter=["task_name"], + input_artifact=["task_path", "init_model", "init_model_ckpt_meta", "init_model_ckpt_data", "init_model_ckpt_index"], + output_artifact=["cnn_model", "qnn_model", "model_ckpt_data", "model_ckpt_meta", "model_ckpt_index", "lcurve", "log", "script"], + **template_slice_config, + ), + python_packages=upload_python_packages, + 
**run_template_config, + ), + parameters={ + "config": train_steps.inputs.parameters["train_config"], + "task_name": prep_train.outputs.parameters["task_names"], + "optional_parameter": train_steps.inputs.parameters[ + "run_optional_parameter" + ], + }, + artifacts={ + "task_path": prep_train.outputs.artifacts["task_paths"], + "init_model": train_steps.inputs.artifacts["init_models"], + "init_model_ckpt_meta": train_steps.inputs.artifacts["init_models_ckpt_meta"], + "init_model_ckpt_data": train_steps.inputs.artifacts["init_models_ckpt_data"], + "init_model_ckpt_index": train_steps.inputs.artifacts["init_models_ckpt_index"], + "init_data": train_steps.inputs.artifacts["init_data"], + "iter_data": train_steps.inputs.artifacts["iter_data"], + "valid_data": valid_data, + "optional_files": upload_artifact(optional_files) + if optional_files is not None + else None, + }, + with_sequence=argo_sequence( + argo_len(prep_train.outputs.parameters["task_names"]), + format=train_index_pattern, + ), + # with_param=argo_range(train_steps.inputs.parameters["numb_models"]), + key=step_keys["run-train"], + executor=run_executor, + **run_config, + ) + train_steps.add(run_train) + + train_steps.outputs.parameters[ + "template_script" + ].value_from_parameter = train_steps.inputs.parameters["template_script"] + train_steps.outputs.artifacts["scripts"]._from = run_train.outputs.artifacts[ + "script" + ] + train_steps.outputs.artifacts["models"]._from = run_train.outputs.artifacts["cnn_model"] + train_steps.outputs.artifacts["nvnmodels"]._from = run_train.outputs.artifacts["qnn_model"] + train_steps.outputs.artifacts["models_ckpt_meta"]._from = run_train.outputs.artifacts["model_ckpt_meta"] + train_steps.outputs.artifacts["models_ckpt_data"]._from = run_train.outputs.artifacts["model_ckpt_data"] + train_steps.outputs.artifacts["models_ckpt_index"]._from = run_train.outputs.artifacts["model_ckpt_index"] + train_steps.outputs.artifacts["logs"]._from = run_train.outputs.artifacts["log"] + 
train_steps.outputs.artifacts["lcurves"]._from = run_train.outputs.artifacts[ + "lcurve" + ] + + return train_steps diff --git a/dpgen2/utils/download_dpgen2_artifacts.py b/dpgen2/utils/download_dpgen2_artifacts.py index b5f69153..db4b7e6c 100644 --- a/dpgen2/utils/download_dpgen2_artifacts.py +++ b/dpgen2/utils/download_dpgen2_artifacts.py @@ -54,10 +54,16 @@ def add_output( op_download_setting = { "prep-run-train": DownloadDefinition() .add_input("init_models") + .add_input("init_models_ckpt_meta") + .add_input("init_models_ckpt_data") + .add_input("init_models_ckpt_index") .add_input("init_data") .add_input("iter_data") .add_output("scripts") .add_output("models") + .add_output("models_ckpt_meta") + .add_output("models_ckpt_data") + .add_output("models_ckpt_index") .add_output("logs") .add_output("lcurves"), "prep-run-explore": DownloadDefinition() From 7f606867e8295553d11f91bd970155f040f4a8b8 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Mon, 28 Apr 2025 10:19:37 +0800 Subject: [PATCH 02/49] fix run nvnmd --- dpgen2/exploration/task/lmp_template_task_group.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpgen2/exploration/task/lmp_template_task_group.py b/dpgen2/exploration/task/lmp_template_task_group.py index de97b05f..f64f973e 100644 --- a/dpgen2/exploration/task/lmp_template_task_group.py +++ b/dpgen2/exploration/task/lmp_template_task_group.py @@ -206,7 +206,7 @@ def revise_lmp_input_dump(lmp_lines, trj_freq, pimd_bead=None,nvnmd_version=None else: lmp_lines[ idx - ] = f"dump dpgen_dump all custom ${trj_freq} ${lmp_traj_file_name} id type x y z fx fy fz" + ] = f"dump dpgen_dump all custom {trj_freq} {lmp_traj_file_name} id type x y z fx fy fz" lmp_lines.insert( idx+1, 'if \"${rerun} > 0\" then \"jump SELF rerun' @@ -233,7 +233,7 @@ def revise_lmp_input_rerun(lmp_lines): f'rerun rerun {lmp_traj_name}.0 dump x y z fx fy fz' ) lmp_lines.append( - 'labal end' + 'label end' ) return lmp_lines From 
5255d2e4b9c6509fbf74c0331f7d7faf557acf7f Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Mon, 28 Apr 2025 10:53:11 +0800 Subject: [PATCH 03/49] fix rerun command in nvnmd if number of atoms in dump is changing. --- dpgen2/exploration/task/lmp/lmp_input.py | 2 +- dpgen2/exploration/task/lmp_template_task_group.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dpgen2/exploration/task/lmp/lmp_input.py b/dpgen2/exploration/task/lmp/lmp_input.py index cb42f214..7b120659 100644 --- a/dpgen2/exploration/task/lmp/lmp_input.py +++ b/dpgen2/exploration/task/lmp/lmp_input.py @@ -203,6 +203,6 @@ def make_lmp_input( if(nvnmd_version is not None): ret += 'jump SELF end\n' ret += 'label rerun\n' - ret += 'rerun %s.0 dump x y z fx fy fz\n' % lmp_traj_file_name + ret += 'rerun %s.0 dump x y z fx fy fz add yes\n' % lmp_traj_file_name ret += 'label end\n' return ret diff --git a/dpgen2/exploration/task/lmp_template_task_group.py b/dpgen2/exploration/task/lmp_template_task_group.py index f64f973e..2b51c6a3 100644 --- a/dpgen2/exploration/task/lmp_template_task_group.py +++ b/dpgen2/exploration/task/lmp_template_task_group.py @@ -230,7 +230,7 @@ def revise_lmp_input_rerun(lmp_lines): 'label rerun' ) lmp_lines.append( - f'rerun rerun {lmp_traj_name}.0 dump x y z fx fy fz' + f'rerun rerun {lmp_traj_name}.0 dump x y z fx fy fz add yes' ) lmp_lines.append( 'label end' From 91a66200f3d90a6235eda1338f85c91250f150da Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Tue, 29 Apr 2025 17:59:18 +0800 Subject: [PATCH 04/49] add model deviation calculation function --- dpgen2/op/run_nvnmd.py | 48 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index 326ef35d..9e891ecc 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -4,6 +4,7 @@ import os import random import re +import itertools from pathlib import ( Path, ) @@ -50,6 +51,10 @@ from 
dpgen2.utils.run_command import ( run_command, ) +from dpgen2.op.run_caly_model_devi import ( + write_model_devi_out, +) +from ase.io import read class RunNvNMD(OP): @@ -180,10 +185,9 @@ def execute( set_models(lmp_input_name, model_names) # run lmp - calc_model_devi_command = ["python /mnt/nvnmd/input/ljh/calc_model_devi.py", "cp %s.0 %s"%(lmp_traj_name, lmp_traj_name)] commands = " ; ".join([" ".join( ["cp", model_name, "model.pb", "&&", command, "-i", lmp_input_name, "-log", lmp_log_name, "-v", "rerun", "%d"%i, "&&", "cp", lmp_traj_name, lmp_traj_name+".%d"%i]) - for i, model_name in enumerate(model_names)] + calc_model_devi_command) + for i, model_name in enumerate(model_names)]) ret, out, err = run_command(commands, shell=True) if ret != 0: logging.error( @@ -213,6 +217,8 @@ def execute( with open("job.json", "w") as f: json.dump(data, f, indent=4) merge_pimd_files() + + calc_model_devi([lmp_traj_name+f".{i}" for i in range(len(model_names))]) ret_dict = { "log": work_dir / lmp_log_name, @@ -402,3 +408,41 @@ def merge_pimd_files(): for model_devi_file in sorted(model_devi_files): with open(model_devi_file, "r") as f2: f.write(f2.read()) + +def calc_model_devi( + traj_files: list[str], + fname: str = "model_devi.out", +): + + trajectories = [] + for f in traj_files: + traj = read(f, format='lammps-dump-text', index=':', order=True) + trajectories.append(traj) + + num_frames = len(trajectories[0]) + for traj in trajectories: + assert len(traj) == num_frames, "Not match" + + devi = [] + for frame_idx in range(num_frames): + frames = [traj[frame_idx] for traj in trajectories] + + all_forces = [atoms.get_forces() for atoms in frames] + all_errors = [] + + for atom_idx in range(len(frames[0])): + forces = [forces_arr[atom_idx] for forces_arr in all_forces] + + for a, b in itertools.combinations(forces, 2): + error = np.linalg.norm(a - b) + all_errors.append(error) + + max_error = np.max(all_errors) if all_errors else 0.0 + min_error = np.min(all_errors) if 
all_errors else 0.0 + avg_error = np.mean(all_errors) if all_errors else 0.0 + + # ase version >= 3.26.0, please update ase using "pip install git+https://gitlab.com/ase/ase.git" + devi.append([trajectories[0][frame_idx].info['timestep'],0,0,0,max_error, min_error, avg_error,0]) + + devi = np.array(devi) + write_model_devi_out(devi, fname=fname) From 48aac9ccdb263e52f80bded08faebfb0c4e5b96f Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Fri, 2 May 2025 09:16:29 +0800 Subject: [PATCH 05/49] fix rerun in lmp-nvnmd-template --- dpgen2/exploration/task/lmp_template_task_group.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpgen2/exploration/task/lmp_template_task_group.py b/dpgen2/exploration/task/lmp_template_task_group.py index 2b51c6a3..d8a28966 100644 --- a/dpgen2/exploration/task/lmp_template_task_group.py +++ b/dpgen2/exploration/task/lmp_template_task_group.py @@ -209,7 +209,7 @@ def revise_lmp_input_dump(lmp_lines, trj_freq, pimd_bead=None,nvnmd_version=None ] = f"dump dpgen_dump all custom {trj_freq} {lmp_traj_file_name} id type x y z fx fy fz" lmp_lines.insert( idx+1, - 'if \"${rerun} > 0\" then \"jump SELF rerun' + 'if \"${rerun} > 0\" then \"jump SELF rerun\"' ) return lmp_lines From 8a6918c66322dee2aafdef678dfab409752ea715 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Fri, 2 May 2025 23:31:13 +0800 Subject: [PATCH 06/49] fix rerun command in lmp-nvnmd-template --- dpgen2/exploration/task/lmp_template_task_group.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpgen2/exploration/task/lmp_template_task_group.py b/dpgen2/exploration/task/lmp_template_task_group.py index d8a28966..c740e448 100644 --- a/dpgen2/exploration/task/lmp_template_task_group.py +++ b/dpgen2/exploration/task/lmp_template_task_group.py @@ -230,7 +230,7 @@ def revise_lmp_input_rerun(lmp_lines): 'label rerun' ) lmp_lines.append( - f'rerun rerun {lmp_traj_name}.0 dump x y z fx fy fz add yes' + f'rerun {lmp_traj_name}.0 dump 
x y z fx fy fz add yes' ) lmp_lines.append( 'label end' From ff5da8d4ef40d1ac7ccd3117970055bcd59cdb95 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Sat, 3 May 2025 19:41:15 +0800 Subject: [PATCH 07/49] fix traj reader lammps for multi-systems in a traj --- dpgen2/exploration/render/traj_render_lammps.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/dpgen2/exploration/render/traj_render_lammps.py b/dpgen2/exploration/render/traj_render_lammps.py index 00b6a3de..8fec2744 100644 --- a/dpgen2/exploration/render/traj_render_lammps.py +++ b/dpgen2/exploration/render/traj_render_lammps.py @@ -15,6 +15,9 @@ import dpdata import numpy as np +from ase.io import ( + read, +) from dflow.python.opio import ( HDF5Dataset, ) @@ -123,12 +126,14 @@ def get_confs( traj = StringIO(trajs[ii].get_data()) # type: ignore else: traj = trajs[ii] - ss = dpdata.System(traj, fmt=traj_fmt, type_map=type_map) - ss.nopbc = self.nopbc - if ele_temp: - self.set_ele_temp(ss, ele_temp[ii]) - ss = ss.sub_system(id_selected[ii]) - ms.append(ss) + #ss = dpdata.System(traj, fmt=traj_fmt, type_map=type_map) + ss = read(str(traj), format="lammps-dump-text", index=":", specorder=type_map) + for jj in id_selected[ii]: + s = dpdata.System(ss[jj], fmt="ase/structure", type_map=type_map) + s.nopbc = self.nopbc + if ele_temp: + self.set_ele_temp(s, ele_temp[ii]) + ms.append(s) if conf_filters is not None: ms = conf_filters.check(ms) return ms From 2d4e32d6841b5a174790fba8ff34d90c414522a0 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Fri, 16 May 2025 10:50:59 +0800 Subject: [PATCH 08/49] support init-model from freezed model in nvnmd --- dpgen2/entrypoint/submit.py | 3 - dpgen2/flow/dpgen_loop.py | 39 ----- dpgen2/op/prep_nvnmd_train.py | 2 +- dpgen2/op/run_nvnmd_train.py | 174 ++++++---------------- dpgen2/superop/block.py | 18 --- dpgen2/superop/prep_run_nvnmd_train.py | 16 +- dpgen2/utils/download_dpgen2_artifacts.py | 6 - 7 files changed, 49 
insertions(+), 209 deletions(-) diff --git a/dpgen2/entrypoint/submit.py b/dpgen2/entrypoint/submit.py index f3f00d3e..7019bdf2 100644 --- a/dpgen2/entrypoint/submit.py +++ b/dpgen2/entrypoint/submit.py @@ -699,9 +699,6 @@ def workflow_concurrent_learning( }, artifacts={ "init_models": init_models, - "init_models_ckpt_meta": None, - "init_models_ckpt_index": None, - "init_models_ckpt_data": None, "init_data": init_data, "iter_data": iter_data, }, diff --git a/dpgen2/flow/dpgen_loop.py b/dpgen2/flow/dpgen_loop.py index 66949e20..190a1090 100644 --- a/dpgen2/flow/dpgen_loop.py +++ b/dpgen2/flow/dpgen_loop.py @@ -186,9 +186,6 @@ def __init__( } self._input_artifacts = { "init_models": InputArtifact(optional=True), - "init_models_ckpt_meta": InputArtifact(optional=True), - "init_models_ckpt_data": InputArtifact(optional=True), - "init_models_ckpt_index": InputArtifact(optional=True), "init_data": InputArtifact(), "iter_data": InputArtifact(), } @@ -197,9 +194,6 @@ def __init__( } self._output_artifacts = { "models": OutputArtifact(), - "models_ckpt_meta": OutputArtifact(), - "models_ckpt_data": OutputArtifact(), - "models_ckpt_index": OutputArtifact(), "iter_data": OutputArtifact(), } @@ -283,9 +277,6 @@ def __init__( self._input_artifacts = { "init_models": InputArtifact(optional=True), - "init_models_ckpt_meta": InputArtifact(optional=True), - "init_models_ckpt_data": InputArtifact(optional=True), - "init_models_ckpt_index": InputArtifact(optional=True), "init_data": InputArtifact(), "iter_data": InputArtifact(), } @@ -294,9 +285,6 @@ def __init__( } self._output_artifacts = { "models": OutputArtifact(), - "models_ckpt_meta": OutputArtifact(), - "models_ckpt_data": OutputArtifact(), - "models_ckpt_index": OutputArtifact(), "iter_data": OutputArtifact(), } @@ -386,9 +374,6 @@ def _loop( parameters=block_common_parameters, artifacts={ "init_models": steps.inputs.artifacts["init_models"], - "init_models_ckpt_meta": steps.inputs.artifacts["init_models_ckpt_meta"], - 
"init_models_ckpt_index": steps.inputs.artifacts["init_models_ckpt_index"], - "init_models_ckpt_data": steps.inputs.artifacts["init_models_ckpt_data"], "init_data": steps.inputs.artifacts["init_data"], "iter_data": steps.inputs.artifacts["iter_data"], }, @@ -461,9 +446,6 @@ def _loop( parameters=next_common_parameters, artifacts={ "init_models": block_step.outputs.artifacts["models"], - "init_models_ckpt_meta": block_step.outputs.artifacts["models_ckpt_meta"], - "init_models_ckpt_data": block_step.outputs.artifacts["models_ckpt_data"], - "init_models_ckpt_index": block_step.outputs.artifacts["models_ckpt_index"], "init_data": steps.inputs.artifacts["init_data"], "iter_data": block_step.outputs.artifacts["iter_data"], }, @@ -483,21 +465,6 @@ def _loop( _then=block_step.outputs.artifacts["models"], _else=next_step.outputs.artifacts["models"], ) - steps.outputs.artifacts["models_ckpt_meta"].from_expression = if_expression( - _if=(scheduler_step.outputs.parameters["converged"] == True), - _then=block_step.outputs.artifacts["models_ckpt_meta"], - _else=next_step.outputs.artifacts["models_ckpt_meta"], - ) - steps.outputs.artifacts["models_ckpt_data"].from_expression = if_expression( - _if=(scheduler_step.outputs.parameters["converged"] == True), - _then=block_step.outputs.artifacts["models_ckpt_data"], - _else=next_step.outputs.artifacts["models_ckpt_data"], - ) - steps.outputs.artifacts["models_ckpt_index"].from_expression = if_expression( - _if=(scheduler_step.outputs.parameters["converged"] == True), - _then=block_step.outputs.artifacts["models_ckpt_index"], - _else=next_step.outputs.artifacts["models_ckpt_index"], - ) steps.outputs.artifacts["iter_data"].from_expression = if_expression( _if=(scheduler_step.outputs.parameters["converged"] == True), _then=block_step.outputs.artifacts["iter_data"], @@ -583,9 +550,6 @@ def _dpgen( parameters=common_parameters, artifacts={ "init_models": steps.inputs.artifacts["init_models"], - "init_models_ckpt_meta": 
steps.inputs.artifacts["init_models_ckpt_meta"], - "init_models_ckpt_data": steps.inputs.artifacts["init_models_ckpt_data"], - "init_models_ckpt_index": steps.inputs.artifacts["init_models_ckpt_index"], "init_data": steps.inputs.artifacts["init_data"], "iter_data": steps.inputs.artifacts["iter_data"], }, @@ -597,9 +561,6 @@ def _dpgen( "exploration_scheduler" ].value_from_parameter = loop_step.outputs.parameters["exploration_scheduler"] steps.outputs.artifacts["models"]._from = loop_step.outputs.artifacts["models"] - steps.outputs.artifacts["models_ckpt_meta"]._from = loop_step.outputs.artifacts["models_ckpt_meta"] - steps.outputs.artifacts["models_ckpt_data"]._from = loop_step.outputs.artifacts["models_ckpt_data"] - steps.outputs.artifacts["models_ckpt_index"]._from = loop_step.outputs.artifacts["models_ckpt_index"] steps.outputs.artifacts["iter_data"]._from = loop_step.outputs.artifacts[ "iter_data" ] diff --git a/dpgen2/op/prep_nvnmd_train.py b/dpgen2/op/prep_nvnmd_train.py index a1600635..45a7e121 100644 --- a/dpgen2/op/prep_nvnmd_train.py +++ b/dpgen2/op/prep_nvnmd_train.py @@ -114,6 +114,6 @@ def _script_rand_seed( jtmp = input_dict.copy() # the key "seed" in "nvnmd" is used to set the random seed for the network parameters, it is developing. 
- #jtmp["nvnmd"]["seed"] = random.randrange(sys.maxsize) % (2**32) + jtmp["nvnmd"]["seed"] = random.randrange(sys.maxsize) % (2**32) jtmp["training"]["seed"] = random.randrange(sys.maxsize) % (2**32) return jtmp diff --git a/dpgen2/op/run_nvnmd_train.py b/dpgen2/op/run_nvnmd_train.py index af0f395c..a2a1db1b 100644 --- a/dpgen2/op/run_nvnmd_train.py +++ b/dpgen2/op/run_nvnmd_train.py @@ -51,17 +51,13 @@ def _make_train_command( dp_command, train_script_name, - impl, do_init_model, init_model, - finetune_mode, - finetune_args, - init_model_with_finetune, train_args="", ): # find checkpoint - if impl == "tensorflow" and os.path.isfile("nvnmd_cnn/checkpoint") and not os.path.isfile("nvnmd_cnn/frozen_model.pb"): + if os.path.isfile("nvnmd_cnn/checkpoint") and not os.path.isfile("nvnmd_cnn/frozen_model.pb"): checkpoint = "nvnmd_cnn/model.ckpt" else: checkpoint = None @@ -69,28 +65,22 @@ def _make_train_command( if checkpoint is not None: command = dp_command + ["train-nvnmd", "--restart", checkpoint, train_script_name] return command - # case of init model and finetune + + # case of init model assert checkpoint is None - case_init_model = do_init_model # and (not init_model_with_finetune) - # nvnmd-train do not support initial model + case_init_model = do_init_model if case_init_model: - # not support initial frozen model in nvnmd - #init_flag = "--init-frz-model" if impl == "tensorflow" else "--init-model" - init_flag = "--init-model" - for i in init_model: - shutil.copy(i, "./") - + init_flag = "--init-frz-model" command = dp_command + [ "train-nvnmd", init_flag, - "model.ckpt", + init_model, train_script_name, ] else: command = dp_command + ["train-nvnmd", train_script_name] command += train_args.split() - print(command) return command @@ -106,7 +96,6 @@ class RunNvNMDTrain(OP): default_optional_parameter = { "mixed_type": False, - "finetune_mode": "no", } @classmethod @@ -121,9 +110,6 @@ def get_input_sign(cls): ), "task_path": Artifact(Path), "init_model": 
Artifact(Path, optional=True), - "init_model_ckpt_meta": Artifact(Path, optional=True), - "init_model_ckpt_data": Artifact(Path, optional=True), - "init_model_ckpt_index": Artifact(Path, optional=True), "init_data": Artifact(NestedDict[Path]), "iter_data": Artifact(List[Path]), "valid_data": Artifact(NestedDict[Path], optional=True), @@ -138,9 +124,6 @@ def get_output_sign(cls): "script": Artifact(Path), "cnn_model": Artifact(Path), "qnn_model": Artifact(Path), - "model_ckpt_data": Artifact(Path), - "model_ckpt_meta": Artifact(Path), - "model_ckpt_index": Artifact(Path), "lcurve": Artifact(Path), "log": Artifact(Path), } @@ -170,7 +153,8 @@ def execute( Any Output dict with components: - `script`: (`Artifact(Path)`) The training script. - - `model`: (`Artifact(Path)`) The trained frozen model. + - `cnn_model`: (`Artifact(Path)`) The trained continuous frozen model. + - `qnn_model`: (`Artifact(Path)`) The trained quantized frozen model. - `lcurve`: (`Artifact(Path)`) The learning curve file. - `log`: (`Artifact(Path)`) The log file of training. @@ -180,20 +164,13 @@ def execute( On the failure of training or freezing. Human intervention needed. 
""" mixed_type = ip["optional_parameter"]["mixed_type"] - finetune_mode = ip["optional_parameter"]["finetune_mode"] config = ip["config"] if ip["config"] is not None else {} - impl = ip["config"].get("impl", "tensorflow") dp_command = ip["config"].get("command", "dp").split() - assert impl in ["tensorflow"] - finetune_args = config.get("finetune_args", "") train_args = config.get("train_args", "") config = RunNvNMDTrain.normalize_config(config) task_name = ip["task_name"] task_path = ip["task_path"] init_model = ip["init_model"] - init_model_ckpt_data = ip["init_model_ckpt_data"] - init_model_ckpt_meta = ip["init_model_ckpt_meta"] - init_model_ckpt_index = ip["init_model_ckpt_index"] init_data = ip["init_data"] iter_data = ip["iter_data"] valid_data = ip["valid_data"] @@ -201,7 +178,6 @@ def execute( iter_data_new_exp = _expand_all_multi_sys_to_sys(iter_data[-1:]) iter_data_exp = iter_data_old_exp + iter_data_new_exp work_dir = Path(task_name) - init_model_with_finetune = config["init_model_with_finetune"] # update the input script input_script = Path(task_path) / train_script_name @@ -213,7 +189,6 @@ def execute( major_version = "2" # auto prob style - init_model = [init_model_ckpt_meta, init_model_ckpt_data, init_model_ckpt_index] if init_model is not None else init_model do_init_model = RunNvNMDTrain.decide_init_model( config, init_model, @@ -236,14 +211,14 @@ def execute( init_data, iter_data_exp, auto_prob_str, - "2", + major_version, valid_data, ) train_cnn_dict = RunNvNMDTrain.write_other_to_input_script( - train_dict, config, do_init_model, False, "2", + train_dict, config, do_init_model, False, major_version, ) train_qnn_dict = RunNvNMDTrain.write_other_to_input_script( - train_dict, config, do_init_model, True, "2", + train_dict, config, do_init_model, True, major_version, ) with set_directory(work_dir): @@ -268,51 +243,47 @@ def clean_before_quit(): command = _make_train_command( dp_command, train_cnn_script_name, - impl, do_init_model, init_model, - 
finetune_mode, - finetune_args, - init_model_with_finetune, train_args = "-s s1", ) - ret, out, err = run_command(command) - if ret != 0: - clean_before_quit() - logging.error( - "".join( - ( - "dp train failed\n", - "out msg: ", - out, - "\n", - "err msg: ", - err, - "\n", + if not RunNvNMDTrain.skip_training( + train_dict, config, init_model, iter_data + ): + ret, out, err = run_command(command) + if ret != 0: + clean_before_quit() + logging.error( + "".join( + ( + "dp train-nvnmd -s s1 failed\n", + "out msg: ", + out, + "\n", + "err msg: ", + err, + "\n", + ) ) ) - ) - raise FatalError("dp train failed") - fplog.write("#=================== train std out ===================\n") - fplog.write(out) - fplog.write("#=================== train std err ===================\n") - fplog.write(err) - - #if RunNvNMDTrain.skip_training( - #work_dir, train_dict, init_model, iter_data, finetune_mode - #): + raise FatalError("dp train-nvnmd -s s1 failed") + fplog.write("#=================== train std out ===================\n") + fplog.write(out) + fplog.write("#=================== train std err ===================\n") + fplog.write(err) + + cnn_model_file = "nvnmd_cnn/frozen_model.pb" - # train model + else: + cnn_model_file = init_model + + # train qnn model command = _make_train_command( dp_command, train_qnn_script_name, - impl, do_init_model, init_model, - finetune_mode, - finetune_args, - init_model_with_finetune, train_args = "-s s2", ) @@ -322,7 +293,7 @@ def clean_before_quit(): logging.error( "".join( ( - "nvnmd train (cnn) failed\n", + "dp train-nvnmd -s s2 failed\n", "out msg: ", out, "\n", @@ -332,32 +303,24 @@ def clean_before_quit(): ) ) ) - raise FatalError("nvnmd train (cnn) failed") + raise FatalError("dp train-nvnmd -s s2 failed") fplog.write("#=================== train std out ===================\n") fplog.write(out) fplog.write("#=================== train std err ===================\n") fplog.write(err) - - if finetune_mode == "finetune" and 
os.path.exists("input_v2_compat.json"): - shutil.copy2("input_v2_compat.json", train_script_name) - - # freeze model - cnn_model_file = "nvnmd_cnn/frozen_model.pb" - model_ckpt_data_file = "nvnmd_cnn/model.ckpt.data-00000-of-00001" - model_ckpt_index_file = "nvnmd_cnn/model.ckpt.index" - model_ckpt_meta_file = "nvnmd_cnn/model.ckpt.meta" + qnn_model_file = "nvnmd_qnn/model.pb" - lcurve_file = "nvnmd_cnn/lcurve.out" + lcurve_file = "nvnmd_qnn/lcurve.out" + if os.path.exists("input_v2_compat.json"): + shutil.copy2("input_v2_compat.json", train_script_name) + clean_before_quit() return OPIO( { "script": work_dir / train_script_name, "cnn_model": work_dir / cnn_model_file, - "model_ckpt_data": work_dir / model_ckpt_data_file, - "model_ckpt_meta": work_dir / model_ckpt_meta_file, - "model_ckpt_index": work_dir / model_ckpt_index_file, "qnn_model": work_dir / qnn_model_file, "lcurve": work_dir / lcurve_file, "log": work_dir / "train.log", @@ -455,11 +418,7 @@ def skip_training( train_dict, init_model, iter_data, - finetune_mode, ): - # do not skip if we do finetuning - #if finetune_mode is not None and finetune_mode == "finetune": - # return False # we have init model and no iter data, skip training if (init_model is not None) and (iter_data is None or len(iter_data) == 0): with set_directory(work_dir): @@ -513,7 +472,6 @@ def decide_init_model( @staticmethod def training_args(): doc_command = "The command for DP, 'dp' for default" - doc_impl = "The implementation/backend of DP. It can be 'tensorflow' or 'pytorch'. 'tensorflow' for default." doc_init_model_policy = "The policy of init-model training. It can be\n\n\ - 'no': No init-model training. 
Traing from scratch.\n\n\ - 'yes': Do init-model training.\n\n\ @@ -530,10 +488,6 @@ def training_args(): doc_init_model_start_pref_v = ( "The start virial prefactor in loss when init-model" ) - doc_finetune_args = "Extra arguments for finetuning" - doc_multitask = "Do multitask training" - doc_head = "Head to use in the multitask training" - doc_init_model_with_finetune = "Use finetune for init model" doc_train_args = "Extra arguments for dp train" return [ Argument( @@ -543,14 +497,6 @@ def training_args(): default="dp", doc=doc_command, ), - Argument( - "impl", - str, - optional=True, - default="tensorflow", - doc=doc_impl, - alias=["backend"], - ), Argument( "init_model_policy", str, @@ -601,34 +547,6 @@ def training_args(): default=0.0, doc=doc_init_model_start_pref_v, ), - Argument( - "init_model_with_finetune", - bool, - optional=True, - default=False, - doc=doc_init_model_with_finetune, - ), - Argument( - "finetune_args", - str, - optional=True, - default="", - doc=doc_finetune_args, - ), - Argument( - "multitask", - bool, - optional=True, - default=False, - doc=doc_multitask, - ), - Argument( - "head", - str, - optional=True, - default=None, - doc=doc_head, - ), Argument( "train_args", str, diff --git a/dpgen2/superop/block.py b/dpgen2/superop/block.py index d9d12932..a9beea07 100644 --- a/dpgen2/superop/block.py +++ b/dpgen2/superop/block.py @@ -113,9 +113,6 @@ def __init__( } self._input_artifacts = { "init_models": InputArtifact(optional=True), - "init_models_ckpt_index": InputArtifact(optional=True), - "init_models_ckpt_data": InputArtifact(optional=True), - "init_models_ckpt_meta": InputArtifact(optional=True), "init_data": InputArtifact(), "iter_data": InputArtifact(), } @@ -124,9 +121,6 @@ def __init__( } self._output_artifacts = { "models": OutputArtifact(), - "models_ckpt_index": OutputArtifact(), - "models_ckpt_data": OutputArtifact(), - "models_ckpt_meta": OutputArtifact(), "iter_data": OutputArtifact(), "trajs": OutputArtifact(), } @@ -230,9 
+224,6 @@ def _block_cl( }, artifacts={ "init_models": block_steps.inputs.artifacts["init_models"], - "init_models_ckpt_data": block_steps.inputs.artifacts["init_models_ckpt_data"], - "init_models_ckpt_index": block_steps.inputs.artifacts["init_models_ckpt_index"], - "init_models_ckpt_meta": block_steps.inputs.artifacts["init_models_ckpt_meta"], "init_data": block_steps.inputs.artifacts["init_data"], "iter_data": block_steps.inputs.artifacts["iter_data"], }, @@ -331,15 +322,6 @@ def _block_cl( block_steps.outputs.artifacts["models"]._from = prep_run_dp_train.outputs.artifacts[ "models" ] - block_steps.outputs.artifacts["models_ckpt_meta"]._from = prep_run_dp_train.outputs.artifacts[ - "models_ckpt_meta" - ] - block_steps.outputs.artifacts["models_ckpt_data"]._from = prep_run_dp_train.outputs.artifacts[ - "models_ckpt_data" - ] - block_steps.outputs.artifacts["models_ckpt_index"]._from = prep_run_dp_train.outputs.artifacts[ - "models_ckpt_index" - ] block_steps.outputs.artifacts["iter_data"]._from = collect_data.outputs.artifacts[ "iter_data" ] diff --git a/dpgen2/superop/prep_run_nvnmd_train.py b/dpgen2/superop/prep_run_nvnmd_train.py index 9fc8ea70..a95d9ba2 100644 --- a/dpgen2/superop/prep_run_nvnmd_train.py +++ b/dpgen2/superop/prep_run_nvnmd_train.py @@ -79,9 +79,6 @@ def __init__( } self._input_artifacts = { "init_models": InputArtifact(optional=True), - "init_models_ckpt_data": InputArtifact(optional=True), - "init_models_ckpt_index": InputArtifact(optional=True), - "init_models_ckpt_meta": InputArtifact(optional=True), "init_data": InputArtifact(), "iter_data": InputArtifact(), } @@ -92,9 +89,6 @@ def __init__( "scripts": OutputArtifact(), "models": OutputArtifact(), "nvnmodels": OutputArtifact(), - "models_ckpt_meta": OutputArtifact(), - "models_ckpt_data": OutputArtifact(), - "models_ckpt_index": OutputArtifact(), "logs": OutputArtifact(), "lcurves": OutputArtifact(), } @@ -198,8 +192,8 @@ def _prep_run_nvnmd_train( slices=Slices( "int('{{item}}')", 
input_parameter=["task_name"], - input_artifact=["task_path", "init_model", "init_model_ckpt_meta", "init_model_ckpt_data", "init_model_ckpt_index"], - output_artifact=["cnn_model", "qnn_model", "model_ckpt_data", "model_ckpt_meta", "model_ckpt_index", "lcurve", "log", "script"], + input_artifact=["task_path", "init_model"], + output_artifact=["cnn_model", "qnn_model", "lcurve", "log", "script"], **template_slice_config, ), python_packages=upload_python_packages, @@ -215,9 +209,6 @@ def _prep_run_nvnmd_train( artifacts={ "task_path": prep_train.outputs.artifacts["task_paths"], "init_model": train_steps.inputs.artifacts["init_models"], - "init_model_ckpt_meta": train_steps.inputs.artifacts["init_models_ckpt_meta"], - "init_model_ckpt_data": train_steps.inputs.artifacts["init_models_ckpt_data"], - "init_model_ckpt_index": train_steps.inputs.artifacts["init_models_ckpt_index"], "init_data": train_steps.inputs.artifacts["init_data"], "iter_data": train_steps.inputs.artifacts["iter_data"], "valid_data": valid_data, @@ -244,9 +235,6 @@ def _prep_run_nvnmd_train( ] train_steps.outputs.artifacts["models"]._from = run_train.outputs.artifacts["cnn_model"] train_steps.outputs.artifacts["nvnmodels"]._from = run_train.outputs.artifacts["qnn_model"] - train_steps.outputs.artifacts["models_ckpt_meta"]._from = run_train.outputs.artifacts["model_ckpt_meta"] - train_steps.outputs.artifacts["models_ckpt_data"]._from = run_train.outputs.artifacts["model_ckpt_data"] - train_steps.outputs.artifacts["models_ckpt_index"]._from = run_train.outputs.artifacts["model_ckpt_index"] train_steps.outputs.artifacts["logs"]._from = run_train.outputs.artifacts["log"] train_steps.outputs.artifacts["lcurves"]._from = run_train.outputs.artifacts[ "lcurve" diff --git a/dpgen2/utils/download_dpgen2_artifacts.py b/dpgen2/utils/download_dpgen2_artifacts.py index db4b7e6c..b5f69153 100644 --- a/dpgen2/utils/download_dpgen2_artifacts.py +++ b/dpgen2/utils/download_dpgen2_artifacts.py @@ -54,16 +54,10 @@ def 
add_output( op_download_setting = { "prep-run-train": DownloadDefinition() .add_input("init_models") - .add_input("init_models_ckpt_meta") - .add_input("init_models_ckpt_data") - .add_input("init_models_ckpt_index") .add_input("init_data") .add_input("iter_data") .add_output("scripts") .add_output("models") - .add_output("models_ckpt_meta") - .add_output("models_ckpt_data") - .add_output("models_ckpt_index") .add_output("logs") .add_output("lcurves"), "prep-run-explore": DownloadDefinition() From c1dd7415eba4e381c81d2ef0890700b4f74bed23 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Fri, 16 May 2025 11:45:50 +0800 Subject: [PATCH 09/49] fix args in nvnmd --- dpgen2/entrypoint/args.py | 49 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/dpgen2/entrypoint/args.py b/dpgen2/entrypoint/args.py index b4553157..7fcd91ce 100644 --- a/dpgen2/entrypoint/args.py +++ b/dpgen2/entrypoint/args.py @@ -28,6 +28,9 @@ from dpgen2.op.run_dp_train import ( RunDPTrain, ) +from dpgen2.op.run_nvnmd_train import ( + RunNvNMDTrain, +) from dpgen2.op.run_lmp import ( RunLmp, ) @@ -126,6 +129,50 @@ def dp_train_args(): ), ] +def nvnmd_train_args(): + doc_numb_models = "Number of models trained for evaluating the model deviation" + doc_config = "Configuration of training" + doc_template_script = "File names of the template training script. It can be a `List[str]`, the length of which is the same as `numb_models`. Each template script in the list is used to train a model. Can be a `str`, the models share the same template training script. 
" + doc_init_models_paths = "the paths to initial models" + doc_init_models_uri = "The URI of initial models" + doc_optional_files = "Optional files for training" + + return [ + Argument( + "config", + dict, + RunNvNMDTrain.training_args(), + optional=True, + default=RunNvNMDTrain.normalize_config({}), + doc=doc_numb_models, + ), + Argument("numb_models", int, optional=True, default=4, doc=doc_numb_models), + Argument( + "template_script", [List[str], str], optional=False, doc=doc_template_script + ), + Argument( + "init_models_paths", + List[str], + optional=True, + default=None, + doc=doc_init_models_paths, + alias=["training_iter0_model_path"], + ), + Argument( + "init_models_uri", + str, + optional=True, + default=None, + doc=doc_init_models_uri, + ), + Argument( + "optional_files", + list, + optional=True, + default=None, + doc=doc_optional_files, + ), + ] def variant_train(): doc = "the type of the training" @@ -133,7 +180,7 @@ def variant_train(): "type", [ Argument("dp", dict, dp_train_args()), - Argument("dp-nvnmd", dict, dp_train_args()), + Argument("dp-nvnmd", dict, nvnmd_train_args()), Argument("dp-dist", dict, dp_dist_train_args()), ], doc=doc, From 6b3a0e1df5e74cd9376bd463d838ffe6eb7aa099 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Sat, 17 May 2025 10:29:44 +0800 Subject: [PATCH 10/49] add nvnmd test unit --- dpgen2/op/run_nvnmd_train.py | 3 +- tests/mocked_ops.py | 316 +++++++++++++ tests/op/test_prep_nvnmd_train.py | 173 +++++++ tests/op/test_run_nvnmd.py | 351 ++++++++++++++ tests/op/test_run_nvnmd_train.py | 735 +++++++++++++++++++++++++++++ tests/test_prep_run_lmp.py | 1 + tests/test_prep_run_nvnmd_train.py | 395 ++++++++++++++++ 7 files changed, 1972 insertions(+), 2 deletions(-) create mode 100644 tests/op/test_prep_nvnmd_train.py create mode 100644 tests/op/test_run_nvnmd.py create mode 100644 tests/op/test_run_nvnmd_train.py create mode 100644 tests/test_prep_run_nvnmd_train.py diff --git a/dpgen2/op/run_nvnmd_train.py 
b/dpgen2/op/run_nvnmd_train.py index a2a1db1b..283d52b3 100644 --- a/dpgen2/op/run_nvnmd_train.py +++ b/dpgen2/op/run_nvnmd_train.py @@ -249,7 +249,7 @@ def clean_before_quit(): ) if not RunNvNMDTrain.skip_training( - train_dict, config, init_model, iter_data + work_dir, train_dict, init_model, iter_data ): ret, out, err = run_command(command) if ret != 0: @@ -405,7 +405,6 @@ def write_other_to_input_script( v["start_pref_e"] = 1 v["start_pref_f"] = 1 v["start_pref_v"] = 1 - odict["learning_rate"]["start_lr"] = odict["learning_rate"]["stop_lr"] if major_version == "1": odict["training"]["stop_batch"] = 0 elif major_version == "2": diff --git a/tests/mocked_ops.py b/tests/mocked_ops.py index 9cd13c00..1c2bad66 100644 --- a/tests/mocked_ops.py +++ b/tests/mocked_ops.py @@ -92,9 +92,15 @@ from dpgen2.op.run_dp_train import ( RunDPTrain, ) +from dpgen2.op.run_nvnmd_train import ( + RunNvNMDTrain, +) from dpgen2.op.run_lmp import ( RunLmp, ) +from dpgen2.op.run_nvnmd import ( + RunNvNMD, +) from dpgen2.op.select_confs import ( SelectConfs, ) @@ -351,6 +357,239 @@ def execute( ) +class MockedPrepNvNMDTrain(PrepDPTrain): + @OP.exec_sign_check + def execute( + self, + ip: OPIO, + ) -> OPIO: + template = ip["template_script"] + numb_models = ip["numb_models"] + ofiles = [] + osubdirs = [] + + assert template == mocked_template_script + assert numb_models == mocked_numb_models + + for ii in range(numb_models): + jtmp = template + jtmp["seed"] = ii + subdir = Path(train_task_pattern % ii) + subdir.mkdir(exist_ok=True, parents=True) + fname = subdir / "input.json" + with open(fname, "w") as fp: + json.dump(jtmp, fp, indent=4) + osubdirs.append(str(subdir)) + ofiles.append(fname) + + op = OPIO( + { + "task_names": osubdirs, + "task_paths": [Path(ii) for ii in osubdirs], + } + ) + return op + + +class MockedRunNvNMDTrain(RunNvNMDTrain): + @OP.exec_sign_check + def execute( + self, + ip: OPIO, + ) -> OPIO: + work_dir = Path(ip["task_name"]) + script = ip["task_path"] / 
"input.json" + init_model = Path(ip["init_model"]) + init_data = ip["init_data"] + iter_data = ip["iter_data"] + + assert script.is_file() + assert ip["task_path"].is_dir() + assert init_model.is_file() + assert len(init_data) == 2 + assert re.match("task.[0-9][0-9][0-9][0-9]", ip["task_name"]) + task_id = int(ip["task_name"].split(".")[1]) + assert ip["task_name"] in str(ip["task_path"]) + assert "model" in str(ip["init_model"]) + assert ".pb" in str(ip["init_model"]) + list_init_data = sorted([str(ii) for ii in init_data]) + assert "init_data/bar" in list_init_data[0] + assert "init_data/foo" in list_init_data[1] + assert Path(list_init_data[0]).is_dir() + assert Path(list_init_data[1]).is_dir() + + script = Path(script).resolve() + init_model = init_model.resolve() + init_model_str = str(init_model) + init_data = [ii.resolve() for ii in init_data] + iter_data = [ii.resolve() for ii in iter_data] + init_data_str = [str(ii) for ii in init_data] + iter_data_str = [str(ii) for ii in iter_data] + + with open(script) as fp: + jtmp = json.load(fp) + data = [] + for ii in sorted(init_data_str): + data.append(ii) + for ii in sorted(iter_data_str): + data.append(ii) + jtmp["data"] = data + with open(script, "w") as fp: + json.dump(jtmp, fp, indent=4) + + cwd = os.getcwd() + work_dir.mkdir(exist_ok=True, parents=True) + os.chdir(work_dir) + + oscript = Path("input.json") + if not oscript.exists(): + from shutil import ( + copyfile, + ) + + copyfile(script, oscript) + cnn_model = Path("frozen_model.pb") + qnn_model = Path("model.pb") + lcurve = Path("lcurve.out") + log = Path("log") + + assert init_model.exists() + with log.open("w") as f: + f.write(f"init_model {str(init_model)} OK\n") + for ii in jtmp["data"]: + assert Path(ii).exists() + assert (ii in init_data_str) or (ii in iter_data_str) + with log.open("a") as f: + f.write(f"data {str(ii)} OK\n") + assert script.exists() + with log.open("a") as f: + f.write(f"script {str(script)} OK\n") + + with cnn_model.open("w") 
as f: + f.write("read from init model: \n") + f.write(init_model.read_text() + "\n") + with qnn_model.open("w") as f: + f.write("read from init model: \n") + f.write(init_model.read_text() + "\n") + with lcurve.open("w") as f: + f.write("read from train_script: \n") + f.write(script.read_text() + "\n") + + os.chdir(cwd) + + return OPIO( + { + "script": work_dir / oscript, + "cnn_model": work_dir / cnn_model, + "qnn_model": work_dir / qnn_model, + "lcurve": work_dir / lcurve, + "log": work_dir / log, + } + ) + + +class MockedRunNvNMDTrainCheckOptParam(RunDPTrain): + @OP.exec_sign_check + def execute( + self, + ip: OPIO, + ) -> OPIO: + if not ip["optional_parameter"]["mixed_type"]: + raise FatalError( + f"the value of mixed_type is {ip['optional_parameter']['mixed_type']} " + ) + if not ip["optional_parameter"]["finetune_mode"]: + raise FatalError( + f"the value of finetune_mode is {ip['optional_parameter']['finetune_mode']} " + ) + return MockedRunDPTrain.execute(self, ip) + + +class MockedRunNvNMDTrainNoneInitModel(RunNvNMDTrain): + @OP.exec_sign_check + def execute( + self, + ip: OPIO, + ) -> OPIO: + work_dir = Path(ip["task_name"]) + script = ip["task_path"] / "input.json" + if ip["init_model"] is not None: + raise FatalError("init model is not None") + init_data = ip["init_data"] + iter_data = ip["iter_data"] + + assert script.is_file() + assert ip["task_path"].is_dir() + assert len(init_data) == 2 + assert re.match("task.[0-9][0-9][0-9][0-9]", ip["task_name"]) + task_id = int(ip["task_name"].split(".")[1]) + assert ip["task_name"] in str(ip["task_path"]) + list_init_data = sorted([str(ii) for ii in init_data]) + assert "init_data/bar" in list_init_data[0] + assert "init_data/foo" in list_init_data[1] + assert Path(list_init_data[0]).is_dir() + assert Path(list_init_data[1]).is_dir() + + script = Path(script).resolve() + init_data = [ii.resolve() for ii in init_data] + iter_data = [ii.resolve() for ii in iter_data] + init_data_str = [str(ii) for ii in init_data] 
+ iter_data_str = [str(ii) for ii in iter_data] + + with open(script) as fp: + jtmp = json.load(fp) + data = [] + for ii in sorted(init_data_str): + data.append(ii) + for ii in sorted(iter_data_str): + data.append(ii) + jtmp["data"] = data + with open(script, "w") as fp: + json.dump(jtmp, fp, indent=4) + + cwd = os.getcwd() + work_dir.mkdir(exist_ok=True, parents=True) + os.chdir(work_dir) + + oscript = Path("input.json") + if not oscript.exists(): + from shutil import ( + copyfile, + ) + + copyfile(script, oscript) + model = Path("model.pb") + lcurve = Path("lcurve.out") + log = Path("log") + + for ii in jtmp["data"]: + assert Path(ii).exists() + assert (ii in init_data_str) or (ii in iter_data_str) + with log.open("a") as f: + f.write(f"data {str(ii)} OK\n") + assert script.exists() + with log.open("a") as f: + f.write(f"script {str(script)} OK\n") + + with model.open("w") as f: + f.write("read from init model: \n") + with lcurve.open("w") as f: + f.write("read from train_script: \n") + f.write(script.read_text() + "\n") + + os.chdir(cwd) + + return OPIO( + { + "script": work_dir / oscript, + "cnn_model": work_dir / model, + "qnn_model": work_dir / model, + "lcurve": work_dir / lcurve, + "log": work_dir / log, + } + ) + + class MockedRunLmp(RunLmp): @OP.exec_sign_check def execute( @@ -428,6 +667,83 @@ def execute( ) +class MockedRunNvNMD(RunNvNMD): + @OP.exec_sign_check + def execute( + self, + ip: OPIO, + ) -> OPIO: + task_name = ip["task_name"] + task_path = ip["task_path"] + models = ip["models"] + + assert ip["task_path"].is_dir() + assert re.match("task.[0-9][0-9][0-9][0-9][0-9][0-9]", ip["task_name"]) + task_id = int(ip["task_name"].split(".")[1]) + assert task_path.is_dir() + assert ip["task_name"] in str(ip["task_path"]) + assert ( + len(models) == mocked_numb_models + ), f"{len(models)} == {mocked_numb_models}" + for ii in range(mocked_numb_models): + assert ip["models"][ii].is_file() + assert "model" in str(ip["models"][ii]) + assert ".pb" in 
str(ip["models"][ii]) + assert (task_path / lmp_conf_name).is_file() + assert (task_path / lmp_input_name).is_file() + + task_path = task_path.resolve() + models = [ii.resolve() for ii in models] + models_str = [str(ii) for ii in models] + + work_dir = Path(task_name) + + cwd = os.getcwd() + work_dir.mkdir(exist_ok=True, parents=True) + os.chdir(work_dir) + + import glob + + ifiles = glob.glob(str(task_path / "*")) + for ii in ifiles: + if not Path(Path(ii).name).exists(): + Path(Path(ii).name).symlink_to(ii) + for ii in models: + if not Path(Path(ii).name).exists(): + Path(Path(ii).name).symlink_to(ii) + + log = Path(lmp_log_name) + traj = Path(lmp_traj_name) + model_devi = Path(lmp_model_devi_name) + + # fc = ['log of {task_name}'] + # for ii in ['conf.lmp', 'in.lammps'] + models_str: + # if Path(ii).exists(): + # fc.append(f'{ii} OK') + # log.write_text('\n'.join(fc)) + # log.write_text('log of {task_name}') + fc = [] + for ii in [lmp_conf_name, lmp_input_name] + [ii.name for ii in models]: + fc.append(Path(ii).read_text()) + log.write_text("\n".join(fc)) + model_devi.write_text(f"model_devi of {task_name}") + traj_out = [] + traj_out.append(f"traj of {task_name}") + traj_out.append(Path(lmp_conf_name).read_text()) + traj_out.append(Path(lmp_input_name).read_text()) + traj.write_text("\n".join(traj_out)) + + os.chdir(cwd) + + return OPIO( + { + "log": work_dir / log, + "traj": work_dir / traj, + "model_devi": work_dir / model_devi, + } + ) + + class MockedPrepVasp(PrepVasp): @OP.exec_sign_check def execute( diff --git a/tests/op/test_prep_nvnmd_train.py b/tests/op/test_prep_nvnmd_train.py new file mode 100644 index 00000000..0d5fe698 --- /dev/null +++ b/tests/op/test_prep_nvnmd_train.py @@ -0,0 +1,173 @@ +import json +import shutil +import unittest +from pathlib import ( + Path, +) + +import numpy as np +from dflow.python import ( + OP, + OPIO, + Artifact, + OPIOSign, +) +from mock import ( + mock, +) + +# isort: off +from .context import ( + dpgen2, +) +from 
dpgen2.constants import ( + train_script_name, + train_task_pattern, +) +from dpgen2.op.prep_nvnmd_train import ( + PrepNvNMDTrain, +) + +# isort: on + +template_script_nvnmd_v0 = { + "nvnmd": { + "version": 0, + "seed": 1 + }, + "training": { + "systems": [], + "stop_batch": 2000, + "batch_size": "auto", + "seed": 1, + }, +} + + +template_script_nvnmd_v1 = { + "nvnmd": { + "version": 1, + "seed": 1 + }, + "training": { + "systems": [], + "stop_batch": 2000, + "batch_size": "auto", + "seed": 1, + }, +} + + +class faked_rg: + faked_random = -1 + + @classmethod + def randrange(cls, xx): + cls.faked_random += 1 + return cls.faked_random + + +class TestPrepNvNMDTrain(unittest.TestCase): + def setUp(self): + self.numb_models = 2 + self.ptrain = PrepNvNMDTrain() + + def tearDown(self): + for ii in range(self.numb_models): + if Path(train_task_pattern % ii).exists(): + shutil.rmtree(train_task_pattern % ii) + + def _check_output_dir_and_file_exist(self, op, numb_models): + task_names = op["task_names"] + task_paths = op["task_paths"] + for ii in range(self.numb_models): + self.assertEqual(train_task_pattern % ii, task_names[ii]) + self.assertEqual(Path(train_task_pattern % ii), task_paths[ii]) + self.assertTrue(task_paths[ii].is_dir()) + self.assertTrue((task_paths[ii] / train_script_name).is_file()) + + def test_template_nvnmd_v1(self): + ip = OPIO( + {"template_script": template_script_nvnmd_v1, "numb_models": self.numb_models} + ) + + faked_rg.faked_random = -1 + with mock.patch("random.randrange", faked_rg.randrange): + op = self.ptrain.execute(ip) + + self._check_output_dir_and_file_exist(op, self.numb_models) + + for ii in range(self.numb_models): + with open(Path(train_task_pattern % ii) / train_script_name) as fp: + jdata = json.load(fp) + self.assertEqual(jdata["nvnmd"]["version"], 1) + self.assertEqual(jdata["nvnmd"]["seed"], 2 * ii + 0) + self.assertEqual(jdata["training"]["seed"], 2 * ii + 1) + + def test_template_nvnmd_v0(self): + ip = OPIO( + { + 
"template_script": template_script_nvnmd_v0, + "numb_models": self.numb_models, + } + ) + + faked_rg.faked_random = -1 + with mock.patch("random.randrange", faked_rg.randrange): + op = self.ptrain.execute(ip) + + self._check_output_dir_and_file_exist(op, self.numb_models) + + for ii in range(self.numb_models): + with open(Path(train_task_pattern % ii) / train_script_name) as fp: + jdata = json.load(fp) + self.assertEqual(jdata["nvnmd"]["version"], 0) + self.assertEqual(jdata["nvnmd"]["seed"], 2 * ii + 0) + self.assertEqual(jdata["training"]["seed"], 2 * ii + 1) + + def test_template_list_nvnmd_v0_v1(self): + ip = OPIO( + { + "template_script": [template_script_nvnmd_v0, template_script_nvnmd_v1], + "numb_models": self.numb_models, + } + ) + + faked_rg.faked_random = -1 + with mock.patch("random.randrange", faked_rg.randrange): + op = self.ptrain.execute(ip) + + self._check_output_dir_and_file_exist(op, self.numb_models) + + ii = 0 + with open(Path(train_task_pattern % ii) / train_script_name) as fp: + jdata = json.load(fp) + self.assertEqual(jdata["nvnmd"]["version"], 0) + self.assertEqual(jdata["nvnmd"]["seed"], 2 * ii) + self.assertEqual(jdata["training"]["seed"], 2 * ii + 1) + ii = 1 + with open(Path(train_task_pattern % ii) / train_script_name) as fp: + jdata = json.load(fp) + self.assertEqual(jdata["nvnmd"]["version"], 1) + self.assertEqual(jdata["nvnmd"]["seed"], 2 * ii) + self.assertEqual(jdata["training"]["seed"], 2 * ii + 1) + + def test_template_raise_wrong_list_length(self): + ip = OPIO( + { + "template_script": [ + template_script_nvnmd_v1, + template_script_nvnmd_v0, + template_script_nvnmd_v1 + ], + "numb_models": self.numb_models, + } + ) + + with self.assertRaises(RuntimeError) as context: + faked_rg.faked_random = -1 + with mock.patch("random.randrange", faked_rg.randrange): + op = self.ptrain.execute(ip) + self.assertTrue( + "length of the template list should be equal to 2" in str(context.exception) + ) diff --git a/tests/op/test_run_nvnmd.py 
b/tests/op/test_run_nvnmd.py new file mode 100644 index 00000000..44785437 --- /dev/null +++ b/tests/op/test_run_nvnmd.py @@ -0,0 +1,351 @@ +import json +import os +import shutil +import unittest +from pathlib import ( + Path, +) + +import dpdata +import numpy as np +from dflow.python import ( + OP, + OPIO, + Artifact, + OPIOSign, + TransientError, +) +from mock import ( + call, + mock, + patch, +) + +# isort: off +from .context import ( + dpgen2, +) +from dpgen2.constants import ( + lmp_conf_name, + lmp_input_name, + lmp_log_name, + lmp_model_devi_name, + lmp_traj_name, + model_name_pattern, +) +from dpgen2.op.run_nvnmd import ( + RunNvNMD, + get_ele_temp, + merge_pimd_files, + set_models, +) +from dpgen2.utils import ( + BinaryFileInput, +) + +# isort: on + + +class TestRunNvNMD(unittest.TestCase): + def setUp(self): + self.task_path = Path("task/path") + self.task_path.mkdir(parents=True, exist_ok=True) + self.model_path = Path("models/path") + self.model_path.mkdir(parents=True, exist_ok=True) + (self.task_path / lmp_conf_name).write_text("foo") + (self.task_path / lmp_input_name).write_text("bar") + self.task_name = "task_000" + self.models = [self.model_path / Path(f"model_{ii}.pb") for ii in range(4)] + for idx, ii in enumerate(self.models): + ii.write_text(f"model{idx}") + + def tearDown(self): + if Path("task").is_dir(): + shutil.rmtree("task") + if Path("models").is_dir(): + shutil.rmtree("models") + if Path(self.task_name).is_dir(): + shutil.rmtree(self.task_name) + + @patch("dpgen2.op.run_nvnmd.run_command") + def test_success(self, mocked_run): + mocked_run.side_effect = [(0, "foo\n", "")] + op = RunNvNMD() + out = op.execute( + OPIO( + { + "config": {"command": "mylmp"}, + "task_name": self.task_name, + "task_path": self.task_path, + "models": self.models, + } + ) + ) + work_dir = Path(self.task_name) + # check output + self.assertEqual(out["log"], work_dir / lmp_log_name) + self.assertEqual(out["traj"], work_dir / lmp_traj_name) + 
self.assertEqual(out["model_devi"], work_dir / lmp_model_devi_name) + # check call + calls = [ + call( + " ".join(["mylmp", "-i", lmp_input_name, "-log", lmp_log_name]), + shell=True, + ), + ] + mocked_run.assert_has_calls(calls) + # check input files are correctly linked + self.assertEqual((work_dir / lmp_conf_name).read_text(), "foo") + self.assertEqual((work_dir / lmp_input_name).read_text(), "bar") + for ii in range(4): + self.assertEqual( + (work_dir / (model_name_pattern % ii)).read_text(), f"model{ii}" + ) + + @patch("dpgen2.op.run_lmp.run_command") + def test_error(self, mocked_run): + mocked_run.side_effect = [(1, "foo\n", "")] + op = RunNvNMD() + with self.assertRaises(TransientError) as ee: + out = op.execute( + OPIO( + { + "config": {"command": "mylmp"}, + "task_name": self.task_name, + "task_path": self.task_path, + "models": self.models, + } + ) + ) + # check call + calls = [ + call( + " ".join(["mylmp", "-i", lmp_input_name, "-log", lmp_log_name]), + shell=True, + ), + ] + mocked_run.assert_has_calls(calls) + + +class TestRunNvNMDDist(unittest.TestCase): + lmp_config = """variable NSTEPS equal 1000 + +units metal +boundary p p p +atom_style atomic + +neighbor 1.0 bin + +box tilt large +if "${restart} > 0" then "read_restart dpgen.restart.*" else "read_data conf.lmp" + +group target_element_1 type 4 +#set group other_element type/subset ${ELEMENT_TYPE_4} ${ELEMENT_NUMB_4} ${OUTER_RANDOM_SEED_4} + +change_box all triclinic +mass 6 26.980000 +pair_style deepmd model.000.pb out_freq 10 out_file model_devi.out +pair_coeff * * + +thermo_style custom step temp pe ke etotal press vol lx ly lz xy xz yz +thermo ${THERMO_FREQ} +#dump 1 all custom ${DUMP_FREQ} traj/*.lammpstrj id type x y z fx fy fz + +if "${restart} == 0" then "velocity all create 2754.34 709383" +fix 1 all npt temp 2754.34 2754.34 ${TAU_T} iso 1.0 1.0 ${TAU_P} +timestep 0.002000 +run 3000 upto +""" + + def setUp(self): + self.task_path = Path("task/path") + self.task_path.mkdir(parents=True, 
exist_ok=True) + self.model_path = Path("models/path") + self.model_path.mkdir(parents=True, exist_ok=True) + self.teacher_path = Path("models/teacher") + self.teacher_path.mkdir(parents=True, exist_ok=True) + + (self.task_path / lmp_conf_name).write_text("foo") + (self.task_path / lmp_input_name).write_text(TestRunNvNMDDist.lmp_config) + + self.task_name = "task_000" + self.models = [self.model_path / Path(f"model_{ii}.pb") for ii in range(1)] + for idx, ii in enumerate(self.models): + ii.write_text(f"model{idx}") + + (self.teacher_path / "teacher.pb").write_text("teacher model") + self.teacher_model = BinaryFileInput(self.teacher_path / "teacher.pb", "pb") + + self.maxDiff = None + + def tearDown(self): + if Path("task").is_dir(): + shutil.rmtree("task") + if Path("models").is_dir(): + shutil.rmtree("models") + if Path(self.task_name).is_dir(): + shutil.rmtree(self.task_name) + + @patch("dpgen2.op.run_lmp.run_command") + def test_success(self, mocked_run): + mocked_run.side_effect = [(0, "foo\n", "")] + op = RunNvNMD() + out = op.execute( + OPIO( + { + "config": { + "command": "mylmp", + "teacher_model_path": self.teacher_model, + }, + "task_name": self.task_name, + "task_path": self.task_path, + "models": self.models, + } + ) + ) + work_dir = Path(self.task_name) + + # check input files are correctly linked + self.assertEqual((work_dir / lmp_conf_name).read_text(), "foo") + + lmp_config = TestRunNvNMDDist.lmp_config.replace( + "pair_style deepmd model.000.pb", + "pair_style deepmd model.000.pb model.001.pb", + ) + self.assertEqual((work_dir / lmp_input_name).read_text(), lmp_config) + + # check if the teacher model is linked to model.000.pb + ii = 0 + self.assertEqual( + (work_dir / (model_name_pattern % ii)).read_text(), f"teacher model" + ) + + ii = 1 + self.assertEqual( + (work_dir / (model_name_pattern % ii)).read_text(), f"model{ii - 1}" + ) + + # The number of models have to be 2 in knowledge distillation + 
self.assertEqual(len(list((work_dir.glob("*.pb")))), 2) + + +def swap_element(arg): + bk = arg.copy() + arg[1] = bk[0] + arg[0] = bk[1] + + +class TestSetModels(unittest.TestCase): + def setUp(self): + self.input_name = Path("lmp.input") + self.model_names = ["model.000.pth", "model.001.pb"] + + def tearDown(self): + os.remove(self.input_name) + + def test(self): + lmp_config = "pair_style deepmd model.000.pb model.001.pb out_freq 10 out_file model_devi.out\n" + expected_output = "pair_style deepmd model.000.pth model.001.pb out_freq 10 out_file model_devi.out\n" + input_name = self.input_name + input_name.write_text(lmp_config) + set_models(input_name, self.model_names) + self.assertEqual(input_name.read_text(), expected_output) + + def test_failed(self): + lmp_config = "pair_style deepmd model.000.pb model.001.pb out_freq 10 out_file model_devi.out model.002.pb\n" + input_name = self.input_name + input_name = Path("lmp.input") + input_name.write_text(lmp_config) + with self.assertRaises(RuntimeError) as re: + set_models(input_name, self.model_names) + + def test_failed_no_matching(self): + lmp_config = "pair_style deepmd out_freq 10 out_file model_devi.out\n" + input_name = self.input_name + input_name = Path("lmp.input") + input_name.write_text(lmp_config) + with self.assertRaises(RuntimeError) as re: + set_models(input_name, self.model_names) + + +class TestGetEleTemp(unittest.TestCase): + def test_get_ele_temp_none(self): + with open("log", "w") as f: + f.write( + "pair_style deepmd model.000.pb model.001.pb model.002.pb model.003.pb model.004.pb out_freq 10 out_file model_devi.out" + ) + ele_temp = get_ele_temp("log") + self.assertIsNone(ele_temp) + + def test_get_ele_temp(self): + with open("log", "w") as f: + f.write( + "pair_style deepmd model.000.pb model.001.pb model.002.pb model.003.pb model.004.pb out_freq 10 out_file model_devi.out fparam 6.6" + ) + ele_temp = get_ele_temp("log") + self.assertEqual(ele_temp, 6.6) + + def tearDown(self): + if 
os.path.exists("log"): + os.remove("log") + + +class TestMergePIMDFiles(unittest.TestCase): + def test_merge_pimd_files(self): + for i in range(1, 3): + with open("traj.%s.dump" % i, "w") as f: + f.write( + """ITEM: TIMESTEP +0 +ITEM: NUMBER OF ATOMS +3 +ITEM: BOX BOUNDS xy xz yz pp pp pp +0.0000000000000000e+00 1.2444661140399999e+01 0.0000000000000000e+00 +0.0000000000000000e+00 1.2444661140399999e+01 0.0000000000000000e+00 +0.0000000000000000e+00 1.2444661140399999e+01 0.0000000000000000e+00 +ITEM: ATOMS id type x y z +1 8 7.23489 0.826309 4.61669 +2 1 8.04419 0.520382 5.14395 +3 1 6.48126 0.446895 4.99766 +ITEM: TIMESTEP +10 +ITEM: NUMBER OF ATOMS +3 +ITEM: BOX BOUNDS xy xz yz pp pp pp +0.0000000000000000e+00 1.2444661140399999e+01 0.0000000000000000e+00 +0.0000000000000000e+00 1.2444661140399999e+01 0.0000000000000000e+00 +0.0000000000000000e+00 1.2444661140399999e+01 0.0000000000000000e+00 +ITEM: ATOMS id type x y z +1 8 7.23103 0.814939 4.59892 +2 1 7.96453 0.61699 5.19158 +3 1 6.43661 0.370311 5.09854 +""" + ) + for i in range(1, 3): + with open("model_devi.%s.out" % i, "w") as f: + f.write( + """# step max_devi_v min_devi_v avg_devi_v max_devi_f min_devi_f avg_devi_f + 0 9.023897e-17 3.548771e-17 5.237314e-17 8.196123e-16 1.225653e-16 3.941002e-16 + 10 1.081667e-16 4.141596e-17 7.534462e-17 9.070597e-16 1.067947e-16 4.153524e-16 +""" + ) + + merge_pimd_files() + self.assertTrue(os.path.exists(lmp_traj_name)) + self.assertTrue(os.path.exists(lmp_model_devi_name)) + s = dpdata.System(lmp_traj_name, fmt="lammps/dump") + assert len(s) == 4 + model_devi = np.loadtxt(lmp_model_devi_name) + assert model_devi.shape[0] == 4 + + def tearDown(self): + for f in [ + lmp_traj_name, + "traj.1.dump", + "traj.2.dump", + lmp_model_devi_name, + "model_devi.1.out", + "model_devi.2.out", + ]: + if os.path.exists(f): + os.remove(f) diff --git a/tests/op/test_run_nvnmd_train.py b/tests/op/test_run_nvnmd_train.py new file mode 100644 index 00000000..3386e44e --- /dev/null +++ 
b/tests/op/test_run_nvnmd_train.py @@ -0,0 +1,735 @@ +import itertools +import json +import os +import shutil +import unittest +from pathlib import ( + Path, +) + +import numpy as np +from dflow.python import ( + OP, + OPIO, + Artifact, + FatalError, + OPIOSign, + TransientError, +) +from fake_data_set import ( + fake_multi_sys, + fake_system, +) +from mock import ( + call, + patch, +) + +# isort: off +from .context import ( + dpgen2, +) +from dpgen2.constants import ( + train_script_name, + train_task_pattern, +) +from dpgen2.op.run_nvnmd_train import ( + RunNvNMDTrain, + _get_data_size_of_all_mult_sys, + _make_train_command, +) + +# isort: on + + +class TestRunNvNMDTrain(unittest.TestCase): + def setUp(self): + self.atom_name = "foo" + self.nframes_0 = [2, 5, 3] + self.natoms_0 = [4, 3, 4] + self.nframes_1 = [3, 4, 2] + self.natoms_1 = [5, 3, 2] + ms_0 = fake_multi_sys(self.nframes_0, self.natoms_0, self.atom_name) + ms_1 = fake_multi_sys(self.nframes_1, self.natoms_1, self.atom_name) + ms_0.to_deepmd_npy("data-0") + ms_1.to_deepmd_npy("data-1") + self.iter_data = [Path("data-0"), Path("data-1")] + self.iter_data_exp = [ + "data-0/foo3", + "data-0/foo4", + "data-1/foo2", + "data-1/foo3", + "data-1/foo5", + ] + ms_0.to_deepmd_npy_mixed("mixed-data-0") + ms_1.to_deepmd_npy_mixed("mixed-data-1") + self.mixed_iter_data = [Path("mixed-data-0"), Path("mixed-data-1")] + + self.init_nframs_0 = 3 + self.init_natoms_0 = 5 + self.init_nframs_1 = 4 + self.init_natoms_1 = 2 + ss_0 = fake_system(self.init_nframs_0, self.init_natoms_0, self.atom_name) + ss_1 = fake_system(self.init_nframs_1, self.init_natoms_1, self.atom_name) + ss_0.to_deepmd_npy("init/data-0") + ss_1.to_deepmd_npy("init/data-1") + self.init_data = [Path("init/data-0"), Path("init/data-1")] + self.init_data = sorted(list(self.init_data)) + + self.init_model = Path("bar.pb") + + self.config = { + "init_model_policy": "no", + "init_model_old_ratio": 0.9, + "init_model_numb_steps": 400000, + 
"init_model_start_lr": 1e-4, + "init_model_start_pref_e": 0.1, + "init_model_start_pref_f": 100, + "init_model_start_pref_v": 0.0, + } + self.config = RunNvNMDTrain.normalize_config(self.config) + + self.old_data_size = ( + self.init_nframs_0 + self.init_nframs_1 + sum(self.nframes_0) + ) + self.task_name = "task-000" + self.task_path = "input-000" + + self.idict_v2 = { + "training": { + "training_data": { + "systems": [], + }, + "validation_data": { + "systems": [], + }, + }, + "learning_rate": { + "start_lr": 1.0, + }, + "loss": { + "start_pref_e": 1.0, + "start_pref_f": 1.0, + "start_pref_v": 1.0, + }, + } + self.expected_odict_v2 = { + "training": { + "training_data": { + "systems": [ + "init/data-0", + "init/data-1", + "data-0/foo3", + "data-0/foo4", + "data-1/foo2", + "data-1/foo3", + "data-1/foo5", + ], + "batch_size": "auto", + "auto_prob": "prob_sys_size", + }, + "disp_file": "lcurve.out", + "save_ckpt": "model.ckpt" + }, + "learning_rate": { + "start_lr": 1.0, + }, + "loss": { + "start_pref_e": 1.0, + "start_pref_f": 1.0, + "start_pref_v": 1.0, + }, + } + self.expected_init_model_odict_v2 = { + "training": { + "training_data": { + "systems": [ + "init/data-0", + "init/data-1", + "data-0/foo3", + "data-0/foo4", + "data-1/foo2", + "data-1/foo3", + "data-1/foo5", + ], + "batch_size": "auto", + "auto_prob": "prob_sys_size; 0:4:0.9; 4:7:0.1", + }, + "disp_file": "lcurve.out", + "save_ckpt": "model.ckpt", + "numb_steps": 400000, + }, + "learning_rate": { + "start_lr": 1e-4, + }, + "loss": { + "start_pref_e": 0.1, + "start_pref_f": 100, + "start_pref_v": 0.0, + }, + } + + self.idict_v1 = { + "training": { + "systems": [], + }, + "learning_rate": { + "start_lr": 1.0, + }, + "loss": { + "start_pref_e": 1.0, + "start_pref_f": 1.0, + "start_pref_v": 1.0, + }, + } + self.expected_odict_v1 = { + "training": { + "systems": [ + "init/data-0", + "init/data-1", + "data-0/foo3", + "data-0/foo4", + "data-1/foo2", + "data-1/foo3", + "data-1/foo5", + ], + "batch_size": 
"auto", + "auto_prob_style": "prob_sys_size", + "disp_file": "lcurve.out", + "save_ckpt": "model.ckpt" + }, + "learning_rate": { + "start_lr": 1.0, + }, + "loss": { + "start_pref_e": 1.0, + "start_pref_f": 1.0, + "start_pref_v": 1.0, + }, + } + self.expected_init_model_odict_v1 = { + "training": { + "systems": [ + "init/data-0", + "init/data-1", + "data-0/foo3", + "data-0/foo4", + "data-1/foo2", + "data-1/foo3", + "data-1/foo5", + ], + "batch_size": "auto", + "auto_prob_style": "prob_sys_size; 0:4:0.9; 4:7:0.1", + "disp_file": "lcurve.out", + "save_ckpt": "model.ckpt", + "stop_batch": 400000, + }, + "learning_rate": { + "start_lr": 1e-4, + }, + "loss": { + "start_pref_e": 0.1, + "start_pref_f": 100, + "start_pref_v": 0.0, + }, + } + + def tearDown(self): + for ii in [ + "init", + "data-0", + "data-1", + "mixed-data-0", + "mixed-data-1", + self.task_path, + self.task_name, + ]: + if Path(ii).exists(): + shutil.rmtree(str(ii)) + + def test_normalize_config(self): + config = self.config + self.assertEqual(config["init_model_policy"], "no") + self.assertAlmostEqual(config["init_model_old_ratio"], 0.9) + self.assertEqual(config["init_model_numb_steps"], 400000) + self.assertAlmostEqual(config["init_model_start_lr"], 1e-4) + self.assertAlmostEqual(config["init_model_start_pref_e"], 0.1) + self.assertAlmostEqual(config["init_model_start_pref_f"], 100) + self.assertAlmostEqual(config["init_model_start_pref_v"], 0.0) + + def test_get_size_of_all_mult_sys(self): + cc = _get_data_size_of_all_mult_sys(self.iter_data) + self.assertEqual(cc, sum(self.nframes_0) + sum(self.nframes_1)) + cc = _get_data_size_of_all_mult_sys(self.mixed_iter_data, mixed_type=True) + self.assertEqual(cc, sum(self.nframes_0) + sum(self.nframes_1)) + # read the mixed type systems as if they were standard system, + # should give the correct estimate of the data size + cc = _get_data_size_of_all_mult_sys(self.mixed_iter_data, mixed_type=False) + self.assertEqual(cc, sum(self.nframes_0) + 
sum(self.nframes_1)) + + def test_decide_init_model_no_model(self): + do_init_model = RunNvNMDTrain.decide_init_model( + self.config, None, self.init_data, self.iter_data + ) + self.assertFalse(do_init_model) + + def test_decide_init_model_none_iter_data(self): + do_init_model = RunNvNMDTrain.decide_init_model( + self.config, self.init_model, self.init_data, None + ) + self.assertFalse(do_init_model) + + def test_decide_init_model_no_iter_data(self): + do_init_model = RunNvNMDTrain.decide_init_model( + self.config, self.init_model, self.init_data, [] + ) + self.assertFalse(do_init_model) + + def test_decide_init_model_config_no(self): + config = self.config.copy() + config["init_model_policy"] = "no" + do_init_model = RunNvNMDTrain.decide_init_model( + config, self.init_model, self.init_data, self.iter_data + ) + self.assertFalse(do_init_model) + + def test_decide_init_model_config_yes(self): + config = self.config.copy() + config["init_model_policy"] = "yes" + do_init_model = RunNvNMDTrain.decide_init_model( + config, self.init_model, self.init_data, self.iter_data + ) + self.assertTrue(do_init_model) + + def test_decide_init_model_config_larger_than_no(self): + config = self.config.copy() + config["init_model_policy"] = f"old_data_larger_than:{self.old_data_size}" + do_init_model = RunNvNMDTrain.decide_init_model( + config, self.init_model, self.init_data, self.iter_data + ) + self.assertFalse(do_init_model) + + def test_decide_init_model_config_larger_than_yes(self): + config = self.config.copy() + config["init_model_policy"] = f"old_data_larger_than:{self.old_data_size-1}" + do_init_model = RunNvNMDTrain.decide_init_model( + config, self.init_model, self.init_data, self.iter_data + ) + self.assertTrue(do_init_model) + + def test_update_input_dict_v1_init_model(self): + odict = RunNvNMDTrain.write_data_to_input_script( + self.idict_v1, + self.config, + self.init_data, + self.iter_data_exp, + auto_prob_str="prob_sys_size; 0:4:0.9; 4:7:0.1", + major_version="1", + 
) + config = self.config.copy() + config["init_model_policy"] = "yes" + odict = RunNvNMDTrain.write_other_to_input_script( + odict, config, True, False, major_version="1" + ) + self.assertDictEqual(odict, self.expected_init_model_odict_v1) + + def test_update_input_dict_v1(self): + odict = RunNvNMDTrain.write_data_to_input_script( + self.idict_v1, + self.config, + self.init_data, + self.iter_data_exp, + auto_prob_str="prob_sys_size", + major_version="1", + ) + config = self.config.copy() + config["init_model_policy"] = "no" + odict = RunNvNMDTrain.write_other_to_input_script( + odict, config, False, False, major_version="1" + ) + self.assertDictEqual(odict, self.expected_odict_v1) + + def test_update_input_dict_v2_init_model(self): + idict = self.idict_v2 + odict = RunNvNMDTrain.write_data_to_input_script( + idict, + self.config, + self.init_data, + self.iter_data_exp, + auto_prob_str="prob_sys_size; 0:4:0.9; 4:7:0.1", + major_version="2", + ) + config = self.config.copy() + config["init_model_policy"] = "yes" + odict = RunNvNMDTrain.write_other_to_input_script( + odict, config, True, False, major_version="2" + ) + self.assertDictEqual(odict, self.expected_init_model_odict_v2) + + def test_update_input_dict_v2(self): + idict = self.idict_v2 + odict = RunNvNMDTrain.write_data_to_input_script( + idict, + self.config, + self.init_data, + self.iter_data_exp, + auto_prob_str="prob_sys_size", + major_version="2", + ) + config = self.config.copy() + config["init_model_policy"] = "no" + odict = RunNvNMDTrain.write_other_to_input_script( + odict, config, False, False,major_version="2" + ) + self.assertDictEqual(odict, self.expected_odict_v2) + + @patch("dpgen2.op.run_nvnmd_train.run_command") + def test_exec_v1(self, mocked_run): + mocked_run.side_effect = [(0, "foo\n", ""), (0, "bar\n", "")] + + config = self.config.copy() + config["init_model_policy"] = "no" + + task_path = self.task_path + Path(task_path).mkdir(exist_ok=True) + with open(Path(task_path) / 
train_script_name, "w") as fp: + json.dump(self.idict_v1, fp, indent=4) + task_name = self.task_name + work_dir = Path(task_name) + + ptrain = RunNvNMDTrain() + out = ptrain.execute( + OPIO( + { + "config": config, + "task_name": task_name, + "task_path": Path(task_path), + "init_model": Path(self.init_model), + "init_data": [Path(ii) for ii in self.init_data], + "iter_data": [Path(ii) for ii in self.iter_data], + } + ) + ) + self.assertEqual(out["script"], work_dir / train_script_name) + self.assertEqual(out["cnn_model"], work_dir / "frozen_model.pb") + self.assertEqual(out["lcurve"], work_dir / "lcurve.out") + self.assertEqual(out["log"], work_dir / "train.log") + + calls = [ + call(["dp", "train", train_script_name]), + call(["dp", "freeze", "-o", "frozen_model.pb"]), + ] + mocked_run.assert_has_calls(calls) + + self.assertTrue(work_dir.is_dir()) + self.assertTrue(out["log"].is_file()) + self.assertEqual( + out["log"].read_text(), + "#=================== train std out ===================\n" + "foo\n" + "#=================== train std err ===================\n" + "#=================== freeze std out ===================\n" + "bar\n" + "#=================== freeze std err ===================\n", + ) + with open(out["script"]) as fp: + jdata = json.load(fp) + self.assertDictEqual(jdata, self.expected_odict_v1) + + @patch("dpgen2.op.run_nvnmd_train.run_command") + def test_exec_v2(self, mocked_run): + mocked_run.side_effect = [(0, "foo\n", ""), (0, "bar\n", "")] + + config = self.config.copy() + config["init_model_policy"] = "no" + + task_path = self.task_path + Path(task_path).mkdir(exist_ok=True) + with open(Path(task_path) / train_script_name, "w") as fp: + json.dump(self.idict_v2, fp, indent=4) + task_name = self.task_name + work_dir = Path(task_name) + + ptrain = RunNvNMDTrain() + out = ptrain.execute( + OPIO( + { + "config": config, + "task_name": task_name, + "task_path": Path(task_path), + "init_model": Path(self.init_model), + "init_data": [Path(ii) for ii in 
self.init_data], + "iter_data": [Path(ii) for ii in self.iter_data], + } + ) + ) + self.assertEqual(out["script"], work_dir / train_script_name) + self.assertEqual(out["cnn_model"], work_dir / "frozen_model.pb") + self.assertEqual(out["lcurve"], work_dir / "lcurve.out") + self.assertEqual(out["log"], work_dir / "train.log") + + calls = [ + call(["dp", "train", train_script_name]), + call(["dp", "freeze", "-o", "frozen_model.pb"]), + ] + mocked_run.assert_has_calls(calls) + + self.assertTrue(work_dir.is_dir()) + self.assertTrue(out["log"].is_file()) + self.assertEqual( + out["log"].read_text(), + "#=================== train std out ===================\n" + "foo\n" + "#=================== train std err ===================\n" + "#=================== freeze std out ===================\n" + "bar\n" + "#=================== freeze std err ===================\n", + ) + with open(out["script"]) as fp: + jdata = json.load(fp) + self.assertDictEqual(jdata, self.expected_odict_v2) + + @patch("dpgen2.op.run_nvnmd_train.run_command") + def test_exec_v2_init_model(self, mocked_run): + mocked_run.side_effect = [(0, "foo\n", ""), (0, "bar\n", "")] + + config = self.config.copy() + config["init_model_policy"] = "yes" + + task_path = self.task_path + Path(task_path).mkdir(exist_ok=True) + with open(Path(task_path) / train_script_name, "w") as fp: + json.dump(self.idict_v2, fp, indent=4) + task_name = self.task_name + work_dir = Path(task_name) + + ptrain = RunNvNMDTrain() + out = ptrain.execute( + OPIO( + { + "config": config, + "task_name": task_name, + "task_path": Path(task_path), + "init_model": Path(self.init_model), + "init_data": [Path(ii) for ii in self.init_data], + "iter_data": [Path(ii) for ii in self.iter_data], + } + ) + ) + self.assertEqual(out["script"], work_dir / train_script_name) + self.assertEqual(out["cnn_model"], work_dir / "frozen_model.pb") + self.assertEqual(out["lcurve"], work_dir / "lcurve.out") + self.assertEqual(out["log"], work_dir / "train.log") + + calls = [ + 
call( + [ + "dp", + "train", + "--init-frz-model", + str(self.init_model), + train_script_name, + ] + ), + call(["dp", "freeze", "-o", "frozen_model.pb"]), + ] + mocked_run.assert_has_calls(calls) + + self.assertTrue(work_dir.is_dir()) + self.assertTrue(out["log"].is_file()) + self.assertEqual( + out["log"].read_text(), + "#=================== train std out ===================\n" + "foo\n" + "#=================== train std err ===================\n" + "#=================== freeze std out ===================\n" + "bar\n" + "#=================== freeze std err ===================\n", + ) + with open(out["script"]) as fp: + jdata = json.load(fp) + self.assertDictEqual(jdata, self.expected_init_model_odict_v2) + + @patch("dpgen2.op.run_nvnmd_train.run_command") + def test_exec_v2_train_error(self, mocked_run): + mocked_run.side_effect = [(1, "", "foo\n"), (0, "bar\n", "")] + + config = self.config.copy() + config["init_model_policy"] = "no" + + task_path = self.task_path + Path(task_path).mkdir(exist_ok=True) + with open(Path(task_path) / train_script_name, "w") as fp: + json.dump(self.idict_v2, fp, indent=4) + task_name = self.task_name + work_dir = Path(task_name) + + ptrain = RunNvNMDTrain() + with self.assertRaises(FatalError) as ee: + out = ptrain.execute( + OPIO( + { + "config": config, + "task_name": task_name, + "task_path": Path(task_path), + "init_model": Path(self.init_model), + "init_data": [Path(ii) for ii in self.init_data], + "iter_data": [Path(ii) for ii in self.iter_data], + } + ) + ) + + calls = [ + call(["dp", "train", train_script_name]), + ] + mocked_run.assert_has_calls(calls) + + self.assertTrue(work_dir.is_dir()) + with open(work_dir / train_script_name) as fp: + jdata = json.load(fp) + self.assertDictEqual(jdata, self.expected_odict_v2) + +class TestRunNvNMDTrainNullIterData(unittest.TestCase): + def setUp(self): + self.atom_name = "foo" + self.init_nframs_0 = 3 + self.init_natoms_0 = 5 + self.init_nframs_1 = 4 + self.init_natoms_1 = 2 + ss_0 = 
fake_system(self.init_nframs_0, self.init_natoms_0, self.atom_name) + ss_1 = fake_system(self.init_nframs_1, self.init_natoms_1, self.atom_name) + ss_0.to_deepmd_npy("init/data-0") + ss_1.to_deepmd_npy("init/data-1") + self.init_data = [Path("init/data-0"), Path("init/data-1")] + self.init_data = sorted(list(self.init_data)) + + self.init_model = Path("bar.pb") + + self.config = { + "init_model_policy": "no", + "init_model_old_ratio": 0.9, + "init_model_numb_steps": 400000, + "init_model_start_lr": 1e-4, + "init_model_start_pref_e": 0.1, + "init_model_start_pref_f": 100, + "init_model_start_pref_v": 0.0, + } + self.config = RunNvNMDTrain.normalize_config(self.config) + + self.task_name = "task-000" + self.task_path = "input-000" + + self.idict_v2 = { + "training": { + "training_data": { + "systems": [], + }, + "validation_data": { + "systems": [], + }, + }, + "learning_rate": { + "start_lr": 1.0, + }, + "loss": { + "start_pref_e": 1.0, + "start_pref_f": 1.0, + "start_pref_v": 1.0, + }, + } + self.expected_odict_v2 = { + "training": { + "training_data": { + "systems": ["init/data-0", "init/data-1"], + "batch_size": "auto", + "auto_prob": "prob_sys_size", + }, + "disp_file": "lcurve.out", + "save_ckpt": "model.ckpt" + }, + "learning_rate": { + "start_lr": 1.0, + }, + "loss": { + "start_pref_e": 1.0, + "start_pref_f": 1.0, + "start_pref_v": 1.0, + }, + } + + def tearDown(self): + for ii in ["init", self.task_path, self.task_name, "foo"]: + if Path(ii).exists(): + shutil.rmtree(str(ii)) + + def test_update_input_dict_v2_empty_list(self): + idict = self.idict_v2 + odict = RunNvNMDTrain.write_data_to_input_script( + idict, + self.config, + self.init_data, + [], + auto_prob_str="prob_sys_size", + major_version="2", + ) + config = self.config.copy() + config["init_model_policy"] = "no" + odict = RunNvNMDTrain.write_other_to_input_script( + odict, config, False, False, major_version="2" + ) + self.assertDictEqual(odict, self.expected_odict_v2) + + 
@patch("dpgen2.op.run_nvnmd_train.run_command") + def test_exec_v2_empty_dir(self, mocked_run): + mocked_run.side_effect = [(0, "foo\n", ""), (0, "bar\n", "")] + + config = self.config.copy() + config["init_model_policy"] = "no" + + task_path = self.task_path + Path(task_path).mkdir(exist_ok=True) + with open(Path(task_path) / train_script_name, "w") as fp: + json.dump(self.idict_v2, fp, indent=4) + task_name = self.task_name + work_dir = Path(task_name) + empty_data = Path("foo") + empty_data.mkdir(exist_ok=True) + + ptrain = RunNvNMDTrain() + out = ptrain.execute( + OPIO( + { + "config": config, + "task_name": task_name, + "task_path": Path(task_path), + "init_model": Path(self.init_model), + "init_data": [Path(ii) for ii in self.init_data], + "iter_data": [empty_data], + } + ) + ) + self.assertEqual(out["script"], work_dir / train_script_name) + self.assertEqual(out["model"], work_dir / "frozen_model.pb") + self.assertEqual(out["lcurve"], work_dir / "lcurve.out") + self.assertEqual(out["log"], work_dir / "train.log") + + calls = [ + call(["dp", "train", train_script_name]), + call(["dp", "freeze", "-o", "frozen_model.pb"]), + ] + mocked_run.assert_has_calls(calls) + + self.assertTrue(work_dir.is_dir()) + self.assertTrue(out["log"].is_file()) + self.assertEqual( + out["log"].read_text(), + "#=================== train std out ===================\n" + "foo\n" + "#=================== train std err ===================\n" + "#=================== freeze std out ===================\n" + "bar\n" + "#=================== freeze std err ===================\n", + ) + with open(out["script"]) as fp: + jdata = json.load(fp) + self.assertDictEqual(jdata, self.expected_odict_v2) diff --git a/tests/test_prep_run_lmp.py b/tests/test_prep_run_lmp.py index 3b350240..b070041e 100644 --- a/tests/test_prep_run_lmp.py +++ b/tests/test_prep_run_lmp.py @@ -53,6 +53,7 @@ ) from mocked_ops import ( MockedRunLmp, + MockedRunNvNMD, mocked_numb_models, ) diff --git 
a/tests/test_prep_run_nvnmd_train.py b/tests/test_prep_run_nvnmd_train.py new file mode 100644 index 00000000..202f0cc9 --- /dev/null +++ b/tests/test_prep_run_nvnmd_train.py @@ -0,0 +1,395 @@ +import json +import os +import shutil +import time +import unittest +from pathlib import ( + Path, +) +from typing import ( + List, + Set, +) + +import numpy as np +from dflow import ( + InputArtifact, + InputParameter, + Inputs, + OutputArtifact, + OutputParameter, + Outputs, + S3Artifact, + Step, + Steps, + Workflow, + argo_range, + download_artifact, + upload_artifact, +) +from dflow.python import ( + OP, + OPIO, + Artifact, + OPIOSign, + PythonOPTemplate, +) + +try: + from context import ( + dpgen2, + ) +except ModuleNotFoundError: + # case of upload everything to argo, no context needed + pass +from context import ( + default_host, + default_image, + skip_ut_with_dflow, + skip_ut_with_dflow_reason, + upload_python_packages, +) +from mocked_ops import ( + MockedPrepNvNMDTrain, + MockedRunNvNMDTrain, + MockedRunNvNMDTrainNoneInitModel, + make_mocked_init_data, + make_mocked_init_models, + mocked_numb_models, + mocked_template_script, +) + +from dpgen2.constants import ( + train_task_pattern, +) +from dpgen2.superop.prep_run_nvnmd_train import ( + PrepRunNvNMDTrain, +) +from dpgen2.utils.step_config import normalize as normalize_step_dict + +default_config = normalize_step_dict( + { + "template_config": { + "image": default_image, + } + } +) + + +def _check_log( + tcase, fname, path, script, init_model, init_data, iter_data, only_check_name=False +): + with open(fname) as fp: + lines_ = fp.read().strip().split("\n") + if only_check_name: + lines = [] + for ii in lines_: + ww = ii.split(" ") + ww[1] = str(Path(ww[1]).name) + lines.append(" ".join(ww)) + else: + lines = lines_ + revised_fname = lambda ff: Path(ff).name if only_check_name else Path(ff) + tcase.assertEqual( + lines[0].split(" "), + ["init_model", str(revised_fname(Path(path) / init_model)), "OK"], + ) + for ii 
in range(2): + tcase.assertEqual( + lines[1 + ii].split(" "), + [ + "data", + str(revised_fname(Path(path) / sorted(list(init_data))[ii])), + "OK", + ], + ) + for ii in range(2): + tcase.assertEqual( + lines[3 + ii].split(" "), + [ + "data", + str(revised_fname(Path(path) / sorted(list(iter_data))[ii])), + "OK", + ], + ) + tcase.assertEqual( + lines[5].split(" "), ["script", str(revised_fname(Path(path) / script)), "OK"] + ) + + +def _check_model( + tcase, + fname, + path, + model, +): + with open(fname) as fp: + flines = fp.read().strip().split("\n") + with open(Path(path) / model) as fp: + mlines = fp.read().strip().split("\n") + tcase.assertEqual(flines[0], "read from init model: ") + for ii in range(len(mlines)): + tcase.assertEqual(flines[ii + 1], mlines[ii]) + + +def _check_lcurve( + tcase, + fname, + path, + script, +): + with open(fname) as fp: + flines = fp.read().strip().split("\n") + with open(Path(path) / script) as fp: + mlines = fp.read().strip().split("\n") + tcase.assertEqual(flines[0], "read from train_script: ") + for ii in range(len(mlines)): + tcase.assertEqual(flines[ii + 1], mlines[ii]) + + +def check_run_train_nvnmd_output( + tcase, + work_dir, + script, + init_model, + init_data, + iter_data, + only_check_name=False, +): + cwd = os.getcwd() + os.chdir(work_dir) + _check_log( + tcase, + "log", + cwd, + script, + init_model, + init_data, + iter_data, + only_check_name=only_check_name, + ) + _check_model(tcase, "model.pb", cwd, init_model) + _check_lcurve(tcase, "lcurve.out", cwd, script) + os.chdir(cwd) + + +class TestMockedPrepNvNMDTrain(unittest.TestCase): + def setUp(self): + self.numb_models = mocked_numb_models + self.template_script = mocked_template_script.copy() + self.expected_subdirs = ["task.0000", "task.0001", "task.0002"] + self.expected_train_scripts = [ + Path("task.0000/input.json"), + Path("task.0001/input.json"), + Path("task.0002/input.json"), + ] + + def tearDown(self): + for ii in self.expected_subdirs: + if 
Path(ii).exists(): + shutil.rmtree(ii) + + def test(self): + prep = MockedPrepNvNMDTrain() + ip = OPIO( + { + "template_script": self.template_script, + "numb_models": self.numb_models, + } + ) + op = prep.execute(ip) + # self.assertEqual(self.expected_train_scripts, op["train_scripts"]) + self.assertEqual(self.expected_subdirs, op["task_names"]) + self.assertEqual([Path(ii) for ii in self.expected_subdirs], op["task_paths"]) + + +class TestMockedRunNvNMDTrain(unittest.TestCase): + def setUp(self): + self.numb_models = mocked_numb_models + + self.init_models = make_mocked_init_models(self.numb_models) + + tmp_init_data = make_mocked_init_data() + self.init_data = tmp_init_data + + tmp_iter_data = [Path("iter_data/foo"), Path("iter_data/bar")] + for ii in tmp_iter_data: + ii.mkdir(exist_ok=True, parents=True) + (ii / "a").write_text("data a") + (ii / "b").write_text("data b") + self.iter_data = tmp_iter_data + + self.template_script = mocked_template_script.copy() + + self.task_names = ["task.0000", "task.0001", "task.0002"] + self.task_paths = [Path(ii) for ii in self.task_names] + self.train_scripts = [ + Path("task.0000/input.json"), + Path("task.0001/input.json"), + Path("task.0002/input.json"), + ] + + for ii in range(3): + Path(self.task_names[ii]).mkdir(exist_ok=True, parents=True) + Path(self.train_scripts[ii]).write_text("{}") + + def tearDown(self): + for ii in ["init_data", "iter_data"] + self.task_names: + if Path(ii).exists(): + shutil.rmtree(str(ii)) + for ii in self.init_models: + if Path(ii).exists(): + os.remove(ii) + + def test(self): + for ii in range(3): + run = MockedRunNvNMDTrain() + ip = OPIO( + { + "config": {}, + "task_name": self.task_names[ii], + "task_path": self.task_paths[ii], + "init_model": self.init_models[ii], + "init_data": self.init_data, + "iter_data": self.iter_data, + } + ) + op = run.execute(ip) + self.assertEqual(op["script"], Path(train_task_pattern % ii) / "input.json") + self.assertTrue(op["script"].is_file()) + 
self.assertEqual(op["cnn_model"], Path(train_task_pattern % ii) / "frozen_model.pb") + self.assertEqual(op["qnn_model"], Path(train_task_pattern % ii) / "model.pb") + self.assertEqual(op["log"], Path(train_task_pattern % ii) / "log") + self.assertEqual(op["lcurve"], Path(train_task_pattern % ii) / "lcurve.out") + check_run_train_nvnmd_output( + self, + self.task_names[ii], + self.train_scripts[ii], + self.init_models[ii], + self.init_data, + self.iter_data, + ) + + +@unittest.skipIf(skip_ut_with_dflow, skip_ut_with_dflow_reason) +class TestTrainNvNMD(unittest.TestCase): + def setUp(self): + self.numb_models = mocked_numb_models + + tmp_models = make_mocked_init_models(self.numb_models) + self.init_models = upload_artifact(tmp_models) + self.str_init_models = tmp_models + + tmp_init_data = make_mocked_init_data() + self.init_data = upload_artifact(tmp_init_data) + self.path_init_data = tmp_init_data + + tmp_iter_data = [Path("iter_data/foo"), Path("iter_data/bar")] + for ii in tmp_iter_data: + ii.mkdir(exist_ok=True, parents=True) + (ii / "a").write_text("data a") + (ii / "b").write_text("data b") + self.iter_data = upload_artifact(tmp_iter_data) + self.path_iter_data = tmp_iter_data + + self.template_script = mocked_template_script.copy() + + self.task_names = ["task.0000", "task.0001", "task.0002"] + self.task_paths = [Path(ii) for ii in self.task_names] + self.train_scripts = [ + Path("task.0000/input.json"), + Path("task.0001/input.json"), + Path("task.0002/input.json"), + ] + + def tearDown(self): + for ii in ["init_data", "iter_data"] + self.task_names: + if Path(ii).exists(): + shutil.rmtree(str(ii)) + for ii in self.str_init_models: + if Path(ii).exists(): + os.remove(ii) + + def test_train(self): + steps = PrepRunNvNMDTrain( + "train-steps", + MockedPrepNvNMDTrain, + MockedRunNvNMDTrain, + upload_python_packages=upload_python_packages, + prep_config=default_config, + run_config=default_config, + ) + train_step = Step( + "train-step", + template=steps, + 
parameters={ + "numb_models": self.numb_models, + "template_script": self.template_script, + "train_config": {}, + }, + artifacts={ + "init_models": self.init_models, + "init_data": self.init_data, + "iter_data": self.iter_data, + }, + ) + wf = Workflow(name="dp-train", host=default_host) + wf.add(train_step) + wf.submit() + + while wf.query_status() in ["Pending", "Running"]: + time.sleep(4) + + self.assertEqual(wf.query_status(), "Succeeded") + step = wf.query_step(name="train-step")[0] + self.assertEqual(step.phase, "Succeeded") + + download_artifact(step.outputs.artifacts["scripts"]) + download_artifact(step.outputs.artifacts["models"]) + download_artifact(step.outputs.artifacts["logs"]) + download_artifact(step.outputs.artifacts["lcurves"]) + + for ii in range(3): + check_run_train_nvnmd_output( + self, + self.task_names[ii], + self.train_scripts[ii], + self.str_init_models[ii], + self.path_init_data, + self.path_iter_data, + only_check_name=True, + ) + + def test_train_no_init_model(self): + steps = PrepRunNvNMDTrain( + "train-steps", + MockedPrepNvNMDTrain, + MockedRunNvNMDTrainNoneInitModel, + upload_python_packages=upload_python_packages, + prep_config=default_config, + run_config=default_config, + ) + train_step = Step( + "train-step", + template=steps, + parameters={ + "numb_models": self.numb_models, + "template_script": self.template_script, + "train_config": {}, + }, + artifacts={ + "init_models": None, + "init_data": self.init_data, + "iter_data": self.iter_data, + }, + ) + wf = Workflow(name="dp-train", host=default_host) + wf.add(train_step) + wf.submit() + + while wf.query_status() in ["Pending", "Running"]: + time.sleep(4) + + self.assertEqual(wf.query_status(), "Succeeded") + step = wf.query_step(name="train-step")[0] + self.assertEqual(step.phase, "Succeeded") From af591a4213531a1538e5f6373c410f02c9b7d1eb Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Wed, 4 Jun 2025 15:03:45 +0800 Subject: [PATCH 11/49] add nvnmd test unit --- 
tests/test_prep_run_lmp.py | 1 - tests/test_prep_run_nvnmd.py | 307 +++++++++++++++++++++++++++++ tests/test_prep_run_nvnmd_train.py | 7 +- 3 files changed, 310 insertions(+), 5 deletions(-) create mode 100644 tests/test_prep_run_nvnmd.py diff --git a/tests/test_prep_run_lmp.py b/tests/test_prep_run_lmp.py index b070041e..3b350240 100644 --- a/tests/test_prep_run_lmp.py +++ b/tests/test_prep_run_lmp.py @@ -53,7 +53,6 @@ ) from mocked_ops import ( MockedRunLmp, - MockedRunNvNMD, mocked_numb_models, ) diff --git a/tests/test_prep_run_nvnmd.py b/tests/test_prep_run_nvnmd.py new file mode 100644 index 00000000..b64d1316 --- /dev/null +++ b/tests/test_prep_run_nvnmd.py @@ -0,0 +1,307 @@ +import json +import os +import pickle +import shutil +import time +import unittest +from pathlib import ( + Path, +) +from typing import ( + List, + Set, +) + +import jsonpickle +import numpy as np +from dflow import ( + InputArtifact, + InputParameter, + Inputs, + OutputArtifact, + OutputParameter, + Outputs, + S3Artifact, + Step, + Steps, + Workflow, + argo_range, + download_artifact, + upload_artifact, +) +from dflow.python import ( + OP, + OPIO, + Artifact, + OPIOSign, + PythonOPTemplate, +) + +try: + from context import ( + dpgen2, + ) +except ModuleNotFoundError: + # case of upload everything to argo, no context needed + pass +from context import ( + default_host, + default_image, + skip_ut_with_dflow, + skip_ut_with_dflow_reason, + upload_python_packages, +) +from mocked_ops import ( + MockedRunNvNMD, + mocked_numb_models, +) + +from dpgen2.constants import ( + lmp_conf_name, + lmp_input_name, + lmp_log_name, + lmp_model_devi_name, + lmp_task_pattern, + lmp_traj_name, + model_name_pattern, + train_log_name, + train_script_name, + train_task_pattern, +) +from dpgen2.exploration.task import ( + BaseExplorationTaskGroup, + ExplorationTask, +) +from dpgen2.op.prep_lmp import ( + PrepLmp, +) +from dpgen2.superop.prep_run_lmp import ( + PrepRunLmp, +) +from dpgen2.utils.step_config 
import normalize as normalize_step_dict + +default_config = normalize_step_dict( + { + "template_config": { + "image": default_image, + } + } +) + + +def make_task_group_list(ngrp, ntask_per_grp): + tgrp = BaseExplorationTaskGroup() + for ii in range(ngrp): + for jj in range(ntask_per_grp): + tt = ExplorationTask() + tt.add_file(lmp_conf_name, f"group{ii} task{jj} conf").add_file( + lmp_input_name, f"group{ii} task{jj} input" + ) + tgrp.add_task(tt) + return tgrp + + +def check_lmp_tasks(tcase, ngrp, ntask_per_grp): + cc = 0 + tdirs = [] + for ii in range(ngrp): + for jj in range(ntask_per_grp): + tdir = lmp_task_pattern % cc + tdirs.append(tdir) + tcase.assertTrue(Path(tdir).is_dir()) + fconf = Path(tdir) / lmp_conf_name + finpt = Path(tdir) / lmp_input_name + tcase.assertTrue(fconf.is_file()) + tcase.assertTrue(finpt.is_file()) + tcase.assertEqual(fconf.read_text(), f"group{ii} task{jj} conf") + tcase.assertEqual(finpt.read_text(), f"group{ii} task{jj} input") + cc += 1 + return tdirs + + +class TestPrepLmp(unittest.TestCase): + def setUp(self): + self.ngrp = 2 + self.ntask_per_grp = 3 + self.task_group_list = make_task_group_list(self.ngrp, self.ntask_per_grp) + + def tearDown(self): + for ii in range(self.ngrp * self.ntask_per_grp): + work_path = Path(lmp_task_pattern % ii) + if work_path.is_dir(): + shutil.rmtree(work_path) + + def test(self): + op = PrepLmp() + out = op.execute( + OPIO( + { + "lmp_task_grp": self.task_group_list, + } + ) + ) + tdirs = check_lmp_tasks(self, self.ngrp, self.ntask_per_grp) + tdirs = [str(ii) for ii in tdirs] + + self.assertEqual(tdirs, out["task_names"]) + self.assertEqual(tdirs, [str(ii) for ii in out["task_paths"]]) + + +class TestMockedRunNvNMD(unittest.TestCase): + def setUp(self): + self.ntask = 2 + self.nmodels = 3 + self.task_list = [] + self.model_list = [] + for ii in range(self.ntask): + work_path = Path(lmp_task_pattern % ii) + work_path.mkdir(exist_ok=True, parents=True) + (work_path / 
lmp_conf_name).write_text(f"conf {ii}") + (work_path / lmp_input_name).write_text(f"input {ii}") + self.task_list.append(work_path) + for ii in range(self.nmodels): + model = Path(f"model{ii}.pb") + model.write_text(f"model {ii}") + self.model_list.append(model) + + def check_run_lmp_output( + self, + task_name: str, + models: List[Path], + ): + cwd = os.getcwd() + os.chdir(task_name) + fc = [] + for ii in [lmp_conf_name, lmp_input_name] + [ii.name for ii in models]: + fc.append(Path(ii).read_text()) + self.assertEqual(fc, Path(lmp_log_name).read_text().strip().split("\n")) + self.assertEqual( + f"traj of {task_name}", Path(lmp_traj_name).read_text().split("\n")[0] + ) + self.assertEqual( + f"model_devi of {task_name}", Path(lmp_model_devi_name).read_text() + ) + os.chdir(cwd) + + def tearDown(self): + for ii in range(self.ntask): + work_path = Path(lmp_task_pattern % ii) + if work_path.is_dir(): + shutil.rmtree(work_path) + for ii in range(self.nmodels): + model = Path(f"model{ii}.pb") + if model.is_file(): + os.remove(model) + + def test(self): + self.task_list_str = [str(ii) for ii in self.task_list] + self.model_list_str = [str(ii) for ii in self.model_list] + for ii in range(self.ntask): + ip = OPIO( + { + "task_name": self.task_list_str[ii], + "task_path": self.task_list[ii], + "models": self.model_list, + "config": {}, + } + ) + op = MockedRunNvNMD() + out = op.execute(ip) + self.assertEqual(out["log"], Path(f"task.{ii:06d}") / lmp_log_name) + self.assertEqual(out["traj"], Path(f"task.{ii:06d}") / lmp_traj_name) + self.assertEqual( + out["model_devi"], Path(f"task.{ii:06d}") / lmp_model_devi_name + ) + self.assertTrue(out["log"].is_file()) + self.assertTrue(out["traj"].is_file()) + self.assertTrue(out["model_devi"].is_file()) + self.check_run_lmp_output(self.task_list_str[ii], self.model_list) + + +# @unittest.skip("temp") +@unittest.skipIf(skip_ut_with_dflow, skip_ut_with_dflow_reason) +class TestPrepRunNvNMD(unittest.TestCase): + def setUp(self): + 
self.ngrp = 2 + self.ntask_per_grp = 3 + self.task_group_list = make_task_group_list(self.ngrp, self.ntask_per_grp) + self.nmodels = mocked_numb_models + self.model_list = [] + for ii in range(self.nmodels): + model = Path(f"model{ii}.pb") + model.write_text(f"model {ii}") + self.model_list.append(model) + self.models = upload_artifact(self.model_list) + + def tearDown(self): + for ii in range(self.nmodels): + model = Path(f"model{ii}.pb") + if model.is_file(): + os.remove(model) + for ii in range(self.ngrp * self.ntask_per_grp): + work_path = Path(f"task.{ii:06d}") + if work_path.is_dir(): + shutil.rmtree(work_path) + + def check_run_lmp_output( + self, + task_name: str, + models: List[Path], + ): + cwd = os.getcwd() + os.chdir(task_name) + fc = [] + idx = int(task_name.split(".")[1]) + ii = idx // self.ntask_per_grp + jj = idx - ii * self.ntask_per_grp + fc.append(f"group{ii} task{jj} conf") + fc.append(f"group{ii} task{jj} input") + for ii in [ii.name for ii in models]: + fc.append((Path("..") / Path(ii)).read_text()) + self.assertEqual(fc, Path(lmp_log_name).read_text().strip().split("\n")) + self.assertEqual( + f"traj of {task_name}", Path(lmp_traj_name).read_text().split("\n")[0] + ) + self.assertEqual( + f"model_devi of {task_name}", Path(lmp_model_devi_name).read_text() + ) + os.chdir(cwd) + + def test(self): + steps = PrepRunLmp( + "prep-run-lmp", + PrepLmp, + MockedRunNvNMD, + upload_python_packages=upload_python_packages, + prep_config=default_config, + run_config=default_config, + ) + prep_run_step = Step( + "prep-run-step", + template=steps, + parameters={ + "explore_config": {}, + "expl_task_grp": self.task_group_list, + }, + artifacts={ + "models": self.models, + }, + ) + + wf = Workflow(name="dp-train", host=default_host) + wf.add(prep_run_step) + wf.submit() + + while wf.query_status() in ["Pending", "Running"]: + time.sleep(4) + + self.assertEqual(wf.query_status(), "Succeeded") + step = wf.query_step(name="prep-run-step")[0] + 
self.assertEqual(step.phase, "Succeeded") + + download_artifact(step.outputs.artifacts["model_devis"]) + download_artifact(step.outputs.artifacts["trajs"]) + download_artifact(step.outputs.artifacts["logs"]) + + for ii in step.outputs.parameters["task_names"].value: + self.check_run_lmp_output(ii, self.model_list) diff --git a/tests/test_prep_run_nvnmd_train.py b/tests/test_prep_run_nvnmd_train.py index 202f0cc9..9bbadc21 100644 --- a/tests/test_prep_run_nvnmd_train.py +++ b/tests/test_prep_run_nvnmd_train.py @@ -168,7 +168,7 @@ def check_run_train_nvnmd_output( iter_data, only_check_name=only_check_name, ) - _check_model(tcase, "model.pb", cwd, init_model) + _check_model(tcase, "frozen_model.pb", cwd, init_model) _check_lcurve(tcase, "lcurve.out", cwd, script) os.chdir(cwd) @@ -270,7 +270,6 @@ def test(self): self.iter_data, ) - @unittest.skipIf(skip_ut_with_dflow, skip_ut_with_dflow_reason) class TestTrainNvNMD(unittest.TestCase): def setUp(self): @@ -333,7 +332,7 @@ def test_train(self): "iter_data": self.iter_data, }, ) - wf = Workflow(name="dp-train", host=default_host) + wf = Workflow(name="nvnmd-train", host=default_host) wf.add(train_step) wf.submit() @@ -383,7 +382,7 @@ def test_train_no_init_model(self): "iter_data": self.iter_data, }, ) - wf = Workflow(name="dp-train", host=default_host) + wf = Workflow(name="nvnmd-train", host=default_host) wf.add(train_step) wf.submit() From 2c0ea101ad295dff58b7a01eb436f72ebea7218b Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Fri, 6 Jun 2025 08:10:41 +0800 Subject: [PATCH 12/49] add nvnmd test unit --- dpgen2/op/run_nvnmd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index 9e891ecc..e0cba722 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -410,8 +410,8 @@ def merge_pimd_files(): f.write(f2.read()) def calc_model_devi( - traj_files: list[str], - fname: str = "model_devi.out", + traj_files, + fname = 
"model_devi.out", ): trajectories = [] From 6c69e4eae3442d122450d1e28c9ad2079279c68d Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Fri, 6 Jun 2025 09:21:40 +0800 Subject: [PATCH 13/49] fix test unit --- dpgen2/superop/block.py | 2 +- dpgen2/superop/prep_run_nvnmd_train.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/dpgen2/superop/block.py b/dpgen2/superop/block.py index a9beea07..0e39ab38 100644 --- a/dpgen2/superop/block.py +++ b/dpgen2/superop/block.py @@ -243,7 +243,7 @@ def _block_cl( "type_map": block_steps.inputs.parameters["type_map"], }, artifacts={ - "models": prep_run_dp_train.outputs.artifacts["nvnmodels"], + "models": prep_run_dp_train.outputs.artifacts["models"], }, key="--".join( ["%s" % block_steps.inputs.parameters["block_id"], "prep-run-explore"] diff --git a/dpgen2/superop/prep_run_nvnmd_train.py b/dpgen2/superop/prep_run_nvnmd_train.py index a95d9ba2..50987613 100644 --- a/dpgen2/superop/prep_run_nvnmd_train.py +++ b/dpgen2/superop/prep_run_nvnmd_train.py @@ -88,7 +88,6 @@ def __init__( self._output_artifacts = { "scripts": OutputArtifact(), "models": OutputArtifact(), - "nvnmodels": OutputArtifact(), "logs": OutputArtifact(), "lcurves": OutputArtifact(), } @@ -234,7 +233,7 @@ def _prep_run_nvnmd_train( "script" ] train_steps.outputs.artifacts["models"]._from = run_train.outputs.artifacts["cnn_model"] - train_steps.outputs.artifacts["nvnmodels"]._from = run_train.outputs.artifacts["qnn_model"] + #train_steps.outputs.artifacts["nvnmodels"]._from = run_train.outputs.artifacts["cnn_model"] train_steps.outputs.artifacts["logs"]._from = run_train.outputs.artifacts["log"] train_steps.outputs.artifacts["lcurves"]._from = run_train.outputs.artifacts[ "lcurve" From 77a0e3a1de0d18c16ab6134764227acddd29fbf3 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Fri, 6 Jun 2025 09:42:36 +0800 Subject: [PATCH 14/49] fix test unit --- tests/op/test_run_nvnmd.py | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/tests/op/test_run_nvnmd.py b/tests/op/test_run_nvnmd.py index 44785437..c8cdfe13 100644 --- a/tests/op/test_run_nvnmd.py +++ b/tests/op/test_run_nvnmd.py @@ -67,7 +67,7 @@ def tearDown(self): if Path(self.task_name).is_dir(): shutil.rmtree(self.task_name) - @patch("dpgen2.op.run_lmp.run_command") + @patch("dpgen2.op.run_nvnmd.run_command") def test_success(self, mocked_run): mocked_run.side_effect = [(0, "foo\n", "")] op = RunNvNMD() @@ -102,7 +102,7 @@ def test_success(self, mocked_run): (work_dir / (model_name_pattern % ii)).read_text(), f"model{ii}" ) - @patch("dpgen2.op.run_lmp.run_command") + @patch("dpgen2.op.run_nvnmd.run_command") def test_error(self, mocked_run): mocked_run.side_effect = [(1, "foo\n", "")] op = RunNvNMD() @@ -186,7 +186,7 @@ def tearDown(self): if Path(self.task_name).is_dir(): shutil.rmtree(self.task_name) - @patch("dpgen2.op.run_lmp.run_command") + @patch("dpgen2.op.run_nvnmd.run_command") def test_success(self, mocked_run): mocked_run.side_effect = [(0, "foo\n", "")] op = RunNvNMD() From afecaaddef8952d7b90bf66484f946b0c15d0b3b Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Fri, 6 Jun 2025 10:39:53 +0800 Subject: [PATCH 15/49] fix test unit --- dpgen2/superop/block.py | 5 ++++- tests/op/test_run_nvnmd.py | 16 +++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/dpgen2/superop/block.py b/dpgen2/superop/block.py index 0e39ab38..b9928e8b 100644 --- a/dpgen2/superop/block.py +++ b/dpgen2/superop/block.py @@ -58,6 +58,9 @@ from .prep_run_dp_train import ( PrepRunDPTrain, ) +from .prep_run_nvnmd_train import ( + PrepRunNvNMDTrain, +) from .prep_run_fp import ( PrepRunFp, ) @@ -88,7 +91,7 @@ class ConcurrentLearningBlock(Steps): def __init__( self, name: str, - prep_run_dp_train_op: PrepRunDPTrain, + prep_run_dp_train_op: Union[PrepRunDPTrain, PrepRunNvNMDTrain], prep_run_explore_op: Union[PrepRunLmp, PrepRunCaly, PrepRunDiffCSP], select_confs_op: Type[OP], prep_run_fp_op: 
PrepRunFp, diff --git a/tests/op/test_run_nvnmd.py b/tests/op/test_run_nvnmd.py index c8cdfe13..f6230862 100644 --- a/tests/op/test_run_nvnmd.py +++ b/tests/op/test_run_nvnmd.py @@ -87,9 +87,23 @@ def test_success(self, mocked_run): self.assertEqual(out["traj"], work_dir / lmp_traj_name) self.assertEqual(out["model_devi"], work_dir / lmp_model_devi_name) # check call + models = ["model.%03d.pb" for i in range(len(self.models))] calls = [ call( - " ".join(["mylmp", "-i", lmp_input_name, "-log", lmp_log_name]), + " ; ".join( + [" ".join + ( + [ + "cp", model_name, "model.pb", "&&", + "mylmp", "-i", lmp_input_name, + "-log", lmp_log_name, + "-v", "rerun", "%d"%i, "&&", + "cp", lmp_traj_name, lmp_traj_name+".%d"%i + ] + ) + for i, model_name in enumerate(models) + ] + ), shell=True, ), ] From e542b0a5a326dfa33dbb546f115715a48645a634 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Fri, 6 Jun 2025 10:57:35 +0800 Subject: [PATCH 16/49] fix test unit --- dpgen2/exploration/task/lmp/lmp_input.py | 6 +++--- dpgen2/op/run_nvnmd.py | 2 +- tests/op/test_run_nvnmd.py | 16 +++++++++++++++- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/dpgen2/exploration/task/lmp/lmp_input.py b/dpgen2/exploration/task/lmp/lmp_input.py index 7b120659..777aef3a 100644 --- a/dpgen2/exploration/task/lmp/lmp_input.py +++ b/dpgen2/exploration/task/lmp/lmp_input.py @@ -50,7 +50,7 @@ def make_lmp_input( nopbc: bool = False, max_seed: int = 1000000, deepmd_version="2.0", - nvnmd_version="0.0", + nvnmd_version=None, trj_seperate_files=True, pimd_bead: Optional[str] = None, ): @@ -139,7 +139,7 @@ def make_lmp_input( ret += "\n" ret += "thermo_style custom step temp pe ke etotal press vol lx ly lz xy xz yz\n" ret += "thermo ${THERMO_FREQ}\n" - if trj_seperate_files: + if trj_seperate_files and nvnmd_version is None: ret += "dump 1 all custom ${DUMP_FREQ} traj/*.lammpstrj id type x y z fx fy fz\n" else: lmp_traj_file_name = ( @@ -203,6 +203,6 @@ def make_lmp_input( if(nvnmd_version 
is not None): ret += 'jump SELF end\n' ret += 'label rerun\n' - ret += 'rerun %s.0 dump x y z fx fy fz add yes\n' % lmp_traj_file_name + ret += 'rerun %s.0 dump x y z fx fy fz add yes\n' % lmp_traj_name ret += 'label end\n' return ret diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index e0cba722..18d6d784 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -165,7 +165,7 @@ def execute( Path(mname).symlink_to(mm) except: logging.warning( - "failed to link %s, maybe already linked" % iname + "failed to link %s, maybe already linked" % mname ) pass diff --git a/tests/op/test_run_nvnmd.py b/tests/op/test_run_nvnmd.py index f6230862..0dd1df85 100644 --- a/tests/op/test_run_nvnmd.py +++ b/tests/op/test_run_nvnmd.py @@ -132,9 +132,23 @@ def test_error(self, mocked_run): ) ) # check call + models = ["model.%03d.pb" for i in range(len(self.models))] calls = [ call( - " ".join(["mylmp", "-i", lmp_input_name, "-log", lmp_log_name]), + " ; ".join( + [" ".join + ( + [ + "cp", model_name, "model.pb", "&&", + "mylmp", "-i", lmp_input_name, + "-log", lmp_log_name, + "-v", "rerun", "%d"%i, "&&", + "cp", lmp_traj_name, lmp_traj_name+".%d"%i + ] + ) + for i, model_name in enumerate(models) + ] + ), shell=True, ), ] From ca17fe125dd7fc7bf83eb7e8a53219d1cf8656e3 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Fri, 6 Jun 2025 11:07:29 +0800 Subject: [PATCH 17/49] fix test unit --- dpgen2/op/run_nvnmd.py | 17 +++++++++++--- tests/op/test_run_nvnmd.py | 48 +++++++++++++++++++------------------- 2 files changed, 38 insertions(+), 27 deletions(-) diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index 18d6d784..61470f07 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -185,9 +185,20 @@ def execute( set_models(lmp_input_name, model_names) # run lmp - commands = " ; ".join([" ".join( - ["cp", model_name, "model.pb", "&&", command, "-i", lmp_input_name, "-log", lmp_log_name, "-v", "rerun", "%d"%i, "&&", "cp", 
lmp_traj_name, lmp_traj_name+".%d"%i]) - for i, model_name in enumerate(model_names)]) + commands = " ; ".join( + [ + " ".join( + [ + "cp", model_name, "model.pb", "&&", + "mylmp", "-i", lmp_input_name, + "-log", lmp_log_name, + "-v", "rerun", "%d"%i, "&&", + "cp", lmp_traj_name, lmp_traj_name+".%d"%i + ] + ) + for i, model_name in enumerate(models) + ] + ) ret, out, err = run_command(commands, shell=True) if ret != 0: logging.error( diff --git a/tests/op/test_run_nvnmd.py b/tests/op/test_run_nvnmd.py index 0dd1df85..d5d4613b 100644 --- a/tests/op/test_run_nvnmd.py +++ b/tests/op/test_run_nvnmd.py @@ -87,21 +87,21 @@ def test_success(self, mocked_run): self.assertEqual(out["traj"], work_dir / lmp_traj_name) self.assertEqual(out["model_devi"], work_dir / lmp_model_devi_name) # check call - models = ["model.%03d.pb" for i in range(len(self.models))] + models = ["model.%03d.pb"%i for i in range(len(self.models))] calls = [ call( " ; ".join( - [" ".join - ( - [ - "cp", model_name, "model.pb", "&&", - "mylmp", "-i", lmp_input_name, - "-log", lmp_log_name, - "-v", "rerun", "%d"%i, "&&", - "cp", lmp_traj_name, lmp_traj_name+".%d"%i - ] - ) - for i, model_name in enumerate(models) + [ + " ".join( + [ + "cp", model_name, "model.pb", "&&", + "mylmp", "-i", lmp_input_name, + "-log", lmp_log_name, + "-v", "rerun", "%d"%i, "&&", + "cp", lmp_traj_name, lmp_traj_name+".%d"%i + ] + ) + for i, model_name in enumerate(models) ] ), shell=True, @@ -132,21 +132,21 @@ def test_error(self, mocked_run): ) ) # check call - models = ["model.%03d.pb" for i in range(len(self.models))] + models = ["model.%03d.pb"%i for i in range(len(self.models))] calls = [ call( " ; ".join( - [" ".join - ( - [ - "cp", model_name, "model.pb", "&&", - "mylmp", "-i", lmp_input_name, - "-log", lmp_log_name, - "-v", "rerun", "%d"%i, "&&", - "cp", lmp_traj_name, lmp_traj_name+".%d"%i - ] - ) - for i, model_name in enumerate(models) + [ + " ".join( + [ + "cp", model_name, "model.pb", "&&", + "mylmp", "-i", 
lmp_input_name, + "-log", lmp_log_name, + "-v", "rerun", "%d"%i, "&&", + "cp", lmp_traj_name, lmp_traj_name+".%d"%i + ] + ) + for i, model_name in enumerate(models) ] ), shell=True, From 694889dcb8f9bb1bf47f288d9f0375c8d00b3ec2 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Fri, 6 Jun 2025 11:12:09 +0800 Subject: [PATCH 18/49] fix test unit --- dpgen2/op/run_nvnmd.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index 61470f07..99af3d51 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -186,19 +186,19 @@ def execute( # run lmp commands = " ; ".join( - [ - " ".join( - [ - "cp", model_name, "model.pb", "&&", - "mylmp", "-i", lmp_input_name, - "-log", lmp_log_name, - "-v", "rerun", "%d"%i, "&&", - "cp", lmp_traj_name, lmp_traj_name+".%d"%i - ] - ) - for i, model_name in enumerate(models) - ] - ) + [ + " ".join( + [ + "cp", model_name, "model.pb", "&&", + "mylmp", "-i", lmp_input_name, + "-log", lmp_log_name, + "-v", "rerun", "%d"%i, "&&", + "cp", lmp_traj_name, lmp_traj_name+".%d"%i + ] + ) + for i, model_name in enumerate(models) + ] + ) ret, out, err = run_command(commands, shell=True) if ret != 0: logging.error( From daae4598fc35d15d7b98861068999086975c4b88 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Fri, 6 Jun 2025 11:27:47 +0800 Subject: [PATCH 19/49] fix run_nvnmd --- dpgen2/op/run_nvnmd.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index 99af3d51..9da27a4c 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -189,10 +189,12 @@ def execute( [ " ".join( [ - "cp", model_name, "model.pb", "&&", + "cp", str(model_name), "model.pb", + "&&", "mylmp", "-i", lmp_input_name, "-log", lmp_log_name, - "-v", "rerun", "%d"%i, "&&", + "-v", "rerun", "%d"%i, + "&&", "cp", lmp_traj_name, lmp_traj_name+".%d"%i ] ) From 
26ab174d6ce513f0180f85660918c423c33dc259 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Fri, 6 Jun 2025 11:33:13 +0800 Subject: [PATCH 20/49] fix test unit --- tests/op/test_run_nvnmd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/op/test_run_nvnmd.py b/tests/op/test_run_nvnmd.py index d5d4613b..fedc164b 100644 --- a/tests/op/test_run_nvnmd.py +++ b/tests/op/test_run_nvnmd.py @@ -87,7 +87,7 @@ def test_success(self, mocked_run): self.assertEqual(out["traj"], work_dir / lmp_traj_name) self.assertEqual(out["model_devi"], work_dir / lmp_model_devi_name) # check call - models = ["model.%03d.pb"%i for i in range(len(self.models))] + models = ["models/path/model_%d.pb"%i for i in range(len(self.models))] calls = [ call( " ; ".join( @@ -132,7 +132,7 @@ def test_error(self, mocked_run): ) ) # check call - models = ["model.%03d.pb"%i for i in range(len(self.models))] + models = ["models/path/model_%d.pb"%i for i in range(len(self.models))] calls = [ call( " ; ".join( From c4e3f461d363a266ab48640f6321519f6ded888d Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Fri, 6 Jun 2025 13:21:48 +0800 Subject: [PATCH 21/49] fix run nvnmd --- dpgen2/entrypoint/submit.py | 2 +- dpgen2/op/run_nvnmd.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dpgen2/entrypoint/submit.py b/dpgen2/entrypoint/submit.py index 7019bdf2..cb91c40c 100644 --- a/dpgen2/entrypoint/submit.py +++ b/dpgen2/entrypoint/submit.py @@ -212,7 +212,7 @@ def make_concurrent_learning_op( prep_run_explore_op = PrepRunLmp( "prep-run-nvnmd", PrepLmp, - RunNvNMD, + RunNvNMD, prep_config=prep_explore_config, run_config=run_explore_config, upload_python_packages=upload_python_packages, diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index 9da27a4c..8c893e4d 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -230,8 +230,9 @@ def execute( with open("job.json", "w") as f: json.dump(data, f, indent=4) merge_pimd_files() - 
- calc_model_devi([lmp_traj_name+f".{i}" for i in range(len(model_names))]) + + if os.path.exists(lmp_traj_name): + calc_model_devi([lmp_traj_name+f".{i}" for i in range(len(model_names))]) ret_dict = { "log": work_dir / lmp_log_name, From a89bd704eadd376a5586465e9b21a08c83884b46 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Fri, 6 Jun 2025 14:26:02 +0800 Subject: [PATCH 22/49] fix test unit --- dpgen2/op/run_nvnmd_train.py | 15 +++--- tests/op/test_run_nvnmd_train.py | 87 +++++++++++++++++--------------- 2 files changed, 55 insertions(+), 47 deletions(-) diff --git a/dpgen2/op/run_nvnmd_train.py b/dpgen2/op/run_nvnmd_train.py index 283d52b3..23ae625a 100644 --- a/dpgen2/op/run_nvnmd_train.py +++ b/dpgen2/op/run_nvnmd_train.py @@ -74,7 +74,7 @@ def _make_train_command( command = dp_command + [ "train-nvnmd", init_flag, - init_model, + str(init_model), train_script_name, ] else: @@ -268,15 +268,17 @@ def clean_before_quit(): ) ) raise FatalError("dp train-nvnmd -s s1 failed") - fplog.write("#=================== train std out ===================\n") + fplog.write("#=================== train_cnn std out ===================\n") fplog.write(out) - fplog.write("#=================== train std err ===================\n") + fplog.write("#=================== train_cnn std err ===================\n") fplog.write(err) cnn_model_file = "nvnmd_cnn/frozen_model.pb" + lcurve_file = "nvnmd_cnn/lcurve.out" else: cnn_model_file = init_model + lcurve_file = "nvnmd_qnn/lcurve.out" # train qnn model command = _make_train_command( @@ -304,13 +306,12 @@ def clean_before_quit(): ) ) raise FatalError("dp train-nvnmd -s s2 failed") - fplog.write("#=================== train std out ===================\n") + fplog.write("#=================== train_qnn std out ===================\n") fplog.write(out) - fplog.write("#=================== train std err ===================\n") + fplog.write("#=================== train_qnn std err ===================\n") fplog.write(err) qnn_model_file 
= "nvnmd_qnn/model.pb" - lcurve_file = "nvnmd_qnn/lcurve.out" if os.path.exists("input_v2_compat.json"): shutil.copy2("input_v2_compat.json", train_script_name) @@ -319,7 +320,7 @@ def clean_before_quit(): return OPIO( { - "script": work_dir / train_script_name, + "script": work_dir / train_cnn_script_name, "cnn_model": work_dir / cnn_model_file, "qnn_model": work_dir / qnn_model_file, "lcurve": work_dir / lcurve_file, diff --git a/tests/op/test_run_nvnmd_train.py b/tests/op/test_run_nvnmd_train.py index 3386e44e..c88f8c0a 100644 --- a/tests/op/test_run_nvnmd_train.py +++ b/tests/op/test_run_nvnmd_train.py @@ -31,6 +31,8 @@ ) from dpgen2.constants import ( train_script_name, + train_cnn_script_name, + train_qnn_script_name, train_task_pattern, ) from dpgen2.op.run_nvnmd_train import ( @@ -410,14 +412,15 @@ def test_exec_v1(self, mocked_run): } ) ) - self.assertEqual(out["script"], work_dir / train_script_name) - self.assertEqual(out["model"], work_dir / "frozen_model.pb") - self.assertEqual(out["lcurve"], work_dir / "lcurve.out") + self.assertEqual(out["script"], work_dir / train_cnn_script_name) + self.assertEqual(out["cnn_model"], work_dir / "nvnmd_cnn/frozen_model.pb") + self.assertEqual(out["qnn_model"], work_dir / "nvnmd_qnn/model.pb") + self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") self.assertEqual(out["log"], work_dir / "train.log") calls = [ - call(["dp", "train", train_script_name]), - call(["dp", "freeze", "-o", "frozen_model.pb"]), + call(["dp", "train-nvnmd", train_cnn_script_name, "-s", "s1"]), + call(["dp", "train-nvnmd", train_qnn_script_name, "-s", "s2"]) ] mocked_run.assert_has_calls(calls) @@ -425,12 +428,12 @@ def test_exec_v1(self, mocked_run): self.assertTrue(out["log"].is_file()) self.assertEqual( out["log"].read_text(), - "#=================== train std out ===================\n" + "#=================== train_cnn std out ===================\n" "foo\n" - "#=================== train std err ===================\n" - 
"#=================== freeze std out ===================\n" + "#=================== train_cnn std err ===================\n" + "#=================== train_qnn std out ===================\n" "bar\n" - "#=================== freeze std err ===================\n", + "#=================== train_qnn std err ===================\n", ) with open(out["script"]) as fp: jdata = json.load(fp) @@ -463,14 +466,15 @@ def test_exec_v2(self, mocked_run): } ) ) - self.assertEqual(out["script"], work_dir / train_script_name) - self.assertEqual(out["model"], work_dir / "frozen_model.pb") - self.assertEqual(out["lcurve"], work_dir / "lcurve.out") + self.assertEqual(out["script"], work_dir / train_cnn_script_name) + self.assertEqual(out["cnn_model"], work_dir / "nvnmd_cnn/frozen_model.pb") + self.assertEqual(out["qnn_model"], work_dir / "nvnmd_qnn/model.pb") + self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") self.assertEqual(out["log"], work_dir / "train.log") calls = [ - call(["dp", "train", train_script_name]), - call(["dp", "freeze", "-o", "frozen_model.pb"]), + call(["dp", "train-nvnmd", train_cnn_script_name, "-s", "s1"]), + call(["dp", "train-nvnmd", train_qnn_script_name, "-s", "s2"]) ] mocked_run.assert_has_calls(calls) @@ -478,12 +482,12 @@ def test_exec_v2(self, mocked_run): self.assertTrue(out["log"].is_file()) self.assertEqual( out["log"].read_text(), - "#=================== train std out ===================\n" + "#=================== train_cnn std out ===================\n" "foo\n" - "#=================== train std err ===================\n" - "#=================== freeze std out ===================\n" + "#=================== train_cnn std err ===================\n" + "#=================== train_qnn std out ===================\n" "bar\n" - "#=================== freeze std err ===================\n", + "#=================== train_qnn std err ===================\n", ) with open(out["script"]) as fp: jdata = json.load(fp) @@ -516,22 +520,24 @@ def 
test_exec_v2_init_model(self, mocked_run): } ) ) - self.assertEqual(out["script"], work_dir / train_script_name) - self.assertEqual(out["model"], work_dir / "frozen_model.pb") - self.assertEqual(out["lcurve"], work_dir / "lcurve.out") + self.assertEqual(out["script"], work_dir / train_cnn_script_name) + self.assertEqual(out["cnn_model"], work_dir / "nvnmd_cnn/frozen_model.pb") + self.assertEqual(out["qnn_model"], work_dir / "nvnmd_qnn/model.pb") + self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") self.assertEqual(out["log"], work_dir / "train.log") calls = [ call( [ "dp", - "train", + "train-nvnmd", "--init-frz-model", str(self.init_model), - train_script_name, + train_cnn_script_name, + "-s", + "s1" ] - ), - call(["dp", "freeze", "-o", "frozen_model.pb"]), + ) ] mocked_run.assert_has_calls(calls) @@ -539,12 +545,12 @@ def test_exec_v2_init_model(self, mocked_run): self.assertTrue(out["log"].is_file()) self.assertEqual( out["log"].read_text(), - "#=================== train std out ===================\n" + "#=================== train_cnn std out ===================\n" "foo\n" - "#=================== train std err ===================\n" - "#=================== freeze std out ===================\n" + "#=================== train_cnn std err ===================\n" + "#=================== train_qnn std out ===================\n" "bar\n" - "#=================== freeze std err ===================\n", + "#=================== train_qnn std err ===================\n", ) with open(out["script"]) as fp: jdata = json.load(fp) @@ -580,12 +586,12 @@ def test_exec_v2_train_error(self, mocked_run): ) calls = [ - call(["dp", "train", train_script_name]), + call(["dp", "train-nvnmd", train_cnn_script_name, "-s", "s1"]), ] mocked_run.assert_has_calls(calls) self.assertTrue(work_dir.is_dir()) - with open(work_dir / train_script_name) as fp: + with open(work_dir / train_cnn_script_name) as fp: jdata = json.load(fp) self.assertDictEqual(jdata, self.expected_odict_v2) @@ 
-708,14 +714,15 @@ def test_exec_v2_empty_dir(self, mocked_run): } ) ) - self.assertEqual(out["script"], work_dir / train_script_name) - self.assertEqual(out["model"], work_dir / "frozen_model.pb") - self.assertEqual(out["lcurve"], work_dir / "lcurve.out") + self.assertEqual(out["script"], work_dir / train_cnn_script_name) + self.assertEqual(out["cnn_model"], work_dir / "nvnmd_cnn/frozen_model.pb") + self.assertEqual(out["qnn_model"], work_dir / "nvnmd_qnn/model.pb") + self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") self.assertEqual(out["log"], work_dir / "train.log") calls = [ - call(["dp", "train", train_script_name]), - call(["dp", "freeze", "-o", "frozen_model.pb"]), + call(["dp", "train-nvnmd", train_cnn_script_name, "-s", "s1"]), + call(["dp", "train-nvnmd", train_qnn_script_name, "-s", "s2"]) ] mocked_run.assert_has_calls(calls) @@ -723,12 +730,12 @@ def test_exec_v2_empty_dir(self, mocked_run): self.assertTrue(out["log"].is_file()) self.assertEqual( out["log"].read_text(), - "#=================== train std out ===================\n" + "#=================== train_cnn std out ===================\n" "foo\n" - "#=================== train std err ===================\n" - "#=================== freeze std out ===================\n" + "#=================== train_cnn std err ===================\n" + "#=================== train_qnn std out ===================\n" "bar\n" - "#=================== freeze std err ===================\n", + "#=================== train_qnn std err ===================\n", ) with open(out["script"]) as fp: jdata = json.load(fp) From 2aa102bc10af1e8ca871f02a39652946370d6c55 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Tue, 10 Jun 2025 02:22:42 +0800 Subject: [PATCH 23/49] support init model from model.ckpt in dp-nvnmd train --- dpgen2/entrypoint/submit.py | 3 + dpgen2/flow/dpgen_loop.py | 162 ++++++++++++++++++++----- dpgen2/op/run_nvnmd_train.py | 44 ++++++- dpgen2/superop/block.py | 84 ++++++++++--- 
dpgen2/superop/prep_run_nvnmd_train.py | 23 +++- 5 files changed, 253 insertions(+), 63 deletions(-) diff --git a/dpgen2/entrypoint/submit.py b/dpgen2/entrypoint/submit.py index cb91c40c..c35c123d 100644 --- a/dpgen2/entrypoint/submit.py +++ b/dpgen2/entrypoint/submit.py @@ -699,6 +699,9 @@ def workflow_concurrent_learning( }, artifacts={ "init_models": init_models, + "init_models_ckpt_meta": None, + "init_models_ckpt_index": None, + "init_models_ckpt_data": None, "init_data": init_data, "iter_data": iter_data, }, diff --git a/dpgen2/flow/dpgen_loop.py b/dpgen2/flow/dpgen_loop.py index 190a1090..f1b6ac06 100644 --- a/dpgen2/flow/dpgen_loop.py +++ b/dpgen2/flow/dpgen_loop.py @@ -186,6 +186,9 @@ def __init__( } self._input_artifacts = { "init_models": InputArtifact(optional=True), + "init_models_ckpt_meta": InputArtifact(optional=True), + "init_models_ckpt_data": InputArtifact(optional=True), + "init_models_ckpt_index": InputArtifact(optional=True), "init_data": InputArtifact(), "iter_data": InputArtifact(), } @@ -194,6 +197,9 @@ def __init__( } self._output_artifacts = { "models": OutputArtifact(), + "models_ckpt_meta": OutputArtifact(optional=True), + "models_ckpt_data": OutputArtifact(optional=True), + "models_ckpt_index": OutputArtifact(optional=True), "iter_data": OutputArtifact(), } @@ -278,6 +284,9 @@ def __init__( self._input_artifacts = { "init_models": InputArtifact(optional=True), "init_data": InputArtifact(), + "init_models_ckpt_meta": InputArtifact(optional=True), + "init_models_ckpt_data": InputArtifact(optional=True), + "init_models_ckpt_index": InputArtifact(optional=True), "iter_data": InputArtifact(), } self._output_parameters = { @@ -285,6 +294,9 @@ def __init__( } self._output_artifacts = { "models": OutputArtifact(), + "models_ckpt_meta": OutputArtifact(optional=True), + "models_ckpt_data": OutputArtifact(optional=True), + "models_ckpt_index": OutputArtifact(optional=True), "iter_data": OutputArtifact(), } @@ -368,17 +380,35 @@ def _loop( 
"explore_config": steps.inputs.parameters["explore_config"], "expl_task_grp": steps.inputs.parameters["expl_task_grp"], } - block_step = Step( - name=name + "-block", - template=block_op, - parameters=block_common_parameters, - artifacts={ - "init_models": steps.inputs.artifacts["init_models"], - "init_data": steps.inputs.artifacts["init_data"], - "iter_data": steps.inputs.artifacts["iter_data"], - }, - key=step_keys["block"], - ) + if hasattr( steps.inputs.artifacts["init_models_ckpt_meta"], "local_path" ) and \ + hasattr( steps.inputs.artifacts["init_models_ckpt_data"], "local_path" ) and \ + hasattr( steps.inputs.artifacts["init_models_ckpt_index"], "local_path" ): + block_step = Step( + name=name + "-block", + template=block_op, + parameters=block_common_parameters, + artifacts={ + "init_models": steps.inputs.artifacts["init_models"], + "init_models_ckpt_meta": steps.inputs.artifacts["init_models_ckpt_meta"], + "init_models_ckpt_index": steps.inputs.artifacts["init_models_ckpt_index"], + "init_models_ckpt_data": steps.inputs.artifacts["init_models_ckpt_data"], + "init_data": steps.inputs.artifacts["init_data"], + "iter_data": steps.inputs.artifacts["iter_data"], + }, + key=step_keys["block"], + ) + else: + block_step = Step( + name=name + "-block", + template=block_op, + parameters=block_common_parameters, + artifacts={ + "init_models": steps.inputs.artifacts["init_models"], + "init_data": steps.inputs.artifacts["init_data"], + "iter_data": steps.inputs.artifacts["iter_data"], + }, + key=step_keys["block"], + ) steps.add(block_step) scheduler_step = Step( @@ -440,17 +470,35 @@ def _loop( ), "expl_task_grp": scheduler_step.outputs.parameters["expl_task_grp"], } - next_step = Step( - name=name + "-next", - template=steps, - parameters=next_common_parameters, - artifacts={ - "init_models": block_step.outputs.artifacts["models"], - "init_data": steps.inputs.artifacts["init_data"], - "iter_data": block_step.outputs.artifacts["iter_data"], - }, - when="%s == false" % 
(scheduler_step.outputs.parameters["converged"]), - ) + if hasattr( block_step.outputs.artifacts["models_ckpt_meta"], "local_path" ) and \ + hasattr( block_step.outputs.artifacts["models_ckpt_data"], "local_path" ) and \ + hasattr( block_step.outputs.artifacts["models_ckpt_index"], "local_path" ): + next_step = Step( + name=name + "-next", + template=steps, + parameters=next_common_parameters, + artifacts={ + "init_models": block_step.outputs.artifacts["models"], + "init_models_ckpt_meta": block_step.outputs.artifacts["models_ckpt_meta"], + "init_models_ckpt_index": block_step.outputs.artifacts["models_ckpt_index"], + "init_models_ckpt_data": block_step.outputs.artifacts["models_ckpt_data"], + "init_data": steps.inputs.artifacts["init_data"], + "iter_data": block_step.outputs.artifacts["iter_data"], + }, + when="%s == false" % (scheduler_step.outputs.parameters["converged"]), + ) + else: + next_step = Step( + name=name + "-next", + template=steps, + parameters=next_common_parameters, + artifacts={ + "init_models": block_step.outputs.artifacts["models"], + "init_data": steps.inputs.artifacts["init_data"], + "iter_data": block_step.outputs.artifacts["iter_data"], + }, + when="%s == false" % (scheduler_step.outputs.parameters["converged"]), + ) steps.add(next_step) steps.outputs.parameters[ @@ -465,6 +513,30 @@ def _loop( _then=block_step.outputs.artifacts["models"], _else=next_step.outputs.artifacts["models"], ) + if ( hasattr(block_step.outputs.artifacts["models_ckpt_meta"], "local_path") and + hasattr(next_step.outputs.artifacts["models_ckpt_meta"], "local_path") + ): + steps.outputs.artifacts["models_ckpt_meta"].from_expression = if_expression( + _if=(scheduler_step.outputs.parameters["converged"] == True), + _then=block_step.outputs.artifacts["models_ckpt_meta"], + _else=next_step.outputs.artifacts["models_ckpt_meta"], + ) + if ( hasattr(block_step.outputs.artifacts["models_ckpt_data"], "local_path") and + hasattr(next_step.outputs.artifacts["models_ckpt_data"], 
"local_path") + ): + steps.outputs.artifacts["models_ckpt_data"].from_expression = if_expression( + _if=(scheduler_step.outputs.parameters["converged"] == True), + _then=block_step.outputs.artifacts["models_ckpt_data"], + _else=next_step.outputs.artifacts["models_ckpt_data"], + ) + if ( hasattr(block_step.outputs.artifacts["models_ckpt_index"], "local_path") and + hasattr(next_step.outputs.artifacts["models_ckpt_index"], "local_path") + ): + steps.outputs.artifacts["models_ckpt_index"].from_expression = if_expression( + _if=(scheduler_step.outputs.parameters["converged"] == True), + _then=block_step.outputs.artifacts["models_ckpt_index"], + _else=next_step.outputs.artifacts["models_ckpt_index"], + ) steps.outputs.artifacts["iter_data"].from_expression = if_expression( _if=(scheduler_step.outputs.parameters["converged"] == True), _then=block_step.outputs.artifacts["iter_data"], @@ -544,23 +616,47 @@ def _dpgen( "optional_parameter": steps.inputs.parameters["optional_parameter"], "expl_task_grp": scheduler_step.outputs.parameters["expl_task_grp"], } - loop_step = Step( - name=name + "-loop", - template=loop_op, - parameters=common_parameters, - artifacts={ - "init_models": steps.inputs.artifacts["init_models"], - "init_data": steps.inputs.artifacts["init_data"], - "iter_data": steps.inputs.artifacts["iter_data"], - }, - key="--".join(["%s" % id_step.outputs.parameters["block_id"], loop_key]), - ) + if hasattr( steps.inputs.artifacts["init_models_ckpt_meta"], "local_path" ) and \ + hasattr( steps.inputs.artifacts["init_models_ckpt_data"], "local_path" ) and \ + hasattr( steps.inputs.artifacts["init_models_ckpt_index"], "local_path" ): + loop_step = Step( + name=name + "-loop", + template=loop_op, + parameters=common_parameters, + artifacts={ + "init_models": steps.inputs.artifacts["init_models"], + "init_models_ckpt_meta": steps.inputs.artifacts["init_models_ckpt_meta"], + "init_models_ckpt_data": steps.inputs.artifacts["init_models_ckpt_data"], + 
"init_models_ckpt_index": steps.inputs.artifacts["init_models_ckpt_index"], + "init_data": steps.inputs.artifacts["init_data"], + "iter_data": steps.inputs.artifacts["iter_data"], + }, + key="--".join(["%s" % id_step.outputs.parameters["block_id"], loop_key]), + ) + else: + loop_step = Step( + name=name + "-loop", + template=loop_op, + parameters=common_parameters, + artifacts={ + "init_models": steps.inputs.artifacts["init_models"], + "init_data": steps.inputs.artifacts["init_data"], + "iter_data": steps.inputs.artifacts["iter_data"], + }, + key="--".join(["%s" % id_step.outputs.parameters["block_id"], loop_key]), + ) steps.add(loop_step) steps.outputs.parameters[ "exploration_scheduler" ].value_from_parameter = loop_step.outputs.parameters["exploration_scheduler"] steps.outputs.artifacts["models"]._from = loop_step.outputs.artifacts["models"] + if hasattr(loop_step.outputs.artifacts["models_ckpt_meta"], "local_path"): + steps.outputs.artifacts["models_ckpt_meta"]._from = loop_step.outputs.artifacts["models_ckpt_meta"] + if hasattr(loop_step.outputs.artifacts["models_ckpt_data"], "local_path"): + steps.outputs.artifacts["models_ckpt_data"]._from = loop_step.outputs.artifacts["models_ckpt_data"] + if hasattr(loop_step.outputs.artifacts["models_ckpt_index"], "local_path"): + steps.outputs.artifacts["models_ckpt_index"]._from = loop_step.outputs.artifacts["models_ckpt_index"] steps.outputs.artifacts["iter_data"]._from = loop_step.outputs.artifacts[ "iter_data" ] diff --git a/dpgen2/op/run_nvnmd_train.py b/dpgen2/op/run_nvnmd_train.py index 23ae625a..abd05f59 100644 --- a/dpgen2/op/run_nvnmd_train.py +++ b/dpgen2/op/run_nvnmd_train.py @@ -61,6 +61,7 @@ def _make_train_command( checkpoint = "nvnmd_cnn/model.ckpt" else: checkpoint = None + # case of restart if checkpoint is not None: command = dp_command + ["train-nvnmd", "--restart", checkpoint, train_script_name] @@ -70,7 +71,15 @@ def _make_train_command( assert checkpoint is None case_init_model = do_init_model if 
case_init_model: - init_flag = "--init-frz-model" + + if isinstance(init_model, list): # initialize from model.ckpt + for i in init_model: + shutil.copy(i, "./") + init_model = "model.ckpt" + init_flag = "--init-imodel" + else: # initialize from frozen model + init_flag = "--init-frz-model" + command = dp_command + [ "train-nvnmd", init_flag, @@ -110,6 +119,9 @@ def get_input_sign(cls): ), "task_path": Artifact(Path), "init_model": Artifact(Path, optional=True), + "init_model_ckpt_meta": Artifact(Path, optional=True), + "init_model_ckpt_data": Artifact(Path, optional=True), + "init_model_ckpt_index": Artifact(Path, optional=True), "init_data": Artifact(NestedDict[Path]), "iter_data": Artifact(List[Path]), "valid_data": Artifact(NestedDict[Path], optional=True), @@ -124,6 +136,9 @@ def get_output_sign(cls): "script": Artifact(Path), "cnn_model": Artifact(Path), "qnn_model": Artifact(Path), + "model_ckpt_data": Artifact(Path), + "model_ckpt_meta": Artifact(Path), + "model_ckpt_index": Artifact(Path), "lcurve": Artifact(Path), "log": Artifact(Path), } @@ -145,8 +160,13 @@ def execute( - `task_name`: (`str`) The name of training task. - `task_path`: (`Artifact(Path)`) The path that contains all input files prepareed by `PrepDPTrain`. - `init_model`: (`Artifact(Path)`) A frozen model to initialize the training. + - `init_model_ckpt_meta`: (`Artifact(Path)`, optional) The meta file of the frozen model. + - `init_model_ckpt_data`: (`Artifact(Path)`, optional) The data file of the frozen model. + - `init_model_ckpt_index`: (`Artifact(Path)`, optional) The index file of the frozen model. - `init_data`: (`Artifact(NestedDict[Path])`) Initial training data. - `iter_data`: (`Artifact(List[Path])`) Training data generated in the DPGEN iterations. + - `valid_data`: (`Artifact(NestedDict[Path])`, optional) Validation data. + - `optional_files`: (`Artifact(List[Path])`, optional) Optional files to be copied to the working directory. 
Returns ------- @@ -155,6 +175,9 @@ def execute( - `script`: (`Artifact(Path)`) The training script. - `cnn_model`: (`Artifact(Path)`) The trained continuous frozen model. - `qnn_model`: (`Artifact(Path)`) The trained quantized frozen model. + - `model_ckpt_data`: (`Artifact(Path)`) The data file of the trained model. + - `model_ckpt_meta`: (`Artifact(Path)`) The meta file of the trained model. + - `model_ckpt_index`: (`Artifact(Path)`) The index file of the trained model. - `lcurve`: (`Artifact(Path)`) The learning curve file. - `log`: (`Artifact(Path)`) The log file of training. @@ -171,6 +194,9 @@ def execute( task_name = ip["task_name"] task_path = ip["task_path"] init_model = ip["init_model"] + init_model_ckpt_data = ip["init_model_ckpt_data"] + init_model_ckpt_meta = ip["init_model_ckpt_meta"] + init_model_ckpt_index = ip["init_model_ckpt_index"] init_data = ip["init_data"] iter_data = ip["iter_data"] valid_data = ip["valid_data"] @@ -189,9 +215,10 @@ def execute( major_version = "2" # auto prob style + init_model_ckpt = [init_model_ckpt_meta, init_model_ckpt_data, init_model_ckpt_index] do_init_model = RunNvNMDTrain.decide_init_model( config, - init_model, + init_model_ckpt if init_model_ckpt_data is not None else init_model, init_data, iter_data, mixed_type=mixed_type, @@ -244,7 +271,7 @@ def clean_before_quit(): dp_command, train_cnn_script_name, do_init_model, - init_model, + init_model_ckpt if init_model_ckpt_data is not None else init_model, train_args = "-s s1", ) @@ -274,10 +301,16 @@ def clean_before_quit(): fplog.write(err) cnn_model_file = "nvnmd_cnn/frozen_model.pb" + model_ckpt_data_file = "nvnmd_cnn/model.ckpt.data-00000-of-00001" + model_ckpt_index_file = "nvnmd_cnn/model.ckpt.index" + model_ckpt_meta_file = "nvnmd_cnn/model.ckpt.meta" lcurve_file = "nvnmd_cnn/lcurve.out" else: cnn_model_file = init_model + model_ckpt_data_file = "" + model_ckpt_index_file = "" + model_ckpt_meta_file = "" lcurve_file = "nvnmd_qnn/lcurve.out" # train qnn model 
@@ -285,7 +318,7 @@ def clean_before_quit(): dp_command, train_qnn_script_name, do_init_model, - init_model, + init_model_ckpt if init_model_ckpt_data is not None else init_model, train_args = "-s s2", ) @@ -323,6 +356,9 @@ def clean_before_quit(): "script": work_dir / train_cnn_script_name, "cnn_model": work_dir / cnn_model_file, "qnn_model": work_dir / qnn_model_file, + "model_ckpt_data": work_dir / model_ckpt_data_file, + "model_ckpt_meta": work_dir / model_ckpt_meta_file, + "model_ckpt_index": work_dir / model_ckpt_index_file, "lcurve": work_dir / lcurve_file, "log": work_dir / "train.log", } diff --git a/dpgen2/superop/block.py b/dpgen2/superop/block.py index b9928e8b..fa225b3d 100644 --- a/dpgen2/superop/block.py +++ b/dpgen2/superop/block.py @@ -116,6 +116,9 @@ def __init__( } self._input_artifacts = { "init_models": InputArtifact(optional=True), + "init_models_ckpt_index": InputArtifact(optional=True), + "init_models_ckpt_data": InputArtifact(optional=True), + "init_models_ckpt_meta": InputArtifact(optional=True), "init_data": InputArtifact(), "iter_data": InputArtifact(), } @@ -124,6 +127,9 @@ def __init__( } self._output_artifacts = { "models": OutputArtifact(), + "models_ckpt_index": OutputArtifact(optional=True), + "models_ckpt_data": OutputArtifact(optional=True), + "models_ckpt_meta": OutputArtifact(optional=True), "iter_data": OutputArtifact(), "trajs": OutputArtifact(), } @@ -215,25 +221,55 @@ def _block_cl( block_steps.inputs.parameters["optional_parameter"] ) - prep_run_dp_train = Step( - name + "-prep-run-dp-train", - template=prep_run_dp_train_op, - parameters={ - "block_id": block_steps.inputs.parameters["block_id"], - "train_config": block_steps.inputs.parameters["train_config"], - "numb_models": block_steps.inputs.parameters["numb_models"], - "template_script": block_steps.inputs.parameters["template_script"], - "run_optional_parameter": run_dp_train_optional_parameter, - }, - artifacts={ - "init_models": 
block_steps.inputs.artifacts["init_models"], - "init_data": block_steps.inputs.artifacts["init_data"], - "iter_data": block_steps.inputs.artifacts["iter_data"], - }, - key="--".join( - ["%s" % block_steps.inputs.parameters["block_id"], "prep-run-train"] - ), - ) + if isinstance(prep_run_dp_train_op, PrepRunNvNMDTrain): + prep_run_dp_train = Step( + name + "-prep-run-nvnmd-train", + template=prep_run_dp_train_op, + parameters={ + "block_id": block_steps.inputs.parameters["block_id"], + "train_config": block_steps.inputs.parameters["train_config"], + "numb_models": block_steps.inputs.parameters["numb_models"], + "template_script": block_steps.inputs.parameters["template_script"], + "run_optional_parameter": run_dp_train_optional_parameter, + }, + artifacts={ + "init_models": block_steps.inputs.artifacts["init_models"], + "init_models_ckpt_index": block_steps.inputs.artifacts[ + "init_models_ckpt_index" + ], + "init_models_ckpt_data": block_steps.inputs.artifacts[ + "init_models_ckpt_data" + ], + "init_models_ckpt_meta": block_steps.inputs.artifacts[ + "init_models_ckpt_meta" + ], + "init_data": block_steps.inputs.artifacts["init_data"], + "iter_data": block_steps.inputs.artifacts["iter_data"], + }, + key="--".join( + ["%s" % block_steps.inputs.parameters["block_id"], "prep-run-train"] + ), + ) + else: + prep_run_dp_train = Step( + name + "-prep-run-dp-train", + template=prep_run_dp_train_op, + parameters={ + "block_id": block_steps.inputs.parameters["block_id"], + "train_config": block_steps.inputs.parameters["train_config"], + "numb_models": block_steps.inputs.parameters["numb_models"], + "template_script": block_steps.inputs.parameters["template_script"], + "run_optional_parameter": run_dp_train_optional_parameter, + }, + artifacts={ + "init_models": block_steps.inputs.artifacts["init_models"], + "init_data": block_steps.inputs.artifacts["init_data"], + "iter_data": block_steps.inputs.artifacts["iter_data"], + }, + key="--".join( + ["%s" % 
block_steps.inputs.parameters["block_id"], "prep-run-train"] + ), + ) block_steps.add(prep_run_dp_train) prep_run_explore = Step( @@ -246,7 +282,9 @@ def _block_cl( "type_map": block_steps.inputs.parameters["type_map"], }, artifacts={ - "models": prep_run_dp_train.outputs.artifacts["models"], + "models": prep_run_dp_train.outputs.artifacts["nvnmodels"] + if isinstance(prep_run_dp_train_op, PrepRunNvNMDTrain) + else prep_run_dp_train.outputs.artifacts["models"] }, key="--".join( ["%s" % block_steps.inputs.parameters["block_id"], "prep-run-explore"] @@ -325,6 +363,12 @@ def _block_cl( block_steps.outputs.artifacts["models"]._from = prep_run_dp_train.outputs.artifacts[ "models" ] + if "models_ckpt_meta" in prep_run_dp_train.outputs.artifacts: + block_steps.outputs.artifacts["models_ckpt_meta"]._from = prep_run_dp_train.outputs.artifacts["models_ckpt_meta"] + if "models_ckpt_data" in prep_run_dp_train.outputs.artifacts: + block_steps.outputs.artifacts["models_ckpt_data"]._from = prep_run_dp_train.outputs.artifacts["models_ckpt_data"] + if "models_ckpt_index" in prep_run_dp_train.outputs.artifacts: + block_steps.outputs.artifacts["models_ckpt_index"]._from = prep_run_dp_train.outputs.artifacts["models_ckpt_index"] block_steps.outputs.artifacts["iter_data"]._from = collect_data.outputs.artifacts[ "iter_data" ] diff --git a/dpgen2/superop/prep_run_nvnmd_train.py b/dpgen2/superop/prep_run_nvnmd_train.py index 50987613..47dd89d1 100644 --- a/dpgen2/superop/prep_run_nvnmd_train.py +++ b/dpgen2/superop/prep_run_nvnmd_train.py @@ -79,6 +79,9 @@ def __init__( } self._input_artifacts = { "init_models": InputArtifact(optional=True), + "init_models_ckpt_data": InputArtifact(optional=True), + "init_models_ckpt_index": InputArtifact(optional=True), + "init_models_ckpt_meta": InputArtifact(optional=True), "init_data": InputArtifact(), "iter_data": InputArtifact(), } @@ -88,6 +91,10 @@ def __init__( self._output_artifacts = { "scripts": OutputArtifact(), "models": OutputArtifact(), + 
"nvnmodels": OutputArtifact(), + "models_ckpt_meta": OutputArtifact(optional=True), + "models_ckpt_data": OutputArtifact(optional=True), + "models_ckpt_index": OutputArtifact(optional=True), "logs": OutputArtifact(), "lcurves": OutputArtifact(), } @@ -191,8 +198,8 @@ def _prep_run_nvnmd_train( slices=Slices( "int('{{item}}')", input_parameter=["task_name"], - input_artifact=["task_path", "init_model"], - output_artifact=["cnn_model", "qnn_model", "lcurve", "log", "script"], + input_artifact=["task_path", "init_model", "init_model_ckpt_meta", "init_model_ckpt_data", "init_model_ckpt_index"], + output_artifact=["cnn_model", "qnn_model", "model_ckpt_data", "model_ckpt_meta", "model_ckpt_index", "lcurve", "log", "script"], **template_slice_config, ), python_packages=upload_python_packages, @@ -208,6 +215,9 @@ def _prep_run_nvnmd_train( artifacts={ "task_path": prep_train.outputs.artifacts["task_paths"], "init_model": train_steps.inputs.artifacts["init_models"], + "init_model_ckpt_meta": train_steps.inputs.artifacts["init_models_ckpt_meta"], + "init_model_ckpt_data": train_steps.inputs.artifacts["init_models_ckpt_data"], + "init_model_ckpt_index": train_steps.inputs.artifacts["init_models_ckpt_index"], "init_data": train_steps.inputs.artifacts["init_data"], "iter_data": train_steps.inputs.artifacts["iter_data"], "valid_data": valid_data, @@ -233,10 +243,11 @@ def _prep_run_nvnmd_train( "script" ] train_steps.outputs.artifacts["models"]._from = run_train.outputs.artifacts["cnn_model"] - #train_steps.outputs.artifacts["nvnmodels"]._from = run_train.outputs.artifacts["cnn_model"] + train_steps.outputs.artifacts["nvnmodels"]._from = run_train.outputs.artifacts["qnn_model"] + train_steps.outputs.artifacts["models_ckpt_meta"]._from = run_train.outputs.artifacts["model_ckpt_meta"] + train_steps.outputs.artifacts["models_ckpt_data"]._from = run_train.outputs.artifacts["model_ckpt_data"] + train_steps.outputs.artifacts["models_ckpt_index"]._from = 
run_train.outputs.artifacts["model_ckpt_index"] train_steps.outputs.artifacts["logs"]._from = run_train.outputs.artifacts["log"] - train_steps.outputs.artifacts["lcurves"]._from = run_train.outputs.artifacts[ - "lcurve" - ] + train_steps.outputs.artifacts["lcurves"]._from = run_train.outputs.artifacts["lcurve"] return train_steps From c1e46ac899678fd28933e5330b956199f167e203 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Tue, 10 Jun 2025 10:07:13 +0800 Subject: [PATCH 24/49] fix nvnmd test unit --- dpgen2/op/run_nvnmd_train.py | 4 +--- tests/op/test_run_nvnmd_train.py | 37 ++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/dpgen2/op/run_nvnmd_train.py b/dpgen2/op/run_nvnmd_train.py index abd05f59..979f21ef 100644 --- a/dpgen2/op/run_nvnmd_train.py +++ b/dpgen2/op/run_nvnmd_train.py @@ -73,10 +73,8 @@ def _make_train_command( if case_init_model: if isinstance(init_model, list): # initialize from model.ckpt - for i in init_model: - shutil.copy(i, "./") init_model = "model.ckpt" - init_flag = "--init-imodel" + init_flag = "--init-model" else: # initialize from frozen model init_flag = "--init-frz-model" diff --git a/tests/op/test_run_nvnmd_train.py b/tests/op/test_run_nvnmd_train.py index c88f8c0a..2e1495d8 100644 --- a/tests/op/test_run_nvnmd_train.py +++ b/tests/op/test_run_nvnmd_train.py @@ -79,6 +79,9 @@ def setUp(self): self.init_data = sorted(list(self.init_data)) self.init_model = Path("bar.pb") + self.init_model_ckpt_meta = Path("model.ckpt.meta") + self.init_model_ckpt_data = Path("model.ckpt.data") + self.init_model_ckpt_index = Path("model.ckpt.index") self.config = { "init_model_policy": "no", @@ -407,6 +410,9 @@ def test_exec_v1(self, mocked_run): "task_name": task_name, "task_path": Path(task_path), "init_model": Path(self.init_model), + "init_model_ckpt_meta": Path(self.init_model_ckpt_meta), + "init_model_ckpt_data": Path(self.init_model_ckpt_data), + "init_model_ckpt_index": 
Path(self.init_model_ckpt_index), "init_data": [Path(ii) for ii in self.init_data], "iter_data": [Path(ii) for ii in self.iter_data], } @@ -415,6 +421,9 @@ def test_exec_v1(self, mocked_run): self.assertEqual(out["script"], work_dir / train_cnn_script_name) self.assertEqual(out["cnn_model"], work_dir / "nvnmd_cnn/frozen_model.pb") self.assertEqual(out["qnn_model"], work_dir / "nvnmd_qnn/model.pb") + self.assertEqual(out["model_ckpt_data"], work_dir / "nvnmd_cnn/model.ckpt.data-00000-of-00001") + self.assertEqual(out["model_ckpt_meta"], work_dir / "nvnmd_cnn/model.ckpt.meta") + self.assertEqual(out["model_ckpt_index"], work_dir / "nvnmd_cnn/model.ckpt.index") self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") self.assertEqual(out["log"], work_dir / "train.log") @@ -461,6 +470,9 @@ def test_exec_v2(self, mocked_run): "task_name": task_name, "task_path": Path(task_path), "init_model": Path(self.init_model), + "init_model_ckpt_meta": Path(self.init_model_ckpt_meta), + "init_model_ckpt_data": Path(self.init_model_ckpt_data), + "init_model_ckpt_index": Path(self.init_model_ckpt_index), "init_data": [Path(ii) for ii in self.init_data], "iter_data": [Path(ii) for ii in self.iter_data], } @@ -469,6 +481,9 @@ def test_exec_v2(self, mocked_run): self.assertEqual(out["script"], work_dir / train_cnn_script_name) self.assertEqual(out["cnn_model"], work_dir / "nvnmd_cnn/frozen_model.pb") self.assertEqual(out["qnn_model"], work_dir / "nvnmd_qnn/model.pb") + self.assertEqual(out["model_ckpt_data"], work_dir / "nvnmd_cnn/model.ckpt.data-00000-of-00001") + self.assertEqual(out["model_ckpt_meta"], work_dir / "nvnmd_cnn/model.ckpt.meta") + self.assertEqual(out["model_ckpt_index"], work_dir / "nvnmd_cnn/model.ckpt.index") self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") self.assertEqual(out["log"], work_dir / "train.log") @@ -515,6 +530,9 @@ def test_exec_v2_init_model(self, mocked_run): "task_name": task_name, "task_path": Path(task_path), 
"init_model": Path(self.init_model), + "init_model_ckpt_meta": Path(self.init_model_ckpt_meta), + "init_model_ckpt_data": Path(self.init_model_ckpt_data), + "init_model_ckpt_index": Path(self.init_model_ckpt_index), "init_data": [Path(ii) for ii in self.init_data], "iter_data": [Path(ii) for ii in self.iter_data], } @@ -523,6 +541,9 @@ def test_exec_v2_init_model(self, mocked_run): self.assertEqual(out["script"], work_dir / train_cnn_script_name) self.assertEqual(out["cnn_model"], work_dir / "nvnmd_cnn/frozen_model.pb") self.assertEqual(out["qnn_model"], work_dir / "nvnmd_qnn/model.pb") + self.assertEqual(out["model_ckpt_data"], work_dir / "nvnmd_cnn/model.ckpt.data-00000-of-00001") + self.assertEqual(out["model_ckpt_meta"], work_dir / "nvnmd_cnn/model.ckpt.meta") + self.assertEqual(out["model_ckpt_index"], work_dir / "nvnmd_cnn/model.ckpt.index") self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") self.assertEqual(out["log"], work_dir / "train.log") @@ -531,8 +552,8 @@ def test_exec_v2_init_model(self, mocked_run): [ "dp", "train-nvnmd", - "--init-frz-model", - str(self.init_model), + "--init-model", + "model.ckpt", train_cnn_script_name, "-s", "s1" @@ -579,6 +600,9 @@ def test_exec_v2_train_error(self, mocked_run): "task_name": task_name, "task_path": Path(task_path), "init_model": Path(self.init_model), + "init_model_ckpt_meta": Path(self.init_model_ckpt_meta), + "init_model_ckpt_data": Path(self.init_model_ckpt_data), + "init_model_ckpt_index": Path(self.init_model_ckpt_index), "init_data": [Path(ii) for ii in self.init_data], "iter_data": [Path(ii) for ii in self.iter_data], } @@ -610,6 +634,9 @@ def setUp(self): self.init_data = sorted(list(self.init_data)) self.init_model = Path("bar.pb") + self.init_model_ckpt_meta = Path("model.ckpt.meta") + self.init_model_ckpt_data = Path("model.ckpt.data") + self.init_model_ckpt_index = Path("model.ckpt.index") self.config = { "init_model_policy": "no", @@ -709,6 +736,9 @@ def test_exec_v2_empty_dir(self, 
mocked_run): "task_name": task_name, "task_path": Path(task_path), "init_model": Path(self.init_model), + "init_model_ckpt_meta": Path(self.init_model_ckpt_meta), + "init_model_ckpt_data": Path(self.init_model_ckpt_data), + "init_model_ckpt_index": Path(self.init_model_ckpt_index), "init_data": [Path(ii) for ii in self.init_data], "iter_data": [empty_data], } @@ -717,6 +747,9 @@ def test_exec_v2_empty_dir(self, mocked_run): self.assertEqual(out["script"], work_dir / train_cnn_script_name) self.assertEqual(out["cnn_model"], work_dir / "nvnmd_cnn/frozen_model.pb") self.assertEqual(out["qnn_model"], work_dir / "nvnmd_qnn/model.pb") + self.assertEqual(out["model_ckpt_data"], work_dir / "nvnmd_cnn/model.ckpt.data-00000-of-00001") + self.assertEqual(out["model_ckpt_meta"], work_dir / "nvnmd_cnn/model.ckpt.meta") + self.assertEqual(out["model_ckpt_index"], work_dir / "nvnmd_cnn/model.ckpt.index") self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") self.assertEqual(out["log"], work_dir / "train.log") From 3791618140b9fce0652b5f57024a310347f01db3 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Tue, 10 Jun 2025 15:53:16 +0800 Subject: [PATCH 25/49] fix nvnmd test unit --- dpgen2/constants.py | 4 ++ dpgen2/op/run_nvnmd_train.py | 2 +- dpgen2/superop/prep_run_nvnmd_train.py | 6 +- tests/mocked_ops.py | 87 +++++++++++++++++++++++--- tests/test_prep_run_nvnmd_train.py | 85 ++++++++++++++++++++++--- 5 files changed, 163 insertions(+), 21 deletions(-) diff --git a/dpgen2/constants.py b/dpgen2/constants.py index 3f9c1e69..c591d37c 100644 --- a/dpgen2/constants.py +++ b/dpgen2/constants.py @@ -6,6 +6,10 @@ train_log_name = "train.log" model_name_pattern = "model.%03d.pb" pytorch_model_name_pattern = "model.%03d.pth" +model_ckpt_pattern = "model.ckpt.%03d" +model_ckpt_meta_pattern = "model.ckpt.%03d/model.ckpt.meta" +model_ckpt_data_pattern = "model.ckpt.%03d/model.ckpt.data" +model_ckpt_index_pattern = "model.ckpt.%03d/model.ckpt.index" 
model_name_match_pattern = r"model\.[0-9]{3,}(\.pb|\.pth)" lmp_index_pattern = "%06d" lmp_task_pattern = "task." + lmp_index_pattern diff --git a/dpgen2/op/run_nvnmd_train.py b/dpgen2/op/run_nvnmd_train.py index 979f21ef..3f9b6fe2 100644 --- a/dpgen2/op/run_nvnmd_train.py +++ b/dpgen2/op/run_nvnmd_train.py @@ -73,7 +73,7 @@ def _make_train_command( if case_init_model: if isinstance(init_model, list): # initialize from model.ckpt - init_model = "model.ckpt" + init_model = ".".join(str(init_model[0]).split('.')[:-1]) init_flag = "--init-model" else: # initialize from frozen model init_flag = "--init-frz-model" diff --git a/dpgen2/superop/prep_run_nvnmd_train.py b/dpgen2/superop/prep_run_nvnmd_train.py index 47dd89d1..21bd376c 100644 --- a/dpgen2/superop/prep_run_nvnmd_train.py +++ b/dpgen2/superop/prep_run_nvnmd_train.py @@ -198,8 +198,8 @@ def _prep_run_nvnmd_train( slices=Slices( "int('{{item}}')", input_parameter=["task_name"], - input_artifact=["task_path", "init_model", "init_model_ckpt_meta", "init_model_ckpt_data", "init_model_ckpt_index"], - output_artifact=["cnn_model", "qnn_model", "model_ckpt_data", "model_ckpt_meta", "model_ckpt_index", "lcurve", "log", "script"], + input_artifact=["task_path", "init_model", "init_model_ckpt_data", "init_model_ckpt_index", "init_model_ckpt_meta"], + output_artifact=["cnn_model", "qnn_model", "model_ckpt_meta", "model_ckpt_data", "model_ckpt_index", "lcurve", "log", "script"], **template_slice_config, ), python_packages=upload_python_packages, @@ -217,7 +217,7 @@ def _prep_run_nvnmd_train( "init_model": train_steps.inputs.artifacts["init_models"], "init_model_ckpt_meta": train_steps.inputs.artifacts["init_models_ckpt_meta"], "init_model_ckpt_data": train_steps.inputs.artifacts["init_models_ckpt_data"], - "init_model_ckpt_index": train_steps.inputs.artifacts["init_models_ckpt_index"], + "init_model_ckpt_index": train_steps.inputs.artifacts["init_models_ckpt_index"], "init_data": train_steps.inputs.artifacts["init_data"], 
"iter_data": train_steps.inputs.artifacts["iter_data"], "valid_data": valid_data, diff --git a/tests/mocked_ops.py b/tests/mocked_ops.py index 1c2bad66..48488b24 100644 --- a/tests/mocked_ops.py +++ b/tests/mocked_ops.py @@ -41,6 +41,10 @@ lmp_task_pattern, lmp_traj_name, model_name_pattern, + model_ckpt_pattern, + model_ckpt_meta_pattern, + model_ckpt_data_pattern, + model_ckpt_index_pattern, train_log_name, train_script_name, train_task_pattern, @@ -121,6 +125,21 @@ def make_mocked_init_models(numb_models): return tmp_models +def make_mocked_init_models_ckpt(numb_models): + tmp_models_ckpt = [] + for ii in range(numb_models): + dir = Path(model_ckpt_pattern %ii) + dir.mkdir(exist_ok=True, parents=True) + ff_meta = Path(model_ckpt_meta_pattern % ii) + ff_meta.write_text(f"This is init model ckpt meta {ii}") + ff_data = Path(model_ckpt_data_pattern % ii) + ff_data.write_text(f"This is init model ckpt data {ii}") + ff_index = Path(model_ckpt_index_pattern % ii) + ff_index.write_text(f"This is init model ckpt index {ii}") + tmp_models_ckpt.append(dir) + return tmp_models_ckpt + + def make_mocked_init_data(): tmp_init_data = [Path("init_data/foo"), Path("init_data/bar")] for ii in tmp_init_data: @@ -400,12 +419,18 @@ def execute( work_dir = Path(ip["task_name"]) script = ip["task_path"] / "input.json" init_model = Path(ip["init_model"]) + init_model_ckpt_meta = Path(ip["init_model_ckpt_meta"]) + init_model_ckpt_data = Path(ip["init_model_ckpt_data"]) + init_model_ckpt_index = Path(ip["init_model_ckpt_index"]) init_data = ip["init_data"] iter_data = ip["iter_data"] assert script.is_file() assert ip["task_path"].is_dir() assert init_model.is_file() + assert init_model_ckpt_meta.is_file() + assert init_model_ckpt_data.is_file() + assert init_model_ckpt_index.is_file() assert len(init_data) == 2 assert re.match("task.[0-9][0-9][0-9][0-9]", ip["task_name"]) task_id = int(ip["task_name"].split(".")[1]) @@ -421,6 +446,9 @@ def execute( script = Path(script).resolve() 
init_model = init_model.resolve() init_model_str = str(init_model) + init_model_ckpt_meta = init_model_ckpt_meta.resolve() + init_model_ckpt_data = init_model_ckpt_data.resolve() + init_model_ckpt_index = init_model_ckpt_index.resolve() init_data = [ii.resolve() for ii in init_data] iter_data = [ii.resolve() for ii in iter_data] init_data_str = [str(ii) for ii in init_data] @@ -448,14 +476,29 @@ def execute( ) copyfile(script, oscript) - cnn_model = Path("frozen_model.pb") - qnn_model = Path("model.pb") - lcurve = Path("lcurve.out") + + cnn_dir = Path("nvnmd_cnn") + qnn_dir = Path("nvnmd_qnn") + cnn_model = cnn_dir / Path("frozen_model.pb") + qnn_model = qnn_dir / Path("model.pb") + model_ckpt_meta_file = cnn_dir / Path("model.ckpt.meta") + model_ckpt_data_file = cnn_dir / Path("model.ckpt.data-00000-of-00001") + model_ckpt_index_file = cnn_dir / Path("model.ckpt.index") + lcurve = cnn_dir / Path("lcurve.out") log = Path("log") assert init_model.exists() with log.open("w") as f: f.write(f"init_model {str(init_model)} OK\n") + assert init_model_ckpt_meta.exists() + with log.open("a") as f: + f.write(f"init_model_ckpt_meta {str(init_model_ckpt_meta)} OK\n") + assert init_model_ckpt_data.exists() + with log.open("a") as f: + f.write(f"init_model_ckpt_data {str(init_model_ckpt_data)} OK\n") + assert init_model_ckpt_index.exists() + with log.open("a") as f: + f.write(f"init_model_ckpt_index {str(init_model_ckpt_index)} OK\n") for ii in jtmp["data"]: assert Path(ii).exists() assert (ii in init_data_str) or (ii in iter_data_str) @@ -465,9 +508,21 @@ def execute( with log.open("a") as f: f.write(f"script {str(script)} OK\n") + + cnn_dir.mkdir(exist_ok=True, parents=True) with cnn_model.open("w") as f: f.write("read from init model: \n") f.write(init_model.read_text() + "\n") + with model_ckpt_meta_file.open("w") as f: + f.write("read from init model ckpt: \n") + f.write(init_model_ckpt_meta.read_text() + "\n") + with model_ckpt_data_file.open("w") as f: + f.write("read 
from init model ckpt: \n") + f.write(init_model_ckpt_data.read_text() + "\n") + with model_ckpt_index_file.open("w") as f: + f.write("read from init model ckpt: \n") + f.write(init_model_ckpt_index.read_text() + "\n") + qnn_dir.mkdir(exist_ok=True, parents=True) with qnn_model.open("w") as f: f.write("read from init model: \n") f.write(init_model.read_text() + "\n") @@ -482,6 +537,9 @@ def execute( "script": work_dir / oscript, "cnn_model": work_dir / cnn_model, "qnn_model": work_dir / qnn_model, + "model_ckpt_data": work_dir / model_ckpt_data_file, + "model_ckpt_meta": work_dir / model_ckpt_meta_file, + "model_ckpt_index": work_dir / model_ckpt_index_file, "lcurve": work_dir / lcurve, "log": work_dir / log, } @@ -558,7 +616,11 @@ def execute( ) copyfile(script, oscript) - model = Path("model.pb") + cnn_model = Path("frozen_model.pb") + qnn_model = Path("model.pb") + model_ckpt_meta_file = Path("model.ckpt.meta") + model_ckpt_data_file = Path("model.ckpt.data-00000-of-00001") + model_ckpt_index_file = Path("model.ckpt.index") lcurve = Path("lcurve.out") log = Path("log") @@ -571,8 +633,16 @@ def execute( with log.open("a") as f: f.write(f"script {str(script)} OK\n") - with model.open("w") as f: + with cnn_model.open("w") as f: f.write("read from init model: \n") + with qnn_model.open("w") as f: + f.write("read from init model: \n") + with model_ckpt_meta_file.open("w") as f: + f.write("read from init model ckpt: \n") + with model_ckpt_data_file.open("w") as f: + f.write("read from init model ckpt: \n") + with model_ckpt_index_file.open("w") as f: + f.write("read from init model ckpt: \n") with lcurve.open("w") as f: f.write("read from train_script: \n") f.write(script.read_text() + "\n") @@ -582,8 +652,11 @@ def execute( return OPIO( { "script": work_dir / oscript, - "cnn_model": work_dir / model, - "qnn_model": work_dir / model, + "cnn_model": work_dir / cnn_model, + "qnn_model": work_dir / qnn_model, + "model_ckpt_data": work_dir / model_ckpt_data_file, + 
"model_ckpt_meta": work_dir / model_ckpt_meta_file, + "model_ckpt_index": work_dir / model_ckpt_index_file, "lcurve": work_dir / lcurve, "log": work_dir / log, } diff --git a/tests/test_prep_run_nvnmd_train.py b/tests/test_prep_run_nvnmd_train.py index 9bbadc21..7defc199 100644 --- a/tests/test_prep_run_nvnmd_train.py +++ b/tests/test_prep_run_nvnmd_train.py @@ -55,6 +55,7 @@ MockedRunNvNMDTrainNoneInitModel, make_mocked_init_data, make_mocked_init_models, + make_mocked_init_models_ckpt, mocked_numb_models, mocked_template_script, ) @@ -77,7 +78,7 @@ def _check_log( - tcase, fname, path, script, init_model, init_data, iter_data, only_check_name=False + tcase, fname, path, script, init_model, init_model_ckpt, init_data, iter_data, only_check_name=False ): with open(fname) as fp: lines_ = fp.read().strip().split("\n") @@ -94,9 +95,21 @@ def _check_log( lines[0].split(" "), ["init_model", str(revised_fname(Path(path) / init_model)), "OK"], ) + tcase.assertEqual( + lines[1].split(" "), + ["init_model_ckpt_meta", str(revised_fname(Path(path) / init_model_ckpt / "model.ckpt.meta")), "OK"], + ) + tcase.assertEqual( + lines[2].split(" "), + ["init_model_ckpt_data", str(revised_fname(Path(path) / init_model_ckpt / "model.ckpt.data")), "OK"], + ) + tcase.assertEqual( + lines[3].split(" "), + ["init_model_ckpt_index", str(revised_fname(Path(path) / init_model_ckpt / "model.ckpt.index")), "OK"], + ) for ii in range(2): tcase.assertEqual( - lines[1 + ii].split(" "), + lines[4 + ii].split(" "), [ "data", str(revised_fname(Path(path) / sorted(list(init_data))[ii])), @@ -105,7 +118,7 @@ def _check_log( ) for ii in range(2): tcase.assertEqual( - lines[3 + ii].split(" "), + lines[6 + ii].split(" "), [ "data", str(revised_fname(Path(path) / sorted(list(iter_data))[ii])), @@ -113,7 +126,7 @@ def _check_log( ], ) tcase.assertEqual( - lines[5].split(" "), ["script", str(revised_fname(Path(path) / script)), "OK"] + lines[8].split(" "), ["script", str(revised_fname(Path(path) / script)), 
"OK"] ) @@ -132,6 +145,21 @@ def _check_model( tcase.assertEqual(flines[ii + 1], mlines[ii]) +def _check_model_ckpt( + tcase, + fname, + path, + model, +): + with open(fname) as fp: + flines = fp.read().strip().split("\n") + with open(Path(path) / model) as fp: + mlines = fp.read().strip().split("\n") + tcase.assertEqual(flines[0], "read from init model ckpt: ") + for ii in range(len(mlines)): + tcase.assertEqual(flines[ii + 1], mlines[ii]) + + def _check_lcurve( tcase, fname, @@ -152,6 +180,7 @@ def check_run_train_nvnmd_output( work_dir, script, init_model, + init_model_ckpt, init_data, iter_data, only_check_name=False, @@ -164,12 +193,17 @@ def check_run_train_nvnmd_output( cwd, script, init_model, + init_model_ckpt, init_data, iter_data, only_check_name=only_check_name, ) - _check_model(tcase, "frozen_model.pb", cwd, init_model) - _check_lcurve(tcase, "lcurve.out", cwd, script) + _check_model(tcase, "nvnmd_cnn/frozen_model.pb", cwd, init_model) + _check_model(tcase, "nvnmd_qnn/model.pb", cwd, init_model) + _check_model_ckpt(tcase, "nvnmd_cnn/model.ckpt.meta", cwd, init_model_ckpt / "model.ckpt.meta") + _check_model_ckpt(tcase, "nvnmd_cnn/model.ckpt.data-00000-of-00001", cwd, init_model_ckpt / "model.ckpt.data") + _check_model_ckpt(tcase, "nvnmd_cnn/model.ckpt.index", cwd, init_model_ckpt / "model.ckpt.index") + _check_lcurve(tcase, "nvnmd_cnn/lcurve.out", cwd, script) os.chdir(cwd) @@ -208,6 +242,7 @@ def setUp(self): self.numb_models = mocked_numb_models self.init_models = make_mocked_init_models(self.numb_models) + self.init_models_ckpt = make_mocked_init_models_ckpt(self.numb_models) tmp_init_data = make_mocked_init_data() self.init_data = tmp_init_data @@ -240,6 +275,9 @@ def tearDown(self): for ii in self.init_models: if Path(ii).exists(): os.remove(ii) + for ii in self.init_models_ckpt: + if Path(ii).exists(): + shutil.rmtree(ii) def test(self): for ii in range(3): @@ -250,6 +288,9 @@ def test(self): "task_name": self.task_names[ii], "task_path": 
self.task_paths[ii], "init_model": self.init_models[ii], + "init_model_ckpt_meta": self.init_models_ckpt[ii] / "model.ckpt.meta", + "init_model_ckpt_data": self.init_models_ckpt[ii] / "model.ckpt.data", + "init_model_ckpt_index": self.init_models_ckpt[ii] / "model.ckpt.index", "init_data": self.init_data, "iter_data": self.iter_data, } @@ -257,15 +298,19 @@ def test(self): op = run.execute(ip) self.assertEqual(op["script"], Path(train_task_pattern % ii) / "input.json") self.assertTrue(op["script"].is_file()) - self.assertEqual(op["cnn_model"], Path(train_task_pattern % ii) / "frozen_model.pb") - self.assertEqual(op["qnn_model"], Path(train_task_pattern % ii) / "model.pb") + self.assertEqual(op["cnn_model"], Path(train_task_pattern % ii) / "nvnmd_cnn" / "frozen_model.pb") + self.assertEqual(op["qnn_model"], Path(train_task_pattern % ii) / "nvnmd_qnn" / "model.pb") + self.assertEqual(op["model_ckpt_data"], Path(train_task_pattern % ii) / "nvnmd_cnn" / "model.ckpt.data-00000-of-00001") + self.assertEqual(op["model_ckpt_meta"], Path(train_task_pattern % ii) / "nvnmd_cnn" /"model.ckpt.meta") + self.assertEqual(op["model_ckpt_index"], Path(train_task_pattern % ii) / "nvnmd_cnn" /"model.ckpt.index") self.assertEqual(op["log"], Path(train_task_pattern % ii) / "log") - self.assertEqual(op["lcurve"], Path(train_task_pattern % ii) / "lcurve.out") + self.assertEqual(op["lcurve"], Path(train_task_pattern % ii) / "nvnmd_cnn" / "lcurve.out") check_run_train_nvnmd_output( self, self.task_names[ii], self.train_scripts[ii], self.init_models[ii], + self.init_models_ckpt[ii], self.init_data, self.iter_data, ) @@ -278,7 +323,13 @@ def setUp(self): tmp_models = make_mocked_init_models(self.numb_models) self.init_models = upload_artifact(tmp_models) self.str_init_models = tmp_models - + + tmp_models_ckpt = make_mocked_init_models_ckpt(self.numb_models) + self.init_models_ckpt_meta = upload_artifact([dir / "model.ckpt.meta" for dir in tmp_models_ckpt]) + self.init_models_ckpt_data = 
upload_artifact([dir / "model.ckpt.data" for dir in tmp_models_ckpt]) + self.init_models_ckpt_index = upload_artifact([dir / "model.ckpt.index" for dir in tmp_models_ckpt]) + self.str_init_models_ckpt = tmp_models_ckpt + tmp_init_data = make_mocked_init_data() self.init_data = upload_artifact(tmp_init_data) self.path_init_data = tmp_init_data @@ -308,6 +359,9 @@ def tearDown(self): for ii in self.str_init_models: if Path(ii).exists(): os.remove(ii) + for ii in self.str_init_models_ckpt: + if Path(ii).exists(): + shutil.rmtree(ii) def test_train(self): steps = PrepRunNvNMDTrain( @@ -328,6 +382,9 @@ def test_train(self): }, artifacts={ "init_models": self.init_models, + "init_models_ckpt_meta": self.init_models_ckpt_meta, + "init_models_ckpt_data": self.init_models_ckpt_data, + "init_models_ckpt_index": self.init_models_ckpt_index, "init_data": self.init_data, "iter_data": self.iter_data, }, @@ -345,6 +402,10 @@ def test_train(self): download_artifact(step.outputs.artifacts["scripts"]) download_artifact(step.outputs.artifacts["models"]) + download_artifact(step.outputs.artifacts["models_ckpt_meta"]) + download_artifact(step.outputs.artifacts["models_ckpt_data"]) + download_artifact(step.outputs.artifacts["models_ckpt_index"]) + download_artifact(step.outputs.artifacts["nvnmodels"]) download_artifact(step.outputs.artifacts["logs"]) download_artifact(step.outputs.artifacts["lcurves"]) @@ -354,6 +415,7 @@ def test_train(self): self.task_names[ii], self.train_scripts[ii], self.str_init_models[ii], + self.str_init_models_ckpt[ii], self.path_init_data, self.path_iter_data, only_check_name=True, @@ -378,6 +440,9 @@ def test_train_no_init_model(self): }, artifacts={ "init_models": None, + "init_models_ckpt_meta": None, + "init_models_ckpt_data": None, + "init_models_ckpt_index": None, "init_data": self.init_data, "iter_data": self.iter_data, }, From 8064b74dd90bc69960ec2fdcae281fddbb7467e5 Mon Sep 17 00:00:00 2001 From: Leo <843497845@qq.com> Date: Tue, 10 Jun 2025 16:42:13 
+0800 Subject: [PATCH 26/49] fix nvnmd test unit --- tests/mocked_ops.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/tests/mocked_ops.py b/tests/mocked_ops.py index 48488b24..543e69d7 100644 --- a/tests/mocked_ops.py +++ b/tests/mocked_ops.py @@ -573,6 +573,12 @@ def execute( script = ip["task_path"] / "input.json" if ip["init_model"] is not None: raise FatalError("init model is not None") + if ip["init_model_ckpt_meta"] is not None: + raise FatalError("init model ckpt meta is not None") + if ip["init_model_ckpt_data"] is not None: + raise FatalError("init model ckpt data is not None") + if ip["init_model_ckpt_index"] is not None: + raise FatalError("init model ckpt index is not None") init_data = ip["init_data"] iter_data = ip["iter_data"] @@ -616,12 +622,15 @@ def execute( ) copyfile(script, oscript) - cnn_model = Path("frozen_model.pb") - qnn_model = Path("model.pb") - model_ckpt_meta_file = Path("model.ckpt.meta") - model_ckpt_data_file = Path("model.ckpt.data-00000-of-00001") - model_ckpt_index_file = Path("model.ckpt.index") - lcurve = Path("lcurve.out") + + cnn_dir = Path("nvnmd_cnn") + qnn_dir = Path("nvnmd_qnn") + cnn_model = cnn_dir / Path("frozen_model.pb") + qnn_model = qnn_dir / Path("model.pb") + model_ckpt_meta_file = cnn_dir / Path("model.ckpt.meta") + model_ckpt_data_file = cnn_dir / Path("model.ckpt.data-00000-of-00001") + model_ckpt_index_file = cnn_dir / Path("model.ckpt.index") + lcurve = cnn_dir / Path("lcurve.out") log = Path("log") for ii in jtmp["data"]: @@ -633,16 +642,18 @@ def execute( with log.open("a") as f: f.write(f"script {str(script)} OK\n") + cnn_dir.mkdir(exist_ok=True, parents=True) with cnn_model.open("w") as f: f.write("read from init model: \n") - with qnn_model.open("w") as f: - f.write("read from init model: \n") with model_ckpt_meta_file.open("w") as f: f.write("read from init model ckpt: \n") with model_ckpt_data_file.open("w") as f: f.write("read from init model ckpt: 
\n") with model_ckpt_index_file.open("w") as f: f.write("read from init model ckpt: \n") + qnn_dir.mkdir(exist_ok=True, parents=True) + with qnn_model.open("w") as f: + f.write("read from init model: \n") with lcurve.open("w") as f: f.write("read from train_script: \n") f.write(script.read_text() + "\n") From b73ea1172ab0c292fb6966722e2f60819bf38c35 Mon Sep 17 00:00:00 2001 From: jiongwalai <843497845@qq.com> Date: Tue, 10 Jun 2025 23:42:20 +0800 Subject: [PATCH 27/49] fix dpgen_loop in nvnmd --- dpgen2/flow/dpgen_loop.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dpgen2/flow/dpgen_loop.py b/dpgen2/flow/dpgen_loop.py index f1b6ac06..947e461c 100644 --- a/dpgen2/flow/dpgen_loop.py +++ b/dpgen2/flow/dpgen_loop.py @@ -513,7 +513,7 @@ def _loop( _then=block_step.outputs.artifacts["models"], _else=next_step.outputs.artifacts["models"], ) - if ( hasattr(block_step.outputs.artifacts["models_ckpt_meta"], "local_path") and + if ( hasattr(block_step.outputs.artifacts["models_ckpt_meta"], "local_path") or hasattr(next_step.outputs.artifacts["models_ckpt_meta"], "local_path") ): steps.outputs.artifacts["models_ckpt_meta"].from_expression = if_expression( @@ -521,7 +521,7 @@ def _loop( _then=block_step.outputs.artifacts["models_ckpt_meta"], _else=next_step.outputs.artifacts["models_ckpt_meta"], ) - if ( hasattr(block_step.outputs.artifacts["models_ckpt_data"], "local_path") and + if ( hasattr(block_step.outputs.artifacts["models_ckpt_data"], "local_path") or hasattr(next_step.outputs.artifacts["models_ckpt_data"], "local_path") ): steps.outputs.artifacts["models_ckpt_data"].from_expression = if_expression( @@ -529,7 +529,7 @@ def _loop( _then=block_step.outputs.artifacts["models_ckpt_data"], _else=next_step.outputs.artifacts["models_ckpt_data"], ) - if ( hasattr(block_step.outputs.artifacts["models_ckpt_index"], "local_path") and + if ( hasattr(block_step.outputs.artifacts["models_ckpt_index"], "local_path") or 
hasattr(next_step.outputs.artifacts["models_ckpt_index"], "local_path") ): steps.outputs.artifacts["models_ckpt_index"].from_expression = if_expression( From ba7fc224ad76a41b271c59650c25bbd238406bc1 Mon Sep 17 00:00:00 2001 From: jiongwalai <843497845@qq.com> Date: Wed, 11 Jun 2025 12:42:31 +0800 Subject: [PATCH 28/49] fix run nvnmd command && fix model_ckpt input and output --- dpgen2/flow/dpgen_loop.py | 115 ++++++++-------------- dpgen2/op/run_nvnmd.py | 2 +- dpgen2/op/run_nvnmd_train.py | 5 +- dpgen2/superop/block.py | 4 +- dpgen2/utils/download_dpgen2_artifacts.py | 6 ++ 5 files changed, 55 insertions(+), 77 deletions(-) diff --git a/dpgen2/flow/dpgen_loop.py b/dpgen2/flow/dpgen_loop.py index 947e461c..db074d97 100644 --- a/dpgen2/flow/dpgen_loop.py +++ b/dpgen2/flow/dpgen_loop.py @@ -380,35 +380,20 @@ def _loop( "explore_config": steps.inputs.parameters["explore_config"], "expl_task_grp": steps.inputs.parameters["expl_task_grp"], } - if hasattr( steps.inputs.artifacts["init_models_ckpt_meta"], "local_path" ) and \ - hasattr( steps.inputs.artifacts["init_models_ckpt_data"], "local_path" ) and \ - hasattr( steps.inputs.artifacts["init_models_ckpt_index"], "local_path" ): - block_step = Step( - name=name + "-block", - template=block_op, - parameters=block_common_parameters, - artifacts={ - "init_models": steps.inputs.artifacts["init_models"], - "init_models_ckpt_meta": steps.inputs.artifacts["init_models_ckpt_meta"], - "init_models_ckpt_index": steps.inputs.artifacts["init_models_ckpt_index"], - "init_models_ckpt_data": steps.inputs.artifacts["init_models_ckpt_data"], - "init_data": steps.inputs.artifacts["init_data"], - "iter_data": steps.inputs.artifacts["iter_data"], - }, - key=step_keys["block"], - ) - else: - block_step = Step( - name=name + "-block", - template=block_op, - parameters=block_common_parameters, - artifacts={ - "init_models": steps.inputs.artifacts["init_models"], - "init_data": steps.inputs.artifacts["init_data"], - "iter_data": 
steps.inputs.artifacts["iter_data"], - }, - key=step_keys["block"], - ) + block_step = Step( + name=name + "-block", + template=block_op, + parameters=block_common_parameters, + artifacts={ + "init_models": steps.inputs.artifacts["init_models"], + "init_models_ckpt_meta": steps.inputs.artifacts["init_models_ckpt_meta"], + "init_models_ckpt_index": steps.inputs.artifacts["init_models_ckpt_index"], + "init_models_ckpt_data": steps.inputs.artifacts["init_models_ckpt_data"], + "init_data": steps.inputs.artifacts["init_data"], + "iter_data": steps.inputs.artifacts["iter_data"], + }, + key=step_keys["block"], + ) steps.add(block_step) scheduler_step = Step( @@ -470,9 +455,9 @@ def _loop( ), "expl_task_grp": scheduler_step.outputs.parameters["expl_task_grp"], } - if hasattr( block_step.outputs.artifacts["models_ckpt_meta"], "local_path" ) and \ - hasattr( block_step.outputs.artifacts["models_ckpt_data"], "local_path" ) and \ - hasattr( block_step.outputs.artifacts["models_ckpt_index"], "local_path" ): + if (hasattr(block_step.outputs.artifacts["models_ckpt_meta"]._from, "path") and + hasattr(block_step.outputs.artifacts["models_ckpt_data"]._from, "path") and + hasattr(block_step.outputs.artifacts["models_ckpt_index"]._from, "path")): next_step = Step( name=name + "-next", template=steps, @@ -513,25 +498,25 @@ def _loop( _then=block_step.outputs.artifacts["models"], _else=next_step.outputs.artifacts["models"], ) - if ( hasattr(block_step.outputs.artifacts["models_ckpt_meta"], "local_path") or - hasattr(next_step.outputs.artifacts["models_ckpt_meta"], "local_path") - ): + if ( hasattr(block_step.outputs.artifacts["models_ckpt_meta"]._from, "path") or + hasattr(next_step.outputs.artifacts["models_ckpt_meta"]._from, "path") + ): steps.outputs.artifacts["models_ckpt_meta"].from_expression = if_expression( _if=(scheduler_step.outputs.parameters["converged"] == True), _then=block_step.outputs.artifacts["models_ckpt_meta"], _else=next_step.outputs.artifacts["models_ckpt_meta"], ) 
- if ( hasattr(block_step.outputs.artifacts["models_ckpt_data"], "local_path") or - hasattr(next_step.outputs.artifacts["models_ckpt_data"], "local_path") + if ( hasattr(block_step.outputs.artifacts["models_ckpt_data"]._from, "path") or + hasattr(next_step.outputs.artifacts["models_ckpt_data"]._from, "path") ): steps.outputs.artifacts["models_ckpt_data"].from_expression = if_expression( _if=(scheduler_step.outputs.parameters["converged"] == True), _then=block_step.outputs.artifacts["models_ckpt_data"], _else=next_step.outputs.artifacts["models_ckpt_data"], ) - if ( hasattr(block_step.outputs.artifacts["models_ckpt_index"], "local_path") or - hasattr(next_step.outputs.artifacts["models_ckpt_index"], "local_path") - ): + if ( hasattr(block_step.outputs.artifacts["models_ckpt_index"]._from, "path") or + hasattr(next_step.outputs.artifacts["models_ckpt_index"]._from, "path") + ): steps.outputs.artifacts["models_ckpt_index"].from_expression = if_expression( _if=(scheduler_step.outputs.parameters["converged"] == True), _then=block_step.outputs.artifacts["models_ckpt_index"], @@ -616,46 +601,32 @@ def _dpgen( "optional_parameter": steps.inputs.parameters["optional_parameter"], "expl_task_grp": scheduler_step.outputs.parameters["expl_task_grp"], } - if hasattr( steps.inputs.artifacts["init_models_ckpt_meta"], "local_path" ) and \ - hasattr( steps.inputs.artifacts["init_models_ckpt_data"], "local_path" ) and \ - hasattr( steps.inputs.artifacts["init_models_ckpt_index"], "local_path" ): - loop_step = Step( - name=name + "-loop", - template=loop_op, - parameters=common_parameters, - artifacts={ - "init_models": steps.inputs.artifacts["init_models"], - "init_models_ckpt_meta": steps.inputs.artifacts["init_models_ckpt_meta"], - "init_models_ckpt_data": steps.inputs.artifacts["init_models_ckpt_data"], - "init_models_ckpt_index": steps.inputs.artifacts["init_models_ckpt_index"], - "init_data": steps.inputs.artifacts["init_data"], - "iter_data": 
steps.inputs.artifacts["iter_data"], - }, - key="--".join(["%s" % id_step.outputs.parameters["block_id"], loop_key]), - ) - else: - loop_step = Step( - name=name + "-loop", - template=loop_op, - parameters=common_parameters, - artifacts={ - "init_models": steps.inputs.artifacts["init_models"], - "init_data": steps.inputs.artifacts["init_data"], - "iter_data": steps.inputs.artifacts["iter_data"], - }, - key="--".join(["%s" % id_step.outputs.parameters["block_id"], loop_key]), - ) + + loop_step = Step( + name=name + "-loop", + template=loop_op, + parameters=common_parameters, + artifacts={ + "init_models": steps.inputs.artifacts["init_models"], + "init_models_ckpt_meta": steps.inputs.artifacts["init_models_ckpt_meta"], + "init_models_ckpt_data": steps.inputs.artifacts["init_models_ckpt_data"], + "init_models_ckpt_index": steps.inputs.artifacts["init_models_ckpt_index"], + "init_data": steps.inputs.artifacts["init_data"], + "iter_data": steps.inputs.artifacts["iter_data"], + }, + key="--".join(["%s" % id_step.outputs.parameters["block_id"], loop_key]), + ) steps.add(loop_step) steps.outputs.parameters[ "exploration_scheduler" ].value_from_parameter = loop_step.outputs.parameters["exploration_scheduler"] steps.outputs.artifacts["models"]._from = loop_step.outputs.artifacts["models"] - if hasattr(loop_step.outputs.artifacts["models_ckpt_meta"], "local_path"): + if hasattr(loop_step.outputs.artifacts["models_ckpt_meta"]._from, "path"): steps.outputs.artifacts["models_ckpt_meta"]._from = loop_step.outputs.artifacts["models_ckpt_meta"] - if hasattr(loop_step.outputs.artifacts["models_ckpt_data"], "local_path"): + if hasattr(loop_step.outputs.artifacts["models_ckpt_data"]._from, "path"): steps.outputs.artifacts["models_ckpt_data"]._from = loop_step.outputs.artifacts["models_ckpt_data"] - if hasattr(loop_step.outputs.artifacts["models_ckpt_index"], "local_path"): + if hasattr(loop_step.outputs.artifacts["models_ckpt_index"]._from, "path"): 
steps.outputs.artifacts["models_ckpt_index"]._from = loop_step.outputs.artifacts["models_ckpt_index"] steps.outputs.artifacts["iter_data"]._from = loop_step.outputs.artifacts[ "iter_data" diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index 8c893e4d..9b1123cd 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -191,7 +191,7 @@ def execute( [ "cp", str(model_name), "model.pb", "&&", - "mylmp", "-i", lmp_input_name, + command, "-i", lmp_input_name, "-log", lmp_log_name, "-v", "rerun", "%d"%i, "&&", diff --git a/dpgen2/op/run_nvnmd_train.py b/dpgen2/op/run_nvnmd_train.py index 3f9b6fe2..96956fe6 100644 --- a/dpgen2/op/run_nvnmd_train.py +++ b/dpgen2/op/run_nvnmd_train.py @@ -73,7 +73,10 @@ def _make_train_command( if case_init_model: if isinstance(init_model, list): # initialize from model.ckpt - init_model = ".".join(str(init_model[0]).split('.')[:-1]) + #init_model = ".".join(str(init_model[0]).split('.')[:-1]) + for i in init_model: + shutil.copy(i, '.') + init_model = "model.ckpt" init_flag = "--init-model" else: # initialize from frozen model init_flag = "--init-frz-model" diff --git a/dpgen2/superop/block.py b/dpgen2/superop/block.py index fa225b3d..c8940b92 100644 --- a/dpgen2/superop/block.py +++ b/dpgen2/superop/block.py @@ -363,11 +363,9 @@ def _block_cl( block_steps.outputs.artifacts["models"]._from = prep_run_dp_train.outputs.artifacts[ "models" ] - if "models_ckpt_meta" in prep_run_dp_train.outputs.artifacts: + if isinstance(prep_run_dp_train_op, PrepRunNvNMDTrain): block_steps.outputs.artifacts["models_ckpt_meta"]._from = prep_run_dp_train.outputs.artifacts["models_ckpt_meta"] - if "models_ckpt_meta" in prep_run_dp_train.outputs.artifacts: block_steps.outputs.artifacts["models_ckpt_data"]._from = prep_run_dp_train.outputs.artifacts["models_ckpt_data"] - if "models_ckpt_meta" in prep_run_dp_train.outputs.artifacts: block_steps.outputs.artifacts["models_ckpt_index"]._from = 
prep_run_dp_train.outputs.artifacts["models_ckpt_index"] block_steps.outputs.artifacts["iter_data"]._from = collect_data.outputs.artifacts[ "iter_data" diff --git a/dpgen2/utils/download_dpgen2_artifacts.py b/dpgen2/utils/download_dpgen2_artifacts.py index b5f69153..3d46b8da 100644 --- a/dpgen2/utils/download_dpgen2_artifacts.py +++ b/dpgen2/utils/download_dpgen2_artifacts.py @@ -54,10 +54,16 @@ def add_output( op_download_setting = { "prep-run-train": DownloadDefinition() .add_input("init_models") + .add_input("init_model_ckpt_meta") + .add_input("init_model_ckpt_data") + .add_input("init_model_ckpt_index") .add_input("init_data") .add_input("iter_data") .add_output("scripts") .add_output("models") + .add_output("models_ckpt_meta") + .add_output("models_ckpt_data") + .add_output("models_ckpt_index") .add_output("logs") .add_output("lcurves"), "prep-run-explore": DownloadDefinition() From e329de93097e23e8eedfffce34a65bcdb3198dd0 Mon Sep 17 00:00:00 2001 From: jiongwalai <843497845@qq.com> Date: Wed, 11 Jun 2025 13:02:33 +0800 Subject: [PATCH 29/49] fix nvnmd test unit --- dpgen2/op/run_nvnmd_train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dpgen2/op/run_nvnmd_train.py b/dpgen2/op/run_nvnmd_train.py index 96956fe6..18acf117 100644 --- a/dpgen2/op/run_nvnmd_train.py +++ b/dpgen2/op/run_nvnmd_train.py @@ -75,7 +75,8 @@ def _make_train_command( if isinstance(init_model, list): # initialize from model.ckpt #init_model = ".".join(str(init_model[0]).split('.')[:-1]) for i in init_model: - shutil.copy(i, '.') + if(os.path.exists(i)): + shutil.copy(i, '.') init_model = "model.ckpt" init_flag = "--init-model" else: # initialize from frozen model From fcb77474ac35054c418e4847de46dc453a9c5141 Mon Sep 17 00:00:00 2001 From: jiongwalai <843497845@qq.com> Date: Wed, 11 Jun 2025 13:46:24 +0800 Subject: [PATCH 30/49] fix dl test unit --- tests/utils/test_dl_dpgen2_arti.py | 32 +++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) 
diff --git a/tests/utils/test_dl_dpgen2_arti.py b/tests/utils/test_dl_dpgen2_arti.py index c1166678..07e0ab04 100644 --- a/tests/utils/test_dl_dpgen2_arti.py +++ b/tests/utils/test_dl_dpgen2_arti.py @@ -67,7 +67,22 @@ def test_train_download(self, mocked_dl): skip_exists=True, ), mock.call( - "arti-init_data", + "arti-init_models", + path=Path("foo/iter-000000/prep-run-train/inputs"), + skip_exists=True, + ), + mock.call( + "arti-init_models_ckpt_meta", + path=Path("foo/iter-000000/prep-run-train/inputs"), + skip_exists=True, + ), + mock.call( + "arti-init_models_ckpt_data", + path=Path("foo/iter-000000/prep-run-train/inputs"), + skip_exists=True, + ), + mock.call( + "arti-init_models_ckpt_index", path=Path("foo/iter-000000/prep-run-train/inputs"), skip_exists=True, ), @@ -86,6 +101,21 @@ def test_train_download(self, mocked_dl): path=Path("foo/iter-000000/prep-run-train/outputs"), skip_exists=True, ), + mock.call( + "arti-models_ckpt_meta", + path=Path("foo/iter-000000/prep-run-train/outputs"), + skip_exists=True, + ), + mock.call( + "arti-models_ckpt_data", + path=Path("foo/iter-000000/prep-run-train/outputs"), + skip_exists=True, + ), + mock.call( + "arti-models_ckpt_index", + path=Path("foo/iter-000000/prep-run-train/outputs"), + skip_exists=True, + ), mock.call( "arti-logs", path=Path("foo/iter-000000/prep-run-train/outputs"), From b8bdce639c905c4f2ee537dd862b104cf8bd47c4 Mon Sep 17 00:00:00 2001 From: jiongwalai <843497845@qq.com> Date: Wed, 11 Jun 2025 13:55:03 +0800 Subject: [PATCH 31/49] fix dl test unit --- tests/utils/test_dl_dpgen2_arti.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/utils/test_dl_dpgen2_arti.py b/tests/utils/test_dl_dpgen2_arti.py index 07e0ab04..9872ef18 100644 --- a/tests/utils/test_dl_dpgen2_arti.py +++ b/tests/utils/test_dl_dpgen2_arti.py @@ -297,6 +297,21 @@ def test_update_finished_steps_none_steps(self, mocked_dl): path=Path("iter-000000/prep-run-train/inputs"), skip_exists=True, ), + 
mock.call( + "arti-init_models_ckpt_meta", + path=Path("iter-000000/prep-run-train/inputs"), + skip_exists=True, + ), + mock.call( + "arti-init_models_ckpt_data", + path=Path("iter-000000/prep-run-train/inputs"), + skip_exists=True, + ), + mock.call( + "arti-init_models_ckpt_index", + path=Path("iter-000000/prep-run-train/inputs"), + skip_exists=True, + ), mock.call( "arti-init_data", path=Path("iter-000000/prep-run-train/inputs"), @@ -317,6 +332,21 @@ def test_update_finished_steps_none_steps(self, mocked_dl): path=Path("iter-000000/prep-run-train/outputs"), skip_exists=True, ), + mock.call( + "arti-models_ckpt_data", + path=Path("iter-000000/prep-run-train/outputs"), + skip_exists=True, + ), + mock.call( + "arti-models_ckpt_meta", + path=Path("iter-000000/prep-run-train/outputs"), + skip_exists=True, + ), + mock.call( + "arti-models_ckpt_index", + path=Path("iter-000000/prep-run-train/outputs"), + skip_exists=True, + ), mock.call( "arti-logs", path=Path("iter-000000/prep-run-train/outputs"), From 35804c5f5eb4ad1be44c484c54950d8e32e4a73a Mon Sep 17 00:00:00 2001 From: jiongwalai <843497845@qq.com> Date: Wed, 11 Jun 2025 13:59:42 +0800 Subject: [PATCH 32/49] fix dl art --- dpgen2/utils/download_dpgen2_artifacts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dpgen2/utils/download_dpgen2_artifacts.py b/dpgen2/utils/download_dpgen2_artifacts.py index 3d46b8da..db4b7e6c 100644 --- a/dpgen2/utils/download_dpgen2_artifacts.py +++ b/dpgen2/utils/download_dpgen2_artifacts.py @@ -54,9 +54,9 @@ def add_output( op_download_setting = { "prep-run-train": DownloadDefinition() .add_input("init_models") - .add_input("init_model_ckpt_meta") - .add_input("init_model_ckpt_data") - .add_input("init_model_ckpt_index") + .add_input("init_models_ckpt_meta") + .add_input("init_models_ckpt_data") + .add_input("init_models_ckpt_index") .add_input("init_data") .add_input("iter_data") .add_output("scripts") From 26db68f4db0fbf01da05e51f9422c33d7f19ae49 Mon Sep 17 
00:00:00 2001 From: jiongwalai <843497845@qq.com> Date: Wed, 11 Jun 2025 15:33:06 +0800 Subject: [PATCH 33/49] fix dl test unit --- tests/utils/test_dl_dpgen2_arti.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/utils/test_dl_dpgen2_arti.py b/tests/utils/test_dl_dpgen2_arti.py index 9872ef18..47b88ee3 100644 --- a/tests/utils/test_dl_dpgen2_arti.py +++ b/tests/utils/test_dl_dpgen2_arti.py @@ -61,11 +61,6 @@ class TestDownloadDpgen2Artifact(unittest.TestCase): def test_train_download(self, mocked_dl): download_dpgen2_artifacts(Mockedwf(), "iter-000000--prep-run-train", "foo") expected = [ - mock.call( - "arti-init_models", - path=Path("foo/iter-000000/prep-run-train/inputs"), - skip_exists=True, - ), mock.call( "arti-init_models", path=Path("foo/iter-000000/prep-run-train/inputs"), From 849ffeb0843611427111316bc4ecfc9207d4a231 Mon Sep 17 00:00:00 2001 From: jiongwalai <843497845@qq.com> Date: Wed, 11 Jun 2025 16:36:31 +0800 Subject: [PATCH 34/49] fix dl test unit --- tests/utils/test_dl_dpgen2_arti.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/utils/test_dl_dpgen2_arti.py b/tests/utils/test_dl_dpgen2_arti.py index 47b88ee3..b9b85575 100644 --- a/tests/utils/test_dl_dpgen2_arti.py +++ b/tests/utils/test_dl_dpgen2_arti.py @@ -81,6 +81,11 @@ def test_train_download(self, mocked_dl): path=Path("foo/iter-000000/prep-run-train/inputs"), skip_exists=True, ), + mock.call( + "arti-init_data", + path=Path("foo/iter-000000/prep-run-train/inputs"), + skip_exists=True, + ), mock.call( "arti-iter_data", path=Path("foo/iter-000000/prep-run-train/inputs"), @@ -328,12 +333,12 @@ def test_update_finished_steps_none_steps(self, mocked_dl): skip_exists=True, ), mock.call( - "arti-models_ckpt_data", + "arti-models_ckpt_meta", path=Path("iter-000000/prep-run-train/outputs"), skip_exists=True, ), mock.call( - "arti-models_ckpt_meta", + "arti-models_ckpt_data", path=Path("iter-000000/prep-run-train/outputs"), skip_exists=True, ), From 
d7210b9d230f726fcac817a9ab5537f2eacf7d5f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Jun 2025 13:20:15 +0000 Subject: [PATCH 35/49] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpgen2/entrypoint/args.py | 8 +- dpgen2/entrypoint/submit.py | 12 +-- .../exploration/render/traj_render_lammps.py | 6 +- dpgen2/exploration/task/lmp/lmp_input.py | 16 ++- .../task/lmp_template_task_group.py | 34 +++--- .../task/make_task_group_from_config.py | 10 +- dpgen2/exploration/task/npt_task_group.py | 2 +- dpgen2/flow/dpgen_loop.py | 58 ++++++---- dpgen2/op/__init__.py | 13 ++- dpgen2/op/prep_nvnmd_train.py | 2 +- dpgen2/op/run_nvnmd.py | 82 ++++++++------ dpgen2/op/run_nvnmd_train.py | 97 ++++++++++------- dpgen2/superop/__init__.py | 6 +- dpgen2/superop/block.py | 18 ++-- dpgen2/superop/prep_run_nvnmd_train.py | 55 ++++++++-- tests/mocked_ops.py | 35 +++--- tests/op/test_prep_nvnmd_train.py | 17 ++- tests/op/test_run_nvnmd.py | 46 +++++--- tests/op/test_run_nvnmd_train.py | 53 ++++++--- tests/test_prep_run_nvnmd_train.py | 102 ++++++++++++++---- 20 files changed, 430 insertions(+), 242 deletions(-) diff --git a/dpgen2/entrypoint/args.py b/dpgen2/entrypoint/args.py index 7fcd91ce..1c5e13c1 100644 --- a/dpgen2/entrypoint/args.py +++ b/dpgen2/entrypoint/args.py @@ -28,12 +28,12 @@ from dpgen2.op.run_dp_train import ( RunDPTrain, ) -from dpgen2.op.run_nvnmd_train import ( - RunNvNMDTrain, -) from dpgen2.op.run_lmp import ( RunLmp, ) +from dpgen2.op.run_nvnmd_train import ( + RunNvNMDTrain, +) from dpgen2.utils import ( normalize_step_dict, step_conf_args, @@ -129,6 +129,7 @@ def dp_train_args(): ), ] + def nvnmd_train_args(): doc_numb_models = "Number of models trained for evaluating the model deviation" doc_config = "Configuration of training" @@ -174,6 +175,7 @@ def nvnmd_train_args(): ), ] + def variant_train(): doc = "the type of the training" 
return Variant( diff --git a/dpgen2/entrypoint/submit.py b/dpgen2/entrypoint/submit.py index c35c123d..f2bb1531 100644 --- a/dpgen2/entrypoint/submit.py +++ b/dpgen2/entrypoint/submit.py @@ -104,16 +104,16 @@ PrepCalyInput, PrepCalyModelDevi, PrepDPTrain, - PrepNvNMDTrain, PrepLmp, + PrepNvNMDTrain, PrepRelax, RunCalyDPOptim, RunCalyModelDevi, RunDPTrain, - RunNvNMDTrain, RunLmp, RunLmpHDF5, RunNvNMD, + RunNvNMDTrain, RunRelax, RunRelaxHDF5, SelectConfs, @@ -126,9 +126,9 @@ PrepRunCaly, PrepRunDiffCSP, PrepRunDPTrain, - PrepRunNvNMDTrain, PrepRunFp, PrepRunLmp, + PrepRunNvNMDTrain, ) from dpgen2.superop.caly_evo_step import ( CalyEvoStep, @@ -216,7 +216,7 @@ def make_concurrent_learning_op( prep_config=prep_explore_config, run_config=run_explore_config, upload_python_packages=upload_python_packages, - ) + ) elif "calypso" in explore_style: expl_mode = explore_style.split(":")[-1] if ":" in explore_style else "default" if expl_mode == "merge": @@ -531,7 +531,7 @@ def workflow_concurrent_learning( else None ) config["train"]["numb_models"] = 1 - + elif train_style == "dp-nvnmd": init_models_paths = config["train"].get("init_models_paths", None) numb_models = config["train"]["numb_models"] @@ -540,7 +540,7 @@ def workflow_concurrent_learning( f"{len(init_models_paths)} init models provided, which does " "not match numb_models={numb_models}" ) - + else: raise RuntimeError(f"unknown params, train_style: {train_style}") diff --git a/dpgen2/exploration/render/traj_render_lammps.py b/dpgen2/exploration/render/traj_render_lammps.py index 8fec2744..06a638ea 100644 --- a/dpgen2/exploration/render/traj_render_lammps.py +++ b/dpgen2/exploration/render/traj_render_lammps.py @@ -126,8 +126,10 @@ def get_confs( traj = StringIO(trajs[ii].get_data()) # type: ignore else: traj = trajs[ii] - #ss = dpdata.System(traj, fmt=traj_fmt, type_map=type_map) - ss = read(str(traj), format="lammps-dump-text", index=":", specorder=type_map) + # ss = dpdata.System(traj, fmt=traj_fmt, 
type_map=type_map) + ss = read( + str(traj), format="lammps-dump-text", index=":", specorder=type_map + ) for jj in id_selected[ii]: s = dpdata.System(ss[jj], fmt="ase/structure", type_map=type_map) s.nopbc = self.nopbc diff --git a/dpgen2/exploration/task/lmp/lmp_input.py b/dpgen2/exploration/task/lmp/lmp_input.py index 777aef3a..07fba442 100644 --- a/dpgen2/exploration/task/lmp/lmp_input.py +++ b/dpgen2/exploration/task/lmp/lmp_input.py @@ -114,9 +114,7 @@ def make_lmp_input( model_devi_file_name, ) elif nvnmd_version is not None: - ret += "pair_style nvnmd %s\n" % ( - "model.pb" - ) + ret += "pair_style nvnmd %s\n" % ("model.pb") else: # 1.x keywords = "" @@ -151,7 +149,7 @@ def make_lmp_input( ) ret += "restart 10000 dpgen.restart\n" ret += "\n" - if(nvnmd_version is not None): + if nvnmd_version is not None: ret += 'if "${rerun} > 0" then "jump SELF rerun"\n' if pka_e is None: ret += 'if "${restart} == 0" then "velocity all create ${TEMP} %d"' % ( @@ -200,9 +198,9 @@ def make_lmp_input( ret += "\n" ret += "timestep %f\n" % dt ret += "run ${NSTEPS} upto\n" - if(nvnmd_version is not None): - ret += 'jump SELF end\n' - ret += 'label rerun\n' - ret += 'rerun %s.0 dump x y z fx fy fz add yes\n' % lmp_traj_name - ret += 'label end\n' + if nvnmd_version is not None: + ret += "jump SELF end\n" + ret += "label rerun\n" + ret += "rerun %s.0 dump x y z fx fy fz add yes\n" % lmp_traj_name + ret += "label end\n" return ret diff --git a/dpgen2/exploration/task/lmp_template_task_group.py b/dpgen2/exploration/task/lmp_template_task_group.py index c740e448..27075695 100644 --- a/dpgen2/exploration/task/lmp_template_task_group.py +++ b/dpgen2/exploration/task/lmp_template_task_group.py @@ -72,7 +72,7 @@ def set_lmp( self.pimd_bead, nvnmd_version=self.nvnmd_version, ) - if(nvnmd_version is not None): + if nvnmd_version is not None: self.lmp_template = revise_lmp_input_rerun(self.lmp_template) if plm_template_fname is not None: self.plm_template = 
Path(plm_template_fname).read_text().split("\n") @@ -176,7 +176,7 @@ def revise_lmp_input_model( if pimd_bead is not None else lmp_model_devi_name ) - if(nvnmd_version is None): + if nvnmd_version is None: idx = find_only_one_key(lmp_lines, ["pair_style", "deepmd"]) lmp_lines[idx] = "pair_style deepmd %s out_freq %d out_file %s%s" % ( graph_list, @@ -188,18 +188,18 @@ def revise_lmp_input_model( idx = find_only_one_key(lmp_lines, ["pair_style", "nvnmd"]) lmp_lines[idx] = "pair_style nvnmd %s %s" % ( "model.pb", - extra_pair_style_args + extra_pair_style_args, ) - + return lmp_lines -def revise_lmp_input_dump(lmp_lines, trj_freq, pimd_bead=None,nvnmd_version=None): +def revise_lmp_input_dump(lmp_lines, trj_freq, pimd_bead=None, nvnmd_version=None): idx = find_only_one_key(lmp_lines, ["dump", "dpgen_dump"]) lmp_traj_file_name = ( lmp_pimd_traj_name % pimd_bead if pimd_bead is not None else lmp_traj_name ) - if(nvnmd_version is None): + if nvnmd_version is None: lmp_lines[ idx ] = f"dump dpgen_dump all custom {trj_freq} {lmp_traj_file_name} id type x y z" @@ -207,10 +207,7 @@ def revise_lmp_input_dump(lmp_lines, trj_freq, pimd_bead=None,nvnmd_version=None lmp_lines[ idx ] = f"dump dpgen_dump all custom {trj_freq} {lmp_traj_file_name} id type x y z fx fy fz" - lmp_lines.insert( - idx+1, - 'if \"${rerun} > 0\" then \"jump SELF rerun\"' - ) + lmp_lines.insert(idx + 1, 'if "${rerun} > 0" then "jump SELF rerun"') return lmp_lines @@ -222,19 +219,12 @@ def revise_lmp_input_plm(lmp_lines, in_plm, out_plm="output.plumed"): ) return lmp_lines + def revise_lmp_input_rerun(lmp_lines): - lmp_lines.append( - 'jump SELF end' - ) - lmp_lines.append( - 'label rerun' - ) - lmp_lines.append( - f'rerun {lmp_traj_name}.0 dump x y z fx fy fz add yes' - ) - lmp_lines.append( - 'label end' - ) + lmp_lines.append("jump SELF end") + lmp_lines.append("label rerun") + lmp_lines.append(f"rerun {lmp_traj_name}.0 dump x y z fx fy fz add yes") + lmp_lines.append("label end") return lmp_lines diff 
--git a/dpgen2/exploration/task/make_task_group_from_config.py b/dpgen2/exploration/task/make_task_group_from_config.py index 05bfac30..b8113e21 100644 --- a/dpgen2/exploration/task/make_task_group_from_config.py +++ b/dpgen2/exploration/task/make_task_group_from_config.py @@ -298,7 +298,11 @@ def variant_task_group(): "lmp-md", dict, npt_task_group_args(), alias=["lmp-npt"], doc=doc_lmp_md ), Argument( - "lmp-nvnmd", dict, npt_task_group_args(), alias=["lmp-nvnmd-npt"], doc=doc_lmp_md + "lmp-nvnmd", + dict, + npt_task_group_args(), + alias=["lmp-nvnmd-npt"], + doc=doc_lmp_md, ), Argument( "lmp-template", @@ -627,7 +631,7 @@ def make_lmp_task_group_from_config( config["conf_idx"] = [] if "conf_idx" not in config else None config = lmp_normalize(config) config = config_strip_confidx(config) - + if config["type"] == "lmp-md": tgroup = NPTTaskGroup() config.pop("type") @@ -663,7 +667,7 @@ def make_lmp_task_group_from_config( numb_models, lmp_template, **config, - ) + ) elif config["type"] == "customized-lmp-template": tgroup = CustomizedLmpTemplateTaskGroup() config.pop("type") diff --git a/dpgen2/exploration/task/npt_task_group.py b/dpgen2/exploration/task/npt_task_group.py index e597071b..c66d985c 100644 --- a/dpgen2/exploration/task/npt_task_group.py +++ b/dpgen2/exploration/task/npt_task_group.py @@ -134,7 +134,7 @@ def _make_lmp_task( self.ele_temp_f, self.ele_temp_a, self.no_pbc, - nvnmd_version = self.nvnmd_version, + nvnmd_version=self.nvnmd_version, trj_seperate_files=False, pimd_bead=self.pimd_bead, ), diff --git a/dpgen2/flow/dpgen_loop.py b/dpgen2/flow/dpgen_loop.py index db074d97..c5ea0b79 100644 --- a/dpgen2/flow/dpgen_loop.py +++ b/dpgen2/flow/dpgen_loop.py @@ -186,8 +186,8 @@ def __init__( } self._input_artifacts = { "init_models": InputArtifact(optional=True), - "init_models_ckpt_meta": InputArtifact(optional=True), - "init_models_ckpt_data": InputArtifact(optional=True), + "init_models_ckpt_meta": InputArtifact(optional=True), + 
"init_models_ckpt_data": InputArtifact(optional=True), "init_models_ckpt_index": InputArtifact(optional=True), "init_data": InputArtifact(), "iter_data": InputArtifact(), @@ -455,18 +455,26 @@ def _loop( ), "expl_task_grp": scheduler_step.outputs.parameters["expl_task_grp"], } - if (hasattr(block_step.outputs.artifacts["models_ckpt_meta"]._from, "path") and - hasattr(block_step.outputs.artifacts["models_ckpt_data"]._from, "path") and - hasattr(block_step.outputs.artifacts["models_ckpt_index"]._from, "path")): + if ( + hasattr(block_step.outputs.artifacts["models_ckpt_meta"]._from, "path") + and hasattr(block_step.outputs.artifacts["models_ckpt_data"]._from, "path") + and hasattr(block_step.outputs.artifacts["models_ckpt_index"]._from, "path") + ): next_step = Step( name=name + "-next", template=steps, parameters=next_common_parameters, artifacts={ "init_models": block_step.outputs.artifacts["models"], - "init_models_ckpt_meta": block_step.outputs.artifacts["models_ckpt_meta"], - "init_models_ckpt_index": block_step.outputs.artifacts["models_ckpt_index"], - "init_models_ckpt_data": block_step.outputs.artifacts["models_ckpt_data"], + "init_models_ckpt_meta": block_step.outputs.artifacts[ + "models_ckpt_meta" + ], + "init_models_ckpt_index": block_step.outputs.artifacts[ + "models_ckpt_index" + ], + "init_models_ckpt_data": block_step.outputs.artifacts[ + "models_ckpt_data" + ], "init_data": steps.inputs.artifacts["init_data"], "iter_data": block_step.outputs.artifacts["iter_data"], }, @@ -498,25 +506,25 @@ def _loop( _then=block_step.outputs.artifacts["models"], _else=next_step.outputs.artifacts["models"], ) - if ( hasattr(block_step.outputs.artifacts["models_ckpt_meta"]._from, "path") or - hasattr(next_step.outputs.artifacts["models_ckpt_meta"]._from, "path") - ): + if hasattr( + block_step.outputs.artifacts["models_ckpt_meta"]._from, "path" + ) or hasattr(next_step.outputs.artifacts["models_ckpt_meta"]._from, "path"): 
steps.outputs.artifacts["models_ckpt_meta"].from_expression = if_expression( _if=(scheduler_step.outputs.parameters["converged"] == True), _then=block_step.outputs.artifacts["models_ckpt_meta"], _else=next_step.outputs.artifacts["models_ckpt_meta"], ) - if ( hasattr(block_step.outputs.artifacts["models_ckpt_data"]._from, "path") or - hasattr(next_step.outputs.artifacts["models_ckpt_data"]._from, "path") - ): + if hasattr( + block_step.outputs.artifacts["models_ckpt_data"]._from, "path" + ) or hasattr(next_step.outputs.artifacts["models_ckpt_data"]._from, "path"): steps.outputs.artifacts["models_ckpt_data"].from_expression = if_expression( _if=(scheduler_step.outputs.parameters["converged"] == True), _then=block_step.outputs.artifacts["models_ckpt_data"], _else=next_step.outputs.artifacts["models_ckpt_data"], ) - if ( hasattr(block_step.outputs.artifacts["models_ckpt_index"]._from, "path") or - hasattr(next_step.outputs.artifacts["models_ckpt_index"]._from, "path") - ): + if hasattr( + block_step.outputs.artifacts["models_ckpt_index"]._from, "path" + ) or hasattr(next_step.outputs.artifacts["models_ckpt_index"]._from, "path"): steps.outputs.artifacts["models_ckpt_index"].from_expression = if_expression( _if=(scheduler_step.outputs.parameters["converged"] == True), _then=block_step.outputs.artifacts["models_ckpt_index"], @@ -608,8 +616,8 @@ def _dpgen( parameters=common_parameters, artifacts={ "init_models": steps.inputs.artifacts["init_models"], - "init_models_ckpt_meta": steps.inputs.artifacts["init_models_ckpt_meta"], - "init_models_ckpt_data": steps.inputs.artifacts["init_models_ckpt_data"], + "init_models_ckpt_meta": steps.inputs.artifacts["init_models_ckpt_meta"], + "init_models_ckpt_data": steps.inputs.artifacts["init_models_ckpt_data"], "init_models_ckpt_index": steps.inputs.artifacts["init_models_ckpt_index"], "init_data": steps.inputs.artifacts["init_data"], "iter_data": steps.inputs.artifacts["iter_data"], @@ -623,11 +631,17 @@ def _dpgen( 
].value_from_parameter = loop_step.outputs.parameters["exploration_scheduler"] steps.outputs.artifacts["models"]._from = loop_step.outputs.artifacts["models"] if hasattr(loop_step.outputs.artifacts["models_ckpt_meta"]._from, "path"): - steps.outputs.artifacts["models_ckpt_meta"]._from = loop_step.outputs.artifacts["models_ckpt_meta"] + steps.outputs.artifacts["models_ckpt_meta"]._from = loop_step.outputs.artifacts[ + "models_ckpt_meta" + ] if hasattr(loop_step.outputs.artifacts["models_ckpt_data"]._from, "path"): - steps.outputs.artifacts["models_ckpt_data"]._from = loop_step.outputs.artifacts["models_ckpt_data"] + steps.outputs.artifacts["models_ckpt_data"]._from = loop_step.outputs.artifacts[ + "models_ckpt_data" + ] if hasattr(loop_step.outputs.artifacts["models_ckpt_index"]._from, "path"): - steps.outputs.artifacts["models_ckpt_index"]._from = loop_step.outputs.artifacts["models_ckpt_index"] + steps.outputs.artifacts[ + "models_ckpt_index" + ]._from = loop_step.outputs.artifacts["models_ckpt_index"] steps.outputs.artifacts["iter_data"]._from = loop_step.outputs.artifacts[ "iter_data" ] diff --git a/dpgen2/op/__init__.py b/dpgen2/op/__init__.py index 95f7f9c2..a2d43f58 100644 --- a/dpgen2/op/__init__.py +++ b/dpgen2/op/__init__.py @@ -19,12 +19,12 @@ from .prep_dp_train import ( PrepDPTrain, ) -from .prep_nvnmd_train import ( - PrepNvNMDTrain, -) from .prep_lmp import ( PrepLmp, ) +from .prep_nvnmd_train import ( + PrepNvNMDTrain, +) from .prep_relax import ( PrepRelax, ) @@ -37,9 +37,6 @@ from .run_dp_train import ( RunDPTrain, ) -from .run_nvnmd_train import ( - RunNvNMDTrain, -) from .run_lmp import ( RunLmp, RunLmpHDF5, @@ -47,7 +44,9 @@ from .run_nvnmd import ( RunNvNMD, ) - +from .run_nvnmd_train import ( + RunNvNMDTrain, +) from .run_relax import ( RunRelax, RunRelaxHDF5, diff --git a/dpgen2/op/prep_nvnmd_train.py b/dpgen2/op/prep_nvnmd_train.py index 45a7e121..56452514 100644 --- a/dpgen2/op/prep_nvnmd_train.py +++ b/dpgen2/op/prep_nvnmd_train.py @@ 
-112,7 +112,7 @@ def _script_rand_seed( input_dict, ): jtmp = input_dict.copy() - + # the key "seed" in "nvnmd" is used to set the random seed for the network parameters, it is developing. jtmp["nvnmd"]["seed"] = random.randrange(sys.maxsize) % (2**32) jtmp["training"]["seed"] = random.randrange(sys.maxsize) % (2**32) diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index 9b1123cd..d6882a61 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -1,10 +1,10 @@ import glob +import itertools import json import logging import os import random import re -import itertools from pathlib import ( Path, ) @@ -16,6 +16,9 @@ ) import numpy as np +from ase.io import ( + read, +) from dargs import ( Argument, ArgumentEncoder, @@ -44,6 +47,9 @@ plm_output_name, pytorch_model_name_pattern, ) +from dpgen2.op.run_caly_model_devi import ( + write_model_devi_out, +) from dpgen2.utils import ( BinaryFileInput, set_directory, @@ -51,10 +57,6 @@ from dpgen2.utils.run_command import ( run_command, ) -from dpgen2.op.run_caly_model_devi import ( - write_model_devi_out, -) -from ase.io import read class RunNvNMD(OP): @@ -151,9 +153,7 @@ def execute( try: Path(iname).symlink_to(ii) except: - logging.warning( - "failed to link %s, maybe already linked" % iname - ) + logging.warning("failed to link %s, maybe already linked" % iname) pass # link models model_names = [] @@ -168,7 +168,7 @@ def execute( "failed to link %s, maybe already linked" % mname ) pass - + elif ext == ".pt": # freeze model mname = pytorch_model_name_pattern % (idx) @@ -189,13 +189,22 @@ def execute( [ " ".join( [ - "cp", str(model_name), "model.pb", + "cp", + str(model_name), + "model.pb", + "&&", + command, + "-i", + lmp_input_name, + "-log", + lmp_log_name, + "-v", + "rerun", + "%d" % i, "&&", - command, "-i", lmp_input_name, - "-log", lmp_log_name, - "-v", "rerun", "%d"%i, - "&&", - "cp", lmp_traj_name, lmp_traj_name+".%d"%i + "cp", + lmp_traj_name, + lmp_traj_name + ".%d" % i, ] ) for i, 
model_name in enumerate(models) @@ -230,9 +239,11 @@ def execute( with open("job.json", "w") as f: json.dump(data, f, indent=4) merge_pimd_files() - + if os.path.exists(lmp_traj_name): - calc_model_devi([lmp_traj_name+f".{i}" for i in range(len(model_names))]) + calc_model_devi( + [lmp_traj_name + f".{i}" for i in range(len(model_names))] + ) ret_dict = { "log": work_dir / lmp_log_name, @@ -423,40 +434,51 @@ def merge_pimd_files(): with open(model_devi_file, "r") as f2: f.write(f2.read()) + def calc_model_devi( traj_files, - fname = "model_devi.out", + fname="model_devi.out", ): - trajectories = [] for f in traj_files: - traj = read(f, format='lammps-dump-text', index=':', order=True) + traj = read(f, format="lammps-dump-text", index=":", order=True) trajectories.append(traj) - + num_frames = len(trajectories[0]) for traj in trajectories: assert len(traj) == num_frames, "Not match" - + devi = [] for frame_idx in range(num_frames): frames = [traj[frame_idx] for traj in trajectories] - + all_forces = [atoms.get_forces() for atoms in frames] all_errors = [] - + for atom_idx in range(len(frames[0])): forces = [forces_arr[atom_idx] for forces_arr in all_forces] - + for a, b in itertools.combinations(forces, 2): error = np.linalg.norm(a - b) all_errors.append(error) - + max_error = np.max(all_errors) if all_errors else 0.0 min_error = np.min(all_errors) if all_errors else 0.0 - avg_error = np.mean(all_errors) if all_errors else 0.0 + avg_error = np.mean(all_errors) if all_errors else 0.0 # ase verion >= 3.26.0, please update ase using "pip install git+https://gitlab.com/ase/ase.git" - devi.append([trajectories[0][frame_idx].info['timestep'],0,0,0,max_error, min_error, avg_error,0]) - + devi.append( + [ + trajectories[0][frame_idx].info["timestep"], + 0, + 0, + 0, + max_error, + min_error, + avg_error, + 0, + ] + ) + devi = np.array(devi) - write_model_devi_out(devi, fname=fname) + write_model_devi_out(devi, fname=fname) diff --git a/dpgen2/op/run_nvnmd_train.py 
b/dpgen2/op/run_nvnmd_train.py index 18acf117..a19c7c8b 100644 --- a/dpgen2/op/run_nvnmd_train.py +++ b/dpgen2/op/run_nvnmd_train.py @@ -1,9 +1,9 @@ +import copy import glob import json import logging import os import shutil -import copy from pathlib import ( Path, ) @@ -35,9 +35,9 @@ ) from dpgen2.constants import ( - train_script_name, train_cnn_script_name, train_qnn_script_name, + train_script_name, train_task_pattern, ) from dpgen2.utils.chdir import ( @@ -55,42 +55,47 @@ def _make_train_command( init_model, train_args="", ): - # find checkpoint - if os.path.isfile("nvnmd_cnn/checkpoint") and not os.path.isfile("nvnmd_cnn/frozen_model.pb"): + if os.path.isfile("nvnmd_cnn/checkpoint") and not os.path.isfile( + "nvnmd_cnn/frozen_model.pb" + ): checkpoint = "nvnmd_cnn/model.ckpt" else: checkpoint = None - + # case of restart if checkpoint is not None: - command = dp_command + ["train-nvnmd", "--restart", checkpoint, train_script_name] + command = dp_command + [ + "train-nvnmd", + "--restart", + checkpoint, + train_script_name, + ] return command - + # case of init model assert checkpoint is None case_init_model = do_init_model if case_init_model: - - if isinstance(init_model, list): # initialize from model.ckpt - #init_model = ".".join(str(init_model[0]).split('.')[:-1]) + if isinstance(init_model, list): # initialize from model.ckpt + # init_model = ".".join(str(init_model[0]).split('.')[:-1]) for i in init_model: - if(os.path.exists(i)): - shutil.copy(i, '.') + if os.path.exists(i): + shutil.copy(i, ".") init_model = "model.ckpt" init_flag = "--init-model" - else: # initialize from frozen model + else: # initialize from frozen model init_flag = "--init-frz-model" - + command = dp_command + [ - "train-nvnmd", - init_flag, - str(init_model), - train_script_name, - ] + "train-nvnmd", + init_flag, + str(init_model), + train_script_name, + ] else: - command = dp_command + ["train-nvnmd", train_script_name] - + command = dp_command + ["train-nvnmd", 
train_script_name] + command += train_args.split() return command @@ -196,9 +201,9 @@ def execute( task_name = ip["task_name"] task_path = ip["task_path"] init_model = ip["init_model"] - init_model_ckpt_data = ip["init_model_ckpt_data"] - init_model_ckpt_meta = ip["init_model_ckpt_meta"] - init_model_ckpt_index = ip["init_model_ckpt_index"] + init_model_ckpt_data = ip["init_model_ckpt_data"] + init_model_ckpt_meta = ip["init_model_ckpt_meta"] + init_model_ckpt_index = ip["init_model_ckpt_index"] init_data = ip["init_data"] iter_data = ip["iter_data"] valid_data = ip["valid_data"] @@ -217,7 +222,11 @@ def execute( major_version = "2" # auto prob style - init_model_ckpt = [init_model_ckpt_meta, init_model_ckpt_data, init_model_ckpt_index] + init_model_ckpt = [ + init_model_ckpt_meta, + init_model_ckpt_data, + init_model_ckpt_index, + ] do_init_model = RunNvNMDTrain.decide_init_model( config, init_model_ckpt if init_model_ckpt_data is not None else init_model, @@ -244,10 +253,18 @@ def execute( valid_data, ) train_cnn_dict = RunNvNMDTrain.write_other_to_input_script( - train_dict, config, do_init_model, False, major_version, + train_dict, + config, + do_init_model, + False, + major_version, ) train_qnn_dict = RunNvNMDTrain.write_other_to_input_script( - train_dict, config, do_init_model, True, major_version, + train_dict, + config, + do_init_model, + True, + major_version, ) with set_directory(work_dir): @@ -260,7 +277,7 @@ def clean_before_quit(): # dump train script with open(train_cnn_script_name, "w") as fp: json.dump(train_cnn_dict, fp, indent=4) - + with open(train_qnn_script_name, "w") as fp: json.dump(train_qnn_dict, fp, indent=4) @@ -274,7 +291,7 @@ def clean_before_quit(): train_cnn_script_name, do_init_model, init_model_ckpt if init_model_ckpt_data is not None else init_model, - train_args = "-s s1", + train_args="-s s1", ) if not RunNvNMDTrain.skip_training( @@ -297,31 +314,35 @@ def clean_before_quit(): ) ) raise FatalError("dp train-nvnmd -s s1 failed") 
- fplog.write("#=================== train_cnn std out ===================\n") + fplog.write( + "#=================== train_cnn std out ===================\n" + ) fplog.write(out) - fplog.write("#=================== train_cnn std err ===================\n") + fplog.write( + "#=================== train_cnn std err ===================\n" + ) fplog.write(err) - + cnn_model_file = "nvnmd_cnn/frozen_model.pb" model_ckpt_data_file = "nvnmd_cnn/model.ckpt.data-00000-of-00001" model_ckpt_index_file = "nvnmd_cnn/model.ckpt.index" model_ckpt_meta_file = "nvnmd_cnn/model.ckpt.meta" lcurve_file = "nvnmd_cnn/lcurve.out" - + else: cnn_model_file = init_model model_ckpt_data_file = "" model_ckpt_index_file = "" model_ckpt_meta_file = "" lcurve_file = "nvnmd_qnn/lcurve.out" - + # train qnn model command = _make_train_command( dp_command, train_qnn_script_name, do_init_model, init_model_ckpt if init_model_ckpt_data is not None else init_model, - train_args = "-s s2", + train_args="-s s2", ) ret, out, err = run_command(command) @@ -345,12 +366,12 @@ def clean_before_quit(): fplog.write(out) fplog.write("#=================== train_qnn std err ===================\n") fplog.write(err) - + qnn_model_file = "nvnmd_qnn/model.pb" if os.path.exists("input_v2_compat.json"): shutil.copy2("input_v2_compat.json", train_script_name) - + clean_before_quit() return OPIO( @@ -377,7 +398,7 @@ def write_data_to_input_script( valid_data: Optional[Union[List[Path], Dict[str, List[Path]]]] = None, ): odict = idict.copy() - + data_list = [str(ii) for ii in init_data] + [str(ii) for ii in iter_data] if major_version == "1": # v1 behavior diff --git a/dpgen2/superop/__init__.py b/dpgen2/superop/__init__.py index cfddabdd..50e0a5d7 100644 --- a/dpgen2/superop/__init__.py +++ b/dpgen2/superop/__init__.py @@ -10,12 +10,12 @@ from .prep_run_dp_train import ( PrepRunDPTrain, ) -from .prep_run_nvnmd_train import ( - PrepRunNvNMDTrain, -) from .prep_run_fp import ( PrepRunFp, ) from .prep_run_lmp import ( 
PrepRunLmp, ) +from .prep_run_nvnmd_train import ( + PrepRunNvNMDTrain, +) diff --git a/dpgen2/superop/block.py b/dpgen2/superop/block.py index c8940b92..81396a11 100644 --- a/dpgen2/superop/block.py +++ b/dpgen2/superop/block.py @@ -58,15 +58,15 @@ from .prep_run_dp_train import ( PrepRunDPTrain, ) -from .prep_run_nvnmd_train import ( - PrepRunNvNMDTrain, -) from .prep_run_fp import ( PrepRunFp, ) from .prep_run_lmp import ( PrepRunLmp, ) +from .prep_run_nvnmd_train import ( + PrepRunNvNMDTrain, +) block_default_optional_parameter = { "data_mixed_type": False, @@ -364,9 +364,15 @@ def _block_cl( "models" ] if isinstance(prep_run_dp_train_op, PrepRunNvNMDTrain): - block_steps.outputs.artifacts["models_ckpt_meta"]._from = prep_run_dp_train.outputs.artifacts["models_ckpt_meta"] - block_steps.outputs.artifacts["models_ckpt_data"]._from = prep_run_dp_train.outputs.artifacts["models_ckpt_data"] - block_steps.outputs.artifacts["models_ckpt_index"]._from = prep_run_dp_train.outputs.artifacts["models_ckpt_index"] + block_steps.outputs.artifacts[ + "models_ckpt_meta" + ]._from = prep_run_dp_train.outputs.artifacts["models_ckpt_meta"] + block_steps.outputs.artifacts[ + "models_ckpt_data" + ]._from = prep_run_dp_train.outputs.artifacts["models_ckpt_data"] + block_steps.outputs.artifacts[ + "models_ckpt_index" + ]._from = prep_run_dp_train.outputs.artifacts["models_ckpt_index"] block_steps.outputs.artifacts["iter_data"]._from = collect_data.outputs.artifacts[ "iter_data" ] diff --git a/dpgen2/superop/prep_run_nvnmd_train.py b/dpgen2/superop/prep_run_nvnmd_train.py index 21bd376c..29963534 100644 --- a/dpgen2/superop/prep_run_nvnmd_train.py +++ b/dpgen2/superop/prep_run_nvnmd_train.py @@ -198,8 +198,23 @@ def _prep_run_nvnmd_train( slices=Slices( "int('{{item}}')", input_parameter=["task_name"], - input_artifact=["task_path", "init_model", "init_model_ckpt_data", "init_model_ckpt_index", "init_model_ckpt_meta"], - output_artifact=["cnn_model", "qnn_model", "model_ckpt_meta", 
"model_ckpt_data", "model_ckpt_index", "lcurve", "log", "script"], + input_artifact=[ + "task_path", + "init_model", + "init_model_ckpt_data", + "init_model_ckpt_index", + "init_model_ckpt_meta", + ], + output_artifact=[ + "cnn_model", + "qnn_model", + "model_ckpt_meta", + "model_ckpt_data", + "model_ckpt_index", + "lcurve", + "log", + "script", + ], **template_slice_config, ), python_packages=upload_python_packages, @@ -215,9 +230,15 @@ def _prep_run_nvnmd_train( artifacts={ "task_path": prep_train.outputs.artifacts["task_paths"], "init_model": train_steps.inputs.artifacts["init_models"], - "init_model_ckpt_meta": train_steps.inputs.artifacts["init_models_ckpt_meta"], - "init_model_ckpt_data": train_steps.inputs.artifacts["init_models_ckpt_data"], - "init_model_ckpt_index": train_steps.inputs.artifacts["init_models_ckpt_index"], + "init_model_ckpt_meta": train_steps.inputs.artifacts[ + "init_models_ckpt_meta" + ], + "init_model_ckpt_data": train_steps.inputs.artifacts[ + "init_models_ckpt_data" + ], + "init_model_ckpt_index": train_steps.inputs.artifacts[ + "init_models_ckpt_index" + ], "init_data": train_steps.inputs.artifacts["init_data"], "iter_data": train_steps.inputs.artifacts["iter_data"], "valid_data": valid_data, @@ -242,12 +263,24 @@ def _prep_run_nvnmd_train( train_steps.outputs.artifacts["scripts"]._from = run_train.outputs.artifacts[ "script" ] - train_steps.outputs.artifacts["models"]._from = run_train.outputs.artifacts["cnn_model"] - train_steps.outputs.artifacts["nvnmodels"]._from = run_train.outputs.artifacts["qnn_model"] - train_steps.outputs.artifacts["models_ckpt_meta"]._from = run_train.outputs.artifacts["model_ckpt_meta"] - train_steps.outputs.artifacts["models_ckpt_data"]._from = run_train.outputs.artifacts["model_ckpt_data"] - train_steps.outputs.artifacts["models_ckpt_index"]._from = run_train.outputs.artifacts["model_ckpt_index"] + train_steps.outputs.artifacts["models"]._from = run_train.outputs.artifacts[ + "cnn_model" + ] + 
train_steps.outputs.artifacts["nvnmodels"]._from = run_train.outputs.artifacts[ + "qnn_model" + ] + train_steps.outputs.artifacts[ + "models_ckpt_meta" + ]._from = run_train.outputs.artifacts["model_ckpt_meta"] + train_steps.outputs.artifacts[ + "models_ckpt_data" + ]._from = run_train.outputs.artifacts["model_ckpt_data"] + train_steps.outputs.artifacts[ + "models_ckpt_index" + ]._from = run_train.outputs.artifacts["model_ckpt_index"] train_steps.outputs.artifacts["logs"]._from = run_train.outputs.artifacts["log"] - train_steps.outputs.artifacts["lcurves"]._from = run_train.outputs.artifacts["lcurve"] + train_steps.outputs.artifacts["lcurves"]._from = run_train.outputs.artifacts[ + "lcurve" + ] return train_steps diff --git a/tests/mocked_ops.py b/tests/mocked_ops.py index 543e69d7..006a31d7 100644 --- a/tests/mocked_ops.py +++ b/tests/mocked_ops.py @@ -40,11 +40,11 @@ lmp_model_devi_name, lmp_task_pattern, lmp_traj_name, - model_name_pattern, - model_ckpt_pattern, - model_ckpt_meta_pattern, model_ckpt_data_pattern, model_ckpt_index_pattern, + model_ckpt_meta_pattern, + model_ckpt_pattern, + model_name_pattern, train_log_name, train_script_name, train_task_pattern, @@ -96,15 +96,15 @@ from dpgen2.op.run_dp_train import ( RunDPTrain, ) -from dpgen2.op.run_nvnmd_train import ( - RunNvNMDTrain, -) from dpgen2.op.run_lmp import ( RunLmp, ) from dpgen2.op.run_nvnmd import ( RunNvNMD, ) +from dpgen2.op.run_nvnmd_train import ( + RunNvNMDTrain, +) from dpgen2.op.select_confs import ( SelectConfs, ) @@ -128,7 +128,7 @@ def make_mocked_init_models(numb_models): def make_mocked_init_models_ckpt(numb_models): tmp_models_ckpt = [] for ii in range(numb_models): - dir = Path(model_ckpt_pattern %ii) + dir = Path(model_ckpt_pattern % ii) dir.mkdir(exist_ok=True, parents=True) ff_meta = Path(model_ckpt_meta_pattern % ii) ff_meta.write_text(f"This is init model ckpt meta {ii}") @@ -476,13 +476,13 @@ def execute( ) copyfile(script, oscript) - + cnn_dir = Path("nvnmd_cnn") qnn_dir = 
Path("nvnmd_qnn") cnn_model = cnn_dir / Path("frozen_model.pb") qnn_model = qnn_dir / Path("model.pb") - model_ckpt_meta_file = cnn_dir / Path("model.ckpt.meta") - model_ckpt_data_file = cnn_dir / Path("model.ckpt.data-00000-of-00001") + model_ckpt_meta_file = cnn_dir / Path("model.ckpt.meta") + model_ckpt_data_file = cnn_dir / Path("model.ckpt.data-00000-of-00001") model_ckpt_index_file = cnn_dir / Path("model.ckpt.index") lcurve = cnn_dir / Path("lcurve.out") log = Path("log") @@ -492,13 +492,13 @@ def execute( f.write(f"init_model {str(init_model)} OK\n") assert init_model_ckpt_meta.exists() with log.open("a") as f: - f.write(f"init_model_ckpt_meta {str(init_model_ckpt_meta)} OK\n") + f.write(f"init_model_ckpt_meta {str(init_model_ckpt_meta)} OK\n") assert init_model_ckpt_data.exists() with log.open("a") as f: - f.write(f"init_model_ckpt_data {str(init_model_ckpt_data)} OK\n") + f.write(f"init_model_ckpt_data {str(init_model_ckpt_data)} OK\n") assert init_model_ckpt_index.exists() with log.open("a") as f: - f.write(f"init_model_ckpt_index {str(init_model_ckpt_index)} OK\n") + f.write(f"init_model_ckpt_index {str(init_model_ckpt_index)} OK\n") for ii in jtmp["data"]: assert Path(ii).exists() assert (ii in init_data_str) or (ii in iter_data_str) @@ -508,7 +508,6 @@ def execute( with log.open("a") as f: f.write(f"script {str(script)} OK\n") - cnn_dir.mkdir(exist_ok=True, parents=True) with cnn_model.open("w") as f: f.write("read from init model: \n") @@ -622,13 +621,13 @@ def execute( ) copyfile(script, oscript) - + cnn_dir = Path("nvnmd_cnn") qnn_dir = Path("nvnmd_qnn") cnn_model = cnn_dir / Path("frozen_model.pb") qnn_model = qnn_dir / Path("model.pb") - model_ckpt_meta_file = cnn_dir / Path("model.ckpt.meta") - model_ckpt_data_file = cnn_dir / Path("model.ckpt.data-00000-of-00001") + model_ckpt_meta_file = cnn_dir / Path("model.ckpt.meta") + model_ckpt_data_file = cnn_dir / Path("model.ckpt.data-00000-of-00001") model_ckpt_index_file = cnn_dir / 
Path("model.ckpt.index") lcurve = cnn_dir / Path("lcurve.out") log = Path("log") @@ -667,7 +666,7 @@ def execute( "qnn_model": work_dir / qnn_model, "model_ckpt_data": work_dir / model_ckpt_meta_file, "model_ckpt_meta": work_dir / model_ckpt_meta_file, - "model_ckpt_index": work_dir / model_ckpt_meta_file, + "model_ckpt_index": work_dir / model_ckpt_meta_file, "lcurve": work_dir / lcurve, "log": work_dir / log, } diff --git a/tests/op/test_prep_nvnmd_train.py b/tests/op/test_prep_nvnmd_train.py index 0d5fe698..3ddbbd7c 100644 --- a/tests/op/test_prep_nvnmd_train.py +++ b/tests/op/test_prep_nvnmd_train.py @@ -31,10 +31,7 @@ # isort: on template_script_nvnmd_v0 = { - "nvnmd": { - "version": 0, - "seed": 1 - }, + "nvnmd": {"version": 0, "seed": 1}, "training": { "systems": [], "stop_batch": 2000, @@ -45,10 +42,7 @@ template_script_nvnmd_v1 = { - "nvnmd": { - "version": 1, - "seed": 1 - }, + "nvnmd": {"version": 1, "seed": 1}, "training": { "systems": [], "stop_batch": 2000, @@ -88,7 +82,10 @@ def _check_output_dir_and_file_exist(self, op, numb_models): def test_template_nvnmd_v1(self): ip = OPIO( - {"template_script": template_script_nvnmd_v1, "numb_models": self.numb_models} + { + "template_script": template_script_nvnmd_v1, + "numb_models": self.numb_models, + } ) faked_rg.faked_random = -1 @@ -158,7 +155,7 @@ def test_template_raise_wrong_list_length(self): "template_script": [ template_script_nvnmd_v1, template_script_nvnmd_v0, - template_script_nvnmd_v1 + template_script_nvnmd_v1, ], "numb_models": self.numb_models, } diff --git a/tests/op/test_run_nvnmd.py b/tests/op/test_run_nvnmd.py index fedc164b..3ff65074 100644 --- a/tests/op/test_run_nvnmd.py +++ b/tests/op/test_run_nvnmd.py @@ -87,18 +87,29 @@ def test_success(self, mocked_run): self.assertEqual(out["traj"], work_dir / lmp_traj_name) self.assertEqual(out["model_devi"], work_dir / lmp_model_devi_name) # check call - models = ["models/path/model_%d.pb"%i for i in range(len(self.models))] + models = 
["models/path/model_%d.pb" % i for i in range(len(self.models))] calls = [ call( " ; ".join( [ " ".join( [ - "cp", model_name, "model.pb", "&&", - "mylmp", "-i", lmp_input_name, - "-log", lmp_log_name, - "-v", "rerun", "%d"%i, "&&", - "cp", lmp_traj_name, lmp_traj_name+".%d"%i + "cp", + model_name, + "model.pb", + "&&", + "mylmp", + "-i", + lmp_input_name, + "-log", + lmp_log_name, + "-v", + "rerun", + "%d" % i, + "&&", + "cp", + lmp_traj_name, + lmp_traj_name + ".%d" % i, ] ) for i, model_name in enumerate(models) @@ -132,18 +143,29 @@ def test_error(self, mocked_run): ) ) # check call - models = ["models/path/model_%d.pb"%i for i in range(len(self.models))] + models = ["models/path/model_%d.pb" % i for i in range(len(self.models))] calls = [ call( " ; ".join( [ " ".join( [ - "cp", model_name, "model.pb", "&&", - "mylmp", "-i", lmp_input_name, - "-log", lmp_log_name, - "-v", "rerun", "%d"%i, "&&", - "cp", lmp_traj_name, lmp_traj_name+".%d"%i + "cp", + model_name, + "model.pb", + "&&", + "mylmp", + "-i", + lmp_input_name, + "-log", + lmp_log_name, + "-v", + "rerun", + "%d" % i, + "&&", + "cp", + lmp_traj_name, + lmp_traj_name + ".%d" % i, ] ) for i, model_name in enumerate(models) diff --git a/tests/op/test_run_nvnmd_train.py b/tests/op/test_run_nvnmd_train.py index 2e1495d8..7b20f511 100644 --- a/tests/op/test_run_nvnmd_train.py +++ b/tests/op/test_run_nvnmd_train.py @@ -134,7 +134,7 @@ def setUp(self): "auto_prob": "prob_sys_size", }, "disp_file": "lcurve.out", - "save_ckpt": "model.ckpt" + "save_ckpt": "model.ckpt", }, "learning_rate": { "start_lr": 1.0, @@ -201,7 +201,7 @@ def setUp(self): "batch_size": "auto", "auto_prob_style": "prob_sys_size", "disp_file": "lcurve.out", - "save_ckpt": "model.ckpt" + "save_ckpt": "model.ckpt", }, "learning_rate": { "start_lr": 1.0, @@ -384,7 +384,7 @@ def test_update_input_dict_v2(self): config = self.config.copy() config["init_model_policy"] = "no" odict = RunNvNMDTrain.write_other_to_input_script( - odict, config, False, 
False,major_version="2" + odict, config, False, False, major_version="2" ) self.assertDictEqual(odict, self.expected_odict_v2) @@ -421,15 +421,20 @@ def test_exec_v1(self, mocked_run): self.assertEqual(out["script"], work_dir / train_cnn_script_name) self.assertEqual(out["cnn_model"], work_dir / "nvnmd_cnn/frozen_model.pb") self.assertEqual(out["qnn_model"], work_dir / "nvnmd_qnn/model.pb") - self.assertEqual(out["model_ckpt_data"], work_dir / "nvnmd_cnn/model.ckpt.data-00000-of-00001") + self.assertEqual( + out["model_ckpt_data"], + work_dir / "nvnmd_cnn/model.ckpt.data-00000-of-00001", + ) self.assertEqual(out["model_ckpt_meta"], work_dir / "nvnmd_cnn/model.ckpt.meta") - self.assertEqual(out["model_ckpt_index"], work_dir / "nvnmd_cnn/model.ckpt.index") + self.assertEqual( + out["model_ckpt_index"], work_dir / "nvnmd_cnn/model.ckpt.index" + ) self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") self.assertEqual(out["log"], work_dir / "train.log") calls = [ call(["dp", "train-nvnmd", train_cnn_script_name, "-s", "s1"]), - call(["dp", "train-nvnmd", train_qnn_script_name, "-s", "s2"]) + call(["dp", "train-nvnmd", train_qnn_script_name, "-s", "s2"]), ] mocked_run.assert_has_calls(calls) @@ -481,15 +486,20 @@ def test_exec_v2(self, mocked_run): self.assertEqual(out["script"], work_dir / train_cnn_script_name) self.assertEqual(out["cnn_model"], work_dir / "nvnmd_cnn/frozen_model.pb") self.assertEqual(out["qnn_model"], work_dir / "nvnmd_qnn/model.pb") - self.assertEqual(out["model_ckpt_data"], work_dir / "nvnmd_cnn/model.ckpt.data-00000-of-00001") + self.assertEqual( + out["model_ckpt_data"], + work_dir / "nvnmd_cnn/model.ckpt.data-00000-of-00001", + ) self.assertEqual(out["model_ckpt_meta"], work_dir / "nvnmd_cnn/model.ckpt.meta") - self.assertEqual(out["model_ckpt_index"], work_dir / "nvnmd_cnn/model.ckpt.index") + self.assertEqual( + out["model_ckpt_index"], work_dir / "nvnmd_cnn/model.ckpt.index" + ) self.assertEqual(out["lcurve"], work_dir / 
"nvnmd_cnn/lcurve.out") self.assertEqual(out["log"], work_dir / "train.log") calls = [ call(["dp", "train-nvnmd", train_cnn_script_name, "-s", "s1"]), - call(["dp", "train-nvnmd", train_qnn_script_name, "-s", "s2"]) + call(["dp", "train-nvnmd", train_qnn_script_name, "-s", "s2"]), ] mocked_run.assert_has_calls(calls) @@ -541,9 +551,14 @@ def test_exec_v2_init_model(self, mocked_run): self.assertEqual(out["script"], work_dir / train_cnn_script_name) self.assertEqual(out["cnn_model"], work_dir / "nvnmd_cnn/frozen_model.pb") self.assertEqual(out["qnn_model"], work_dir / "nvnmd_qnn/model.pb") - self.assertEqual(out["model_ckpt_data"], work_dir / "nvnmd_cnn/model.ckpt.data-00000-of-00001") + self.assertEqual( + out["model_ckpt_data"], + work_dir / "nvnmd_cnn/model.ckpt.data-00000-of-00001", + ) self.assertEqual(out["model_ckpt_meta"], work_dir / "nvnmd_cnn/model.ckpt.meta") - self.assertEqual(out["model_ckpt_index"], work_dir / "nvnmd_cnn/model.ckpt.index") + self.assertEqual( + out["model_ckpt_index"], work_dir / "nvnmd_cnn/model.ckpt.index" + ) self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") self.assertEqual(out["log"], work_dir / "train.log") @@ -556,7 +571,7 @@ def test_exec_v2_init_model(self, mocked_run): "model.ckpt", train_cnn_script_name, "-s", - "s1" + "s1", ] ) ] @@ -619,6 +634,7 @@ def test_exec_v2_train_error(self, mocked_run): jdata = json.load(fp) self.assertDictEqual(jdata, self.expected_odict_v2) + class TestRunNvNMDTrainNullIterData(unittest.TestCase): def setUp(self): self.atom_name = "foo" @@ -678,7 +694,7 @@ def setUp(self): "auto_prob": "prob_sys_size", }, "disp_file": "lcurve.out", - "save_ckpt": "model.ckpt" + "save_ckpt": "model.ckpt", }, "learning_rate": { "start_lr": 1.0, @@ -747,15 +763,20 @@ def test_exec_v2_empty_dir(self, mocked_run): self.assertEqual(out["script"], work_dir / train_cnn_script_name) self.assertEqual(out["cnn_model"], work_dir / "nvnmd_cnn/frozen_model.pb") self.assertEqual(out["qnn_model"], work_dir / 
"nvnmd_qnn/model.pb") - self.assertEqual(out["model_ckpt_data"], work_dir / "nvnmd_cnn/model.ckpt.data-00000-of-00001") + self.assertEqual( + out["model_ckpt_data"], + work_dir / "nvnmd_cnn/model.ckpt.data-00000-of-00001", + ) self.assertEqual(out["model_ckpt_meta"], work_dir / "nvnmd_cnn/model.ckpt.meta") - self.assertEqual(out["model_ckpt_index"], work_dir / "nvnmd_cnn/model.ckpt.index") + self.assertEqual( + out["model_ckpt_index"], work_dir / "nvnmd_cnn/model.ckpt.index" + ) self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") self.assertEqual(out["log"], work_dir / "train.log") calls = [ call(["dp", "train-nvnmd", train_cnn_script_name, "-s", "s1"]), - call(["dp", "train-nvnmd", train_qnn_script_name, "-s", "s2"]) + call(["dp", "train-nvnmd", train_qnn_script_name, "-s", "s2"]), ] mocked_run.assert_has_calls(calls) diff --git a/tests/test_prep_run_nvnmd_train.py b/tests/test_prep_run_nvnmd_train.py index 7defc199..14847382 100644 --- a/tests/test_prep_run_nvnmd_train.py +++ b/tests/test_prep_run_nvnmd_train.py @@ -78,7 +78,15 @@ def _check_log( - tcase, fname, path, script, init_model, init_model_ckpt, init_data, iter_data, only_check_name=False + tcase, + fname, + path, + script, + init_model, + init_model_ckpt, + init_data, + iter_data, + only_check_name=False, ): with open(fname) as fp: lines_ = fp.read().strip().split("\n") @@ -97,15 +105,27 @@ def _check_log( ) tcase.assertEqual( lines[1].split(" "), - ["init_model_ckpt_meta", str(revised_fname(Path(path) / init_model_ckpt / "model.ckpt.meta")), "OK"], - ) + [ + "init_model_ckpt_meta", + str(revised_fname(Path(path) / init_model_ckpt / "model.ckpt.meta")), + "OK", + ], + ) tcase.assertEqual( lines[2].split(" "), - ["init_model_ckpt_data", str(revised_fname(Path(path) / init_model_ckpt / "model.ckpt.data")), "OK"], + [ + "init_model_ckpt_data", + str(revised_fname(Path(path) / init_model_ckpt / "model.ckpt.data")), + "OK", + ], ) tcase.assertEqual( lines[3].split(" "), - 
["init_model_ckpt_index", str(revised_fname(Path(path) / init_model_ckpt / "model.ckpt.index")), "OK"], + [ + "init_model_ckpt_index", + str(revised_fname(Path(path) / init_model_ckpt / "model.ckpt.index")), + "OK", + ], ) for ii in range(2): tcase.assertEqual( @@ -200,9 +220,18 @@ def check_run_train_nvnmd_output( ) _check_model(tcase, "nvnmd_cnn/frozen_model.pb", cwd, init_model) _check_model(tcase, "nvnmd_qnn/model.pb", cwd, init_model) - _check_model_ckpt(tcase, "nvnmd_cnn/model.ckpt.meta", cwd, init_model_ckpt / "model.ckpt.meta") - _check_model_ckpt(tcase, "nvnmd_cnn/model.ckpt.data-00000-of-00001", cwd, init_model_ckpt / "model.ckpt.data") - _check_model_ckpt(tcase, "nvnmd_cnn/model.ckpt.index", cwd, init_model_ckpt / "model.ckpt.index") + _check_model_ckpt( + tcase, "nvnmd_cnn/model.ckpt.meta", cwd, init_model_ckpt / "model.ckpt.meta" + ) + _check_model_ckpt( + tcase, + "nvnmd_cnn/model.ckpt.data-00000-of-00001", + cwd, + init_model_ckpt / "model.ckpt.data", + ) + _check_model_ckpt( + tcase, "nvnmd_cnn/model.ckpt.index", cwd, init_model_ckpt / "model.ckpt.index" + ) _check_lcurve(tcase, "nvnmd_cnn/lcurve.out", cwd, script) os.chdir(cwd) @@ -288,9 +317,12 @@ def test(self): "task_name": self.task_names[ii], "task_path": self.task_paths[ii], "init_model": self.init_models[ii], - "init_model_ckpt_meta": self.init_models_ckpt[ii] / "model.ckpt.meta", - "init_model_ckpt_data": self.init_models_ckpt[ii] / "model.ckpt.data", - "init_model_ckpt_index": self.init_models_ckpt[ii] / "model.ckpt.index", + "init_model_ckpt_meta": self.init_models_ckpt[ii] + / "model.ckpt.meta", + "init_model_ckpt_data": self.init_models_ckpt[ii] + / "model.ckpt.data", + "init_model_ckpt_index": self.init_models_ckpt[ii] + / "model.ckpt.index", "init_data": self.init_data, "iter_data": self.iter_data, } @@ -298,13 +330,32 @@ def test(self): op = run.execute(ip) self.assertEqual(op["script"], Path(train_task_pattern % ii) / "input.json") self.assertTrue(op["script"].is_file()) - 
self.assertEqual(op["cnn_model"], Path(train_task_pattern % ii) / "nvnmd_cnn" / "frozen_model.pb") - self.assertEqual(op["qnn_model"], Path(train_task_pattern % ii) / "nvnmd_qnn" / "model.pb") - self.assertEqual(op["model_ckpt_data"], Path(train_task_pattern % ii) / "nvnmd_cnn" / "model.ckpt.data-00000-of-00001") - self.assertEqual(op["model_ckpt_meta"], Path(train_task_pattern % ii) / "nvnmd_cnn" /"model.ckpt.meta") - self.assertEqual(op["model_ckpt_index"], Path(train_task_pattern % ii) / "nvnmd_cnn" /"model.ckpt.index") + self.assertEqual( + op["cnn_model"], + Path(train_task_pattern % ii) / "nvnmd_cnn" / "frozen_model.pb", + ) + self.assertEqual( + op["qnn_model"], + Path(train_task_pattern % ii) / "nvnmd_qnn" / "model.pb", + ) + self.assertEqual( + op["model_ckpt_data"], + Path(train_task_pattern % ii) + / "nvnmd_cnn" + / "model.ckpt.data-00000-of-00001", + ) + self.assertEqual( + op["model_ckpt_meta"], + Path(train_task_pattern % ii) / "nvnmd_cnn" / "model.ckpt.meta", + ) + self.assertEqual( + op["model_ckpt_index"], + Path(train_task_pattern % ii) / "nvnmd_cnn" / "model.ckpt.index", + ) self.assertEqual(op["log"], Path(train_task_pattern % ii) / "log") - self.assertEqual(op["lcurve"], Path(train_task_pattern % ii) / "nvnmd_cnn" / "lcurve.out") + self.assertEqual( + op["lcurve"], Path(train_task_pattern % ii) / "nvnmd_cnn" / "lcurve.out" + ) check_run_train_nvnmd_output( self, self.task_names[ii], @@ -315,6 +366,7 @@ def test(self): self.iter_data, ) + @unittest.skipIf(skip_ut_with_dflow, skip_ut_with_dflow_reason) class TestTrainNvNMD(unittest.TestCase): def setUp(self): @@ -323,13 +375,19 @@ def setUp(self): tmp_models = make_mocked_init_models(self.numb_models) self.init_models = upload_artifact(tmp_models) self.str_init_models = tmp_models - + tmp_models_ckpt = make_mocked_init_models_ckpt(self.numb_models) - self.init_models_ckpt_meta = upload_artifact([dir / "model.ckpt.meta" for dir in tmp_models_ckpt]) - self.init_models_ckpt_data = 
upload_artifact([dir / "model.ckpt.data" for dir in tmp_models_ckpt]) - self.init_models_ckpt_index = upload_artifact([dir / "model.ckpt.index" for dir in tmp_models_ckpt]) + self.init_models_ckpt_meta = upload_artifact( + [dir / "model.ckpt.meta" for dir in tmp_models_ckpt] + ) + self.init_models_ckpt_data = upload_artifact( + [dir / "model.ckpt.data" for dir in tmp_models_ckpt] + ) + self.init_models_ckpt_index = upload_artifact( + [dir / "model.ckpt.index" for dir in tmp_models_ckpt] + ) self.str_init_models_ckpt = tmp_models_ckpt - + tmp_init_data = make_mocked_init_data() self.init_data = upload_artifact(tmp_init_data) self.path_init_data = tmp_init_data From 05ba10f2640be785af9ce2a1c70834c745b248b6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Jun 2025 13:20:15 +0000 Subject: [PATCH 36/49] reuse PrepRunDPTrain and PrepRunLmp superop --- dpgen2/constants.py | 5 +- dpgen2/entrypoint/args.py | 14 +- dpgen2/entrypoint/submit.py | 23 +- .../exploration/render/traj_render_lammps.py | 13 +- dpgen2/exploration/task/lmp/lmp_input.py | 40 +- .../task/lmp_template_task_group.py | 34 +- .../task/make_task_group_from_config.py | 10 +- dpgen2/exploration/task/npt_task_group.py | 2 +- dpgen2/flow/dpgen_loop.py | 89 +--- dpgen2/op/__init__.py | 10 +- dpgen2/op/prep_dp_train.py | 23 +- dpgen2/op/prep_nvnmd_train.py | 119 ----- dpgen2/op/run_dp_train.py | 13 +- dpgen2/op/run_nvnmd.py | 344 +++++-------- dpgen2/op/run_nvnmd_train.py | 453 ++++------------- dpgen2/superop/__init__.py | 3 - dpgen2/superop/block.py | 87 +--- dpgen2/superop/prep_run_dp_train.py | 5 +- dpgen2/superop/prep_run_nvnmd_train.py | 253 ---------- dpgen2/utils/download_dpgen2_artifacts.py | 10 +- tests/mocked_ops.py | 156 +++--- tests/op/test_prep_dp_train.py | 64 +++ tests/op/test_prep_nvnmd_train.py | 173 ------- tests/op/test_run_dp_train.py | 5 + tests/op/test_run_nvnmd.py | 263 ++-------- tests/op/test_run_nvnmd_train.py | 332 
++++++++----- tests/test_prep_run_dp_train.py | 230 +++++++++ tests/test_prep_run_lmp.py | 73 +++ tests/test_prep_run_nvnmd.py | 307 ------------ tests/test_prep_run_nvnmd_train.py | 459 ------------------ 30 files changed, 1037 insertions(+), 2575 deletions(-) delete mode 100644 dpgen2/op/prep_nvnmd_train.py delete mode 100644 dpgen2/superop/prep_run_nvnmd_train.py delete mode 100644 tests/op/test_prep_nvnmd_train.py delete mode 100644 tests/test_prep_run_nvnmd.py delete mode 100644 tests/test_prep_run_nvnmd_train.py diff --git a/dpgen2/constants.py b/dpgen2/constants.py index c591d37c..bcc322a9 100644 --- a/dpgen2/constants.py +++ b/dpgen2/constants.py @@ -5,11 +5,8 @@ train_qnn_script_name = "input_qnn.json" train_log_name = "train.log" model_name_pattern = "model.%03d.pb" +nvnmd_model_name_pattern = "nvnmd_model.%03d" pytorch_model_name_pattern = "model.%03d.pth" -model_ckpt_pattern = "model.ckpt.%03d" -model_ckpt_meta_pattern = "model.ckpt.%03d/model.ckpt.meta" -model_ckpt_data_pattern = "model.ckpt.%03d/model.ckpt.data" -model_ckpt_index_pattern = "model.ckpt.%03d/model.ckpt.index" model_name_match_pattern = r"model\.[0-9]{3,}(\.pb|\.pth)" lmp_index_pattern = "%06d" lmp_task_pattern = "task." 
+ lmp_index_pattern diff --git a/dpgen2/entrypoint/args.py b/dpgen2/entrypoint/args.py index 7fcd91ce..70492ab9 100644 --- a/dpgen2/entrypoint/args.py +++ b/dpgen2/entrypoint/args.py @@ -28,12 +28,12 @@ from dpgen2.op.run_dp_train import ( RunDPTrain, ) -from dpgen2.op.run_nvnmd_train import ( - RunNvNMDTrain, -) from dpgen2.op.run_lmp import ( RunLmp, ) +from dpgen2.op.run_nvnmd_train import ( + RunNvNMDTrain, +) from dpgen2.utils import ( normalize_step_dict, step_conf_args, @@ -129,6 +129,7 @@ def dp_train_args(): ), ] + def nvnmd_train_args(): doc_numb_models = "Number of models trained for evaluating the model deviation" doc_config = "Configuration of training" @@ -141,9 +142,9 @@ def nvnmd_train_args(): Argument( "config", dict, - RunNvNMDTrain.training_args(), + RunDPTrain.training_args(), optional=True, - default=RunNvNMDTrain.normalize_config({}), + default=RunDPTrain.normalize_config({}), doc=doc_numb_models, ), Argument("numb_models", int, optional=True, default=4, doc=doc_numb_models), @@ -174,6 +175,7 @@ def nvnmd_train_args(): ), ] + def variant_train(): doc = "the type of the training" return Variant( @@ -502,7 +504,7 @@ def variant_explore(): "type", [ Argument("lmp", dict, lmp_args(), doc=doc_lmp), - Argument("nvnmd", dict, lmp_args(), doc=doc_lmp), + Argument("lmp-nvnmd", dict, lmp_args(), doc=doc_lmp), Argument("calypso", dict, caly_args(), doc=doc_calypso), Argument("calypso:default", dict, caly_args(), doc=doc_calypso), Argument("calypso:merge", dict, caly_args(), doc=doc_calypso), diff --git a/dpgen2/entrypoint/submit.py b/dpgen2/entrypoint/submit.py index c35c123d..3e4bdec9 100644 --- a/dpgen2/entrypoint/submit.py +++ b/dpgen2/entrypoint/submit.py @@ -104,16 +104,15 @@ PrepCalyInput, PrepCalyModelDevi, PrepDPTrain, - PrepNvNMDTrain, PrepLmp, PrepRelax, RunCalyDPOptim, RunCalyModelDevi, RunDPTrain, - RunNvNMDTrain, RunLmp, RunLmpHDF5, RunNvNMD, + RunNvNMDTrain, RunRelax, RunRelaxHDF5, SelectConfs, @@ -126,7 +125,6 @@ PrepRunCaly, 
PrepRunDiffCSP, PrepRunDPTrain, - PrepRunNvNMDTrain, PrepRunFp, PrepRunLmp, ) @@ -187,9 +185,9 @@ def make_concurrent_learning_op( optional_files=train_optional_files, ) elif train_style == "dp-nvnmd": - prep_run_train_op = PrepRunNvNMDTrain( + prep_run_train_op = PrepRunDPTrain( "prep-run-nvnmd-train", - PrepNvNMDTrain, + PrepDPTrain, RunNvNMDTrain, prep_config=prep_train_config, run_config=run_train_config, @@ -208,7 +206,7 @@ def make_concurrent_learning_op( run_config=run_explore_config, upload_python_packages=upload_python_packages, ) - elif "nvnmd" in explore_style: + elif "lmp-nvnmd" in explore_style: prep_run_explore_op = PrepRunLmp( "prep-run-nvnmd", PrepLmp, @@ -216,7 +214,7 @@ def make_concurrent_learning_op( prep_config=prep_explore_config, run_config=run_explore_config, upload_python_packages=upload_python_packages, - ) + ) elif "calypso" in explore_style: expl_mode = explore_style.split(":")[-1] if ":" in explore_style else "default" if expl_mode == "merge": @@ -310,7 +308,7 @@ def make_naive_exploration_scheduler( # use npt task group explore_style = config["explore"]["type"] - if explore_style in ("lmp", "nvnmd"): + if explore_style in ("lmp", "lmp-nvnmd"): return make_lmp_naive_exploration_scheduler(config) elif "calypso" in explore_style or explore_style == "diffcsp": return make_naive_exploration_scheduler_without_conf(config, explore_style) @@ -531,7 +529,7 @@ def workflow_concurrent_learning( else None ) config["train"]["numb_models"] = 1 - + elif train_style == "dp-nvnmd": init_models_paths = config["train"].get("init_models_paths", None) numb_models = config["train"]["numb_models"] @@ -540,7 +538,7 @@ def workflow_concurrent_learning( f"{len(init_models_paths)} init models provided, which does " "not match numb_models={numb_models}" ) - + else: raise RuntimeError(f"unknown params, train_style: {train_style}") @@ -660,8 +658,6 @@ def workflow_concurrent_learning( init_models = get_artifact_from_uri(config["train"]["init_models_uri"]) elif 
train_style == "dp-dist" and config["train"]["student_model_uri"] is not None: init_models = get_artifact_from_uri(config["train"]["student_model_uri"]) - elif train_style == "dp-nvnmd" and config["train"]["init_models_uri"] is not None: - init_models = get_artifact_from_uri(config["train"]["init_models_uri"]) elif init_models_paths is not None: init_models = upload_artifact_and_print_uri(init_models_paths, "init_models") else: @@ -699,9 +695,6 @@ def workflow_concurrent_learning( }, artifacts={ "init_models": init_models, - "init_models_ckpt_meta": None, - "init_models_ckpt_index": None, - "init_models_ckpt_data": None, "init_data": init_data, "iter_data": iter_data, }, diff --git a/dpgen2/exploration/render/traj_render_lammps.py b/dpgen2/exploration/render/traj_render_lammps.py index 8fec2744..0b1c36f9 100644 --- a/dpgen2/exploration/render/traj_render_lammps.py +++ b/dpgen2/exploration/render/traj_render_lammps.py @@ -15,9 +15,6 @@ import dpdata import numpy as np -from ase.io import ( - read, -) from dflow.python.opio import ( HDF5Dataset, ) @@ -112,6 +109,10 @@ def get_confs( conf_filters: Optional["ConfFilters"] = None, optional_outputs: Optional[List[Path]] = None, ) -> dpdata.MultiSystems: + + from ase.io import( # type: ignore + read, + ) ntraj = len(trajs) ele_temp = None if optional_outputs: @@ -126,8 +127,10 @@ def get_confs( traj = StringIO(trajs[ii].get_data()) # type: ignore else: traj = trajs[ii] - #ss = dpdata.System(traj, fmt=traj_fmt, type_map=type_map) - ss = read(str(traj), format="lammps-dump-text", index=":", specorder=type_map) + # ss = dpdata.System(traj, fmt=traj_fmt, type_map=type_map) + ss = read( + str(traj), format="lammps-dump-text", index=":", specorder=type_map + ) for jj in id_selected[ii]: s = dpdata.System(ss[jj], fmt="ase/structure", type_map=type_map) s.nopbc = self.nopbc diff --git a/dpgen2/exploration/task/lmp/lmp_input.py b/dpgen2/exploration/task/lmp/lmp_input.py index 777aef3a..3a08bbbc 100644 --- 
a/dpgen2/exploration/task/lmp/lmp_input.py +++ b/dpgen2/exploration/task/lmp/lmp_input.py @@ -114,9 +114,7 @@ def make_lmp_input( model_devi_file_name, ) elif nvnmd_version is not None: - ret += "pair_style nvnmd %s\n" % ( - "model.pb" - ) + ret += "pair_style nvnmd %s\n" % ("model.pb") else: # 1.x keywords = "" @@ -139,19 +137,28 @@ def make_lmp_input( ret += "\n" ret += "thermo_style custom step temp pe ke etotal press vol lx ly lz xy xz yz\n" ret += "thermo ${THERMO_FREQ}\n" - if trj_seperate_files and nvnmd_version is None: - ret += "dump 1 all custom ${DUMP_FREQ} traj/*.lammpstrj id type x y z fx fy fz\n" + if trj_seperate_files: + if nvnmd_version is None: + ret += "dump 1 all custom ${DUMP_FREQ} traj/*.lammpstrj id type x y z fx fy fz\n" + else: + ret += "dump 1 all custom ${DUMP_FREQ} traj_${rerun}/*.lammpstrj id type x y z fx fy fz\n" else: lmp_traj_file_name = ( lmp_pimd_traj_name % pimd_bead if pimd_bead is not None else lmp_traj_name ) - ret += ( - "dump 1 all custom ${DUMP_FREQ} %s id type x y z fx fy fz\n" - % lmp_traj_file_name - ) + if nvnmd_version is None: + ret += ( + "dump 1 all custom ${DUMP_FREQ} %s id type x y z fx fy fz\n" + % lmp_traj_file_name + ) + else: + ret += ( + "dump 1 all custom ${DUMP_FREQ} %s_${rerun} id type x y z fx fy fz\n" + % lmp_traj_file_name + ) ret += "restart 10000 dpgen.restart\n" ret += "\n" - if(nvnmd_version is not None): + if nvnmd_version is not None: ret += 'if "${rerun} > 0" then "jump SELF rerun"\n' if pka_e is None: ret += 'if "${restart} == 0" then "velocity all create ${TEMP} %d"' % ( @@ -200,9 +207,12 @@ def make_lmp_input( ret += "\n" ret += "timestep %f\n" % dt ret += "run ${NSTEPS} upto\n" - if(nvnmd_version is not None): - ret += 'jump SELF end\n' - ret += 'label rerun\n' - ret += 'rerun %s.0 dump x y z fx fy fz add yes\n' % lmp_traj_name - ret += 'label end\n' + if nvnmd_version is not None: + ret += "jump SELF end\n" + ret += "label rerun\n" + if trj_seperate_files: + ret += "rerun traj_0/*.lammpstrj 
dump x y z fx fy fz add yes\n" + else: + ret += "rerun %s_0 dump x y z fx fy fz add yes\n" % lmp_traj_name + ret += "label end\n" return ret diff --git a/dpgen2/exploration/task/lmp_template_task_group.py b/dpgen2/exploration/task/lmp_template_task_group.py index c740e448..27075695 100644 --- a/dpgen2/exploration/task/lmp_template_task_group.py +++ b/dpgen2/exploration/task/lmp_template_task_group.py @@ -72,7 +72,7 @@ def set_lmp( self.pimd_bead, nvnmd_version=self.nvnmd_version, ) - if(nvnmd_version is not None): + if nvnmd_version is not None: self.lmp_template = revise_lmp_input_rerun(self.lmp_template) if plm_template_fname is not None: self.plm_template = Path(plm_template_fname).read_text().split("\n") @@ -176,7 +176,7 @@ def revise_lmp_input_model( if pimd_bead is not None else lmp_model_devi_name ) - if(nvnmd_version is None): + if nvnmd_version is None: idx = find_only_one_key(lmp_lines, ["pair_style", "deepmd"]) lmp_lines[idx] = "pair_style deepmd %s out_freq %d out_file %s%s" % ( graph_list, @@ -188,18 +188,18 @@ def revise_lmp_input_model( idx = find_only_one_key(lmp_lines, ["pair_style", "nvnmd"]) lmp_lines[idx] = "pair_style nvnmd %s %s" % ( "model.pb", - extra_pair_style_args + extra_pair_style_args, ) - + return lmp_lines -def revise_lmp_input_dump(lmp_lines, trj_freq, pimd_bead=None,nvnmd_version=None): +def revise_lmp_input_dump(lmp_lines, trj_freq, pimd_bead=None, nvnmd_version=None): idx = find_only_one_key(lmp_lines, ["dump", "dpgen_dump"]) lmp_traj_file_name = ( lmp_pimd_traj_name % pimd_bead if pimd_bead is not None else lmp_traj_name ) - if(nvnmd_version is None): + if nvnmd_version is None: lmp_lines[ idx ] = f"dump dpgen_dump all custom {trj_freq} {lmp_traj_file_name} id type x y z" @@ -207,10 +207,7 @@ def revise_lmp_input_dump(lmp_lines, trj_freq, pimd_bead=None,nvnmd_version=None lmp_lines[ idx ] = f"dump dpgen_dump all custom {trj_freq} {lmp_traj_file_name} id type x y z fx fy fz" - lmp_lines.insert( - idx+1, - 'if \"${rerun} > 0\" 
then \"jump SELF rerun\"' - ) + lmp_lines.insert(idx + 1, 'if "${rerun} > 0" then "jump SELF rerun"') return lmp_lines @@ -222,19 +219,12 @@ def revise_lmp_input_plm(lmp_lines, in_plm, out_plm="output.plumed"): ) return lmp_lines + def revise_lmp_input_rerun(lmp_lines): - lmp_lines.append( - 'jump SELF end' - ) - lmp_lines.append( - 'label rerun' - ) - lmp_lines.append( - f'rerun {lmp_traj_name}.0 dump x y z fx fy fz add yes' - ) - lmp_lines.append( - 'label end' - ) + lmp_lines.append("jump SELF end") + lmp_lines.append("label rerun") + lmp_lines.append(f"rerun {lmp_traj_name}.0 dump x y z fx fy fz add yes") + lmp_lines.append("label end") return lmp_lines diff --git a/dpgen2/exploration/task/make_task_group_from_config.py b/dpgen2/exploration/task/make_task_group_from_config.py index 05bfac30..b8113e21 100644 --- a/dpgen2/exploration/task/make_task_group_from_config.py +++ b/dpgen2/exploration/task/make_task_group_from_config.py @@ -298,7 +298,11 @@ def variant_task_group(): "lmp-md", dict, npt_task_group_args(), alias=["lmp-npt"], doc=doc_lmp_md ), Argument( - "lmp-nvnmd", dict, npt_task_group_args(), alias=["lmp-nvnmd-npt"], doc=doc_lmp_md + "lmp-nvnmd", + dict, + npt_task_group_args(), + alias=["lmp-nvnmd-npt"], + doc=doc_lmp_md, ), Argument( "lmp-template", @@ -627,7 +631,7 @@ def make_lmp_task_group_from_config( config["conf_idx"] = [] if "conf_idx" not in config else None config = lmp_normalize(config) config = config_strip_confidx(config) - + if config["type"] == "lmp-md": tgroup = NPTTaskGroup() config.pop("type") @@ -663,7 +667,7 @@ def make_lmp_task_group_from_config( numb_models, lmp_template, **config, - ) + ) elif config["type"] == "customized-lmp-template": tgroup = CustomizedLmpTemplateTaskGroup() config.pop("type") diff --git a/dpgen2/exploration/task/npt_task_group.py b/dpgen2/exploration/task/npt_task_group.py index e597071b..c66d985c 100644 --- a/dpgen2/exploration/task/npt_task_group.py +++ b/dpgen2/exploration/task/npt_task_group.py @@ -134,7 
+134,7 @@ def _make_lmp_task( self.ele_temp_f, self.ele_temp_a, self.no_pbc, - nvnmd_version = self.nvnmd_version, + nvnmd_version=self.nvnmd_version, trj_seperate_files=False, pimd_bead=self.pimd_bead, ), diff --git a/dpgen2/flow/dpgen_loop.py b/dpgen2/flow/dpgen_loop.py index db074d97..190a1090 100644 --- a/dpgen2/flow/dpgen_loop.py +++ b/dpgen2/flow/dpgen_loop.py @@ -186,9 +186,6 @@ def __init__( } self._input_artifacts = { "init_models": InputArtifact(optional=True), - "init_models_ckpt_meta": InputArtifact(optional=True), - "init_models_ckpt_data": InputArtifact(optional=True), - "init_models_ckpt_index": InputArtifact(optional=True), "init_data": InputArtifact(), "iter_data": InputArtifact(), } @@ -197,9 +194,6 @@ def __init__( } self._output_artifacts = { "models": OutputArtifact(), - "models_ckpt_meta": OutputArtifact(optional=True), - "models_ckpt_data": OutputArtifact(optional=True), - "models_ckpt_index": OutputArtifact(optional=True), "iter_data": OutputArtifact(), } @@ -284,9 +278,6 @@ def __init__( self._input_artifacts = { "init_models": InputArtifact(optional=True), "init_data": InputArtifact(), - "init_models_ckpt_meta": InputArtifact(optional=True), - "init_models_ckpt_data": InputArtifact(optional=True), - "init_models_ckpt_index": InputArtifact(optional=True), "iter_data": InputArtifact(), } self._output_parameters = { @@ -294,9 +285,6 @@ def __init__( } self._output_artifacts = { "models": OutputArtifact(), - "models_ckpt_meta": OutputArtifact(optional=True), - "models_ckpt_data": OutputArtifact(optional=True), - "models_ckpt_index": OutputArtifact(optional=True), "iter_data": OutputArtifact(), } @@ -386,9 +374,6 @@ def _loop( parameters=block_common_parameters, artifacts={ "init_models": steps.inputs.artifacts["init_models"], - "init_models_ckpt_meta": steps.inputs.artifacts["init_models_ckpt_meta"], - "init_models_ckpt_index": steps.inputs.artifacts["init_models_ckpt_index"], - "init_models_ckpt_data": 
steps.inputs.artifacts["init_models_ckpt_data"], "init_data": steps.inputs.artifacts["init_data"], "iter_data": steps.inputs.artifacts["iter_data"], }, @@ -455,35 +440,17 @@ def _loop( ), "expl_task_grp": scheduler_step.outputs.parameters["expl_task_grp"], } - if (hasattr(block_step.outputs.artifacts["models_ckpt_meta"]._from, "path") and - hasattr(block_step.outputs.artifacts["models_ckpt_data"]._from, "path") and - hasattr(block_step.outputs.artifacts["models_ckpt_index"]._from, "path")): - next_step = Step( - name=name + "-next", - template=steps, - parameters=next_common_parameters, - artifacts={ - "init_models": block_step.outputs.artifacts["models"], - "init_models_ckpt_meta": block_step.outputs.artifacts["models_ckpt_meta"], - "init_models_ckpt_index": block_step.outputs.artifacts["models_ckpt_index"], - "init_models_ckpt_data": block_step.outputs.artifacts["models_ckpt_data"], - "init_data": steps.inputs.artifacts["init_data"], - "iter_data": block_step.outputs.artifacts["iter_data"], - }, - when="%s == false" % (scheduler_step.outputs.parameters["converged"]), - ) - else: - next_step = Step( - name=name + "-next", - template=steps, - parameters=next_common_parameters, - artifacts={ - "init_models": block_step.outputs.artifacts["models"], - "init_data": steps.inputs.artifacts["init_data"], - "iter_data": block_step.outputs.artifacts["iter_data"], - }, - when="%s == false" % (scheduler_step.outputs.parameters["converged"]), - ) + next_step = Step( + name=name + "-next", + template=steps, + parameters=next_common_parameters, + artifacts={ + "init_models": block_step.outputs.artifacts["models"], + "init_data": steps.inputs.artifacts["init_data"], + "iter_data": block_step.outputs.artifacts["iter_data"], + }, + when="%s == false" % (scheduler_step.outputs.parameters["converged"]), + ) steps.add(next_step) steps.outputs.parameters[ @@ -498,30 +465,6 @@ def _loop( _then=block_step.outputs.artifacts["models"], _else=next_step.outputs.artifacts["models"], ) - if ( 
hasattr(block_step.outputs.artifacts["models_ckpt_meta"]._from, "path") or - hasattr(next_step.outputs.artifacts["models_ckpt_meta"]._from, "path") - ): - steps.outputs.artifacts["models_ckpt_meta"].from_expression = if_expression( - _if=(scheduler_step.outputs.parameters["converged"] == True), - _then=block_step.outputs.artifacts["models_ckpt_meta"], - _else=next_step.outputs.artifacts["models_ckpt_meta"], - ) - if ( hasattr(block_step.outputs.artifacts["models_ckpt_data"]._from, "path") or - hasattr(next_step.outputs.artifacts["models_ckpt_data"]._from, "path") - ): - steps.outputs.artifacts["models_ckpt_data"].from_expression = if_expression( - _if=(scheduler_step.outputs.parameters["converged"] == True), - _then=block_step.outputs.artifacts["models_ckpt_data"], - _else=next_step.outputs.artifacts["models_ckpt_data"], - ) - if ( hasattr(block_step.outputs.artifacts["models_ckpt_index"]._from, "path") or - hasattr(next_step.outputs.artifacts["models_ckpt_index"]._from, "path") - ): - steps.outputs.artifacts["models_ckpt_index"].from_expression = if_expression( - _if=(scheduler_step.outputs.parameters["converged"] == True), - _then=block_step.outputs.artifacts["models_ckpt_index"], - _else=next_step.outputs.artifacts["models_ckpt_index"], - ) steps.outputs.artifacts["iter_data"].from_expression = if_expression( _if=(scheduler_step.outputs.parameters["converged"] == True), _then=block_step.outputs.artifacts["iter_data"], @@ -601,16 +544,12 @@ def _dpgen( "optional_parameter": steps.inputs.parameters["optional_parameter"], "expl_task_grp": scheduler_step.outputs.parameters["expl_task_grp"], } - loop_step = Step( name=name + "-loop", template=loop_op, parameters=common_parameters, artifacts={ "init_models": steps.inputs.artifacts["init_models"], - "init_models_ckpt_meta": steps.inputs.artifacts["init_models_ckpt_meta"], - "init_models_ckpt_data": steps.inputs.artifacts["init_models_ckpt_data"], - "init_models_ckpt_index": 
steps.inputs.artifacts["init_models_ckpt_index"], "init_data": steps.inputs.artifacts["init_data"], "iter_data": steps.inputs.artifacts["iter_data"], }, @@ -622,12 +561,6 @@ def _dpgen( "exploration_scheduler" ].value_from_parameter = loop_step.outputs.parameters["exploration_scheduler"] steps.outputs.artifacts["models"]._from = loop_step.outputs.artifacts["models"] - if hasattr(loop_step.outputs.artifacts["models_ckpt_meta"]._from, "path"): - steps.outputs.artifacts["models_ckpt_meta"]._from = loop_step.outputs.artifacts["models_ckpt_meta"] - if hasattr(loop_step.outputs.artifacts["models_ckpt_data"]._from, "path"): - steps.outputs.artifacts["models_ckpt_data"]._from = loop_step.outputs.artifacts["models_ckpt_data"] - if hasattr(loop_step.outputs.artifacts["models_ckpt_index"]._from, "path"): - steps.outputs.artifacts["models_ckpt_index"]._from = loop_step.outputs.artifacts["models_ckpt_index"] steps.outputs.artifacts["iter_data"]._from = loop_step.outputs.artifacts[ "iter_data" ] diff --git a/dpgen2/op/__init__.py b/dpgen2/op/__init__.py index 95f7f9c2..fa1c020c 100644 --- a/dpgen2/op/__init__.py +++ b/dpgen2/op/__init__.py @@ -19,9 +19,6 @@ from .prep_dp_train import ( PrepDPTrain, ) -from .prep_nvnmd_train import ( - PrepNvNMDTrain, -) from .prep_lmp import ( PrepLmp, ) @@ -37,9 +34,6 @@ from .run_dp_train import ( RunDPTrain, ) -from .run_nvnmd_train import ( - RunNvNMDTrain, -) from .run_lmp import ( RunLmp, RunLmpHDF5, @@ -47,7 +41,9 @@ from .run_nvnmd import ( RunNvNMD, ) - +from .run_nvnmd_train import ( + RunNvNMDTrain, +) from .run_relax import ( RunRelax, RunRelaxHDF5, diff --git a/dpgen2/op/prep_dp_train.py b/dpgen2/op/prep_dp_train.py index 20fe58c2..f5767bcd 100644 --- a/dpgen2/op/prep_dp_train.py +++ b/dpgen2/op/prep_dp_train.py @@ -119,15 +119,18 @@ def _script_rand_seed( input_dict, ): jtmp = input_dict.copy() - if "model_dict" in jtmp["model"]: - for d in jtmp["model"]["model_dict"].values(): - if isinstance(d["descriptor"], str): - 
self._set_desc_seed(jtmp["model"]["shared_dict"][d["descriptor"]]) - d["fitting_net"]["seed"] = random.randrange(sys.maxsize) % (2**32) - else: - self._set_desc_seed(jtmp["model"]["descriptor"]) - jtmp["model"]["fitting_net"]["seed"] = random.randrange(sys.maxsize) % ( - 2**32 - ) + if "model" in jtmp: + if "model_dict" in jtmp["model"]: + for d in jtmp["model"]["model_dict"].values(): + if isinstance(d["descriptor"], str): + self._set_desc_seed(jtmp["model"]["shared_dict"][d["descriptor"]]) + d["fitting_net"]["seed"] = random.randrange(sys.maxsize) % (2**32) + else: + self._set_desc_seed(jtmp["model"]["descriptor"]) + jtmp["model"]["fitting_net"]["seed"] = random.randrange(sys.maxsize) % ( + 2**32 + ) + elif "nvnmd" in jtmp: + jtmp["nvnmd"]["seed"] = random.randrange(sys.maxsize) % (2**32) jtmp["training"]["seed"] = random.randrange(sys.maxsize) % (2**32) return jtmp diff --git a/dpgen2/op/prep_nvnmd_train.py b/dpgen2/op/prep_nvnmd_train.py deleted file mode 100644 index 45a7e121..00000000 --- a/dpgen2/op/prep_nvnmd_train.py +++ /dev/null @@ -1,119 +0,0 @@ -import json -import random -import sys -from pathlib import ( - Path, -) -from typing import ( - List, - Tuple, - Union, -) - -from dflow.python import ( - OP, - OPIO, - Artifact, - BigParameter, - OPIOSign, -) - -from dpgen2.constants import ( - train_script_name, - train_task_pattern, -) - - -class PrepNvNMDTrain(OP): - r"""Prepares the working directories for DP training tasks. - - A list of (`numb_models`) working directories containing all files - needed to start training tasks will be created. The paths of the - directories will be returned as `op["task_paths"]`. The identities - of the tasks are returned as `op["task_names"]`. 
- - """ - - @classmethod - def get_input_sign(cls): - return OPIOSign( - { - "template_script": BigParameter(Union[dict, List[dict]]), - "numb_models": int, - } - ) - - @classmethod - def get_output_sign(cls): - return OPIOSign( - { - "task_names": BigParameter(List[str]), - "task_paths": Artifact(List[Path]), - } - ) - - @OP.exec_sign_check - def execute( - self, - ip: OPIO, - ) -> OPIO: - r"""Execute the OP. - - Parameters - ---------- - ip : dict - Input dict with components: - - - `template_script`: (`str` or `List[str]`) A template of the training script. Can be a `str` or `List[str]`. In the case of `str`, all training tasks share the same training input template, the only difference is the random number used to initialize the network parameters. In the case of `List[str]`, one training task uses one template from the list. The random numbers used to initialize the network parameters are differnt. The length of the list should be the same as `numb_models`. - - `numb_models`: (`int`) Number of DP models to train. - - Returns - ------- - op : dict - Output dict with components: - - - `task_names`: (`List[str]`) The name of tasks. Will be used as the identities of the tasks. The names of different tasks are different. - - `task_paths`: (`Artifact(List[Path])`) The parepared working paths of the tasks. 
The order fo the Paths should be consistent with `op["task_names"]` - - """ - template = ip["template_script"] - numb_models = ip["numb_models"] - osubdirs = [] - if type(template) != list: - template = [template for ii in range(numb_models)] - else: - if not (len(template) == numb_models): - raise RuntimeError( - f"length of the template list should be equal to {numb_models}" - ) - - for ii in range(numb_models): - # mkdir - subdir = Path(train_task_pattern % ii) - subdir.mkdir(exist_ok=True, parents=True) - osubdirs.append(str(subdir)) - # change random seed in template - idict = self._script_rand_seed(template[ii]) - # write input script - fname = subdir / train_script_name - with open(fname, "w") as fp: - json.dump(idict, fp, indent=4) - - op = OPIO( - { - "task_names": osubdirs, - "task_paths": [Path(ii) for ii in osubdirs], - } - ) - return op - - def _script_rand_seed( - self, - input_dict, - ): - jtmp = input_dict.copy() - - # the key "seed" in "nvnmd" is used to set the random seed for the network parameters, it is developing. 
- jtmp["nvnmd"]["seed"] = random.randrange(sys.maxsize) % (2**32) - jtmp["training"]["seed"] = random.randrange(sys.maxsize) % (2**32) - return jtmp diff --git a/dpgen2/op/run_dp_train.py b/dpgen2/op/run_dp_train.py index dccbc518..0e7a712b 100644 --- a/dpgen2/op/run_dp_train.py +++ b/dpgen2/op/run_dp_train.py @@ -3,6 +3,7 @@ import logging import os import shutil +import copy from pathlib import ( Path, ) @@ -406,9 +407,11 @@ def write_other_to_input_script( config, do_init_model, major_version: str = "1", + do_quantized: bool = False, ): - odict = idict.copy() + odict = copy.deepcopy(idict) odict["training"]["disp_file"] = "lcurve.out" + odict["training"]["save_ckpt"] = "model.ckpt" if do_init_model: odict["learning_rate"]["start_lr"] = config["init_model_start_lr"] if "loss_dict" in odict: @@ -429,8 +432,16 @@ def write_other_to_input_script( raise RuntimeError( "unsupported DeePMD-kit major version", major_version ) + + if do_quantized: + if major_version == "1": + odict["training"]["stop_batch"] = 0 + elif major_version == "2": + odict["training"]["numb_steps"] = 0 + return odict + @staticmethod def skip_training( work_dir, diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index 9b1123cd..064ac8f8 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -1,15 +1,17 @@ import glob +import itertools import json import logging import os import random import re -import itertools +import shutil from pathlib import ( Path, ) from typing import ( List, + Union, Optional, Set, Tuple, @@ -44,6 +46,10 @@ plm_output_name, pytorch_model_name_pattern, ) +from dpgen2.op.run_lmp import ( + RunLmp, + find_only_one_key, +) from dpgen2.utils import ( BinaryFileInput, set_directory, @@ -51,10 +57,6 @@ from dpgen2.utils.run_command import ( run_command, ) -from dpgen2.op.run_caly_model_devi import ( - write_model_devi_out, -) -from ase.io import read class RunNvNMD(OP): @@ -103,7 +105,7 @@ def execute( ip : dict Input dict with components: - - `config`: 
(`dict`) The config of lmp task. Check `RunNvNMD.lmp_args` for definitions. + - `config`: (`dict`) The config of lmp task. Check `RunLmp.lmp_args` for definitions. - `task_name`: (`str`) The name of the task. - `task_path`: (`Artifact(Path)`) The path that contains all input files prepareed by `PrepLmp`. - `models`: (`Artifact(List[Path])`) The frozen model to estimate the model deviation. The first model with be used to drive molecular dynamics simulation. @@ -122,7 +124,7 @@ def execute( On the failure of LAMMPS execution. Handle different failure cases? e.g. loss atoms. """ config = ip["config"] if ip["config"] is not None else {} - config = RunNvNMD.normalize_config(config) + config = RunLmp.normalize_config(config) command = config["command"] teacher_model: Optional[BinaryFileInput] = config["teacher_model_path"] shuffle_models: Optional[bool] = config["shuffle_models"] @@ -132,28 +134,18 @@ def execute( # input_files = [lmp_conf_name, lmp_input_name] # input_files = [(Path(task_path) / ii).resolve() for ii in input_files] input_files = [ii.resolve() for ii in Path(task_path).iterdir()] - model_files = [Path(ii).resolve() for ii in models] + model_files = [Path(ii).resolve() / "model.pb" for ii in models] work_dir = Path(task_name) - if teacher_model is not None: - assert ( - len(model_files) == 1 - ), "One model is enough in knowledge distillation" - ext = os.path.splitext(teacher_model.file_name)[-1] - teacher_model_file = "teacher_model" + ext - teacher_model.save_as_file(teacher_model_file) - model_files = [Path(teacher_model_file).resolve()] + model_files - with set_directory(work_dir): # link input files for ii in input_files: iname = ii.name + #Path(iname).symlink_to(ii) try: Path(iname).symlink_to(ii) except: - logging.warning( - "failed to link %s, maybe already linked" % iname - ) + logging.warning("failed to link %s, maybe already linked" % iname) pass # link models model_names = [] @@ -161,6 +153,7 @@ def execute( ext = os.path.splitext(mm)[-1] if 
ext == ".pb": mname = model_name_pattern % (idx) + #Path(mname).symlink_to(mm) try: Path(mname).symlink_to(mm) except: @@ -168,11 +161,7 @@ def execute( "failed to link %s, maybe already linked" % mname ) pass - - elif ext == ".pt": - # freeze model - mname = pytorch_model_name_pattern % (idx) - freeze_model(mm, mname, config.get("model_frozen_head")) + else: raise RuntimeError( "Model file with extension '%s' is not supported" % ext @@ -182,61 +171,52 @@ def execute( if shuffle_models: random.shuffle(model_names) - set_models(lmp_input_name, model_names) + set_lmp_models(lmp_input_name, model_names) # run lmp - commands = " ; ".join( - [ - " ".join( - [ - "cp", str(model_name), "model.pb", - "&&", - command, "-i", lmp_input_name, - "-log", lmp_log_name, - "-v", "rerun", "%d"%i, - "&&", - "cp", lmp_traj_name, lmp_traj_name+".%d"%i - ] - ) - for i, model_name in enumerate(models) - ] - ) - ret, out, err = run_command(commands, shell=True) - if ret != 0: - logging.error( - "".join( - ( - "lmp failed\n", - "command was: ", - commands, - "out msg: ", - out, - "\n", - "err msg: ", - err, - "\n", + #for ii in range(1): + for ii in range(len(model_names)): + commands = " ".join( + [ + command, + "-i", + "%d_%s" % (ii, lmp_input_name), + "-log", + "%d_%s" % (ii, lmp_log_name), + "-v", + "rerun", + "%d" % ii, + ] + ) + ret, out, err = run_command(commands, shell=True) + if ret != 0: + logging.error( + "".join( + ( + "lmp failed\n", + "command was: ", + commands, + "out msg: ", + out, + "\n", + "err msg: ", + err, + "\n", + ) ) ) - ) - raise TransientError("lmp failed") - - ele_temp = None - if config.get("use_ele_temp", 0): - ele_temp = get_ele_temp(lmp_log_name) - if ele_temp is not None: - data = { - "ele_temp": ele_temp, - } - with open("job.json", "w") as f: - json.dump(data, f, indent=4) + raise TransientError("lmp failed") + merge_pimd_files() - - if os.path.exists(lmp_traj_name): - calc_model_devi([lmp_traj_name+f".{i}" for i in range(len(model_names))]) + + 
traj_files = glob.glob("*_%s"%lmp_traj_name) + if len(traj_files) > 1: + calc_model_devi(traj_files, lmp_model_devi_name) + ret_dict = { "log": work_dir / lmp_log_name, - "traj": work_dir / lmp_traj_name, + "traj": work_dir / ("%d_%s" % (0, lmp_traj_name)), "model_devi": self.get_model_devi(work_dir / lmp_model_devi_name), } plm_output = ( @@ -245,168 +225,36 @@ def execute( else {} ) ret_dict.update(plm_output) - if ele_temp is not None: - ret_dict["optional_output"] = work_dir / "job.json" - return OPIO(ret_dict) def get_model_devi(self, model_devi_file): return model_devi_file - @staticmethod - def lmp_args(): - doc_lmp_cmd = "The command of LAMMPS" - doc_teacher_model = "The teacher model in `Knowledge Distillation`" - doc_shuffle_models = "Randomly pick a model from the group of models to drive theexploration MD simulation" - doc_head = "Select a head from multitask" - doc_use_ele_temp = "Whether to use electronic temperature, 0 for no, 1 for frame temperature, and 2 for atomic temperature" - doc_use_hdf5 = "Use HDF5 to store trajs and model_devis" - return [ - Argument("command", str, optional=True, default="lmp", doc=doc_lmp_cmd), - Argument( - "teacher_model_path", - [BinaryFileInput, str], - optional=True, - default=None, - doc=doc_teacher_model, - ), - Argument( - "shuffle_models", - bool, - optional=True, - default=False, - doc=doc_shuffle_models, - ), - Argument("head", str, optional=True, default=None, doc=doc_head), - Argument( - "use_ele_temp", int, optional=True, default=0, doc=doc_use_ele_temp - ), - Argument( - "model_frozen_head", str, optional=True, default=None, doc=doc_head - ), - Argument( - "use_hdf5", - bool, - optional=True, - default=False, - doc=doc_use_hdf5, - ), - ] - - @staticmethod - def normalize_config(data={}): - ta = RunNvNMD.lmp_args() - base = Argument("base", dict, ta) - data = base.normalize_value(data, trim_pattern="_*") - base.check_value(data, strict=True) - return data - - -config_args = RunNvNMD.lmp_args - - -def 
set_models(lmp_input_name: str, model_names: List[str]): + +config_args = RunLmp.lmp_args + + +def set_lmp_models(lmp_input_name: str, model_names: List[str]): with open(lmp_input_name, encoding="utf8") as f: lmp_input_lines = f.readlines() idx = find_only_one_key( - lmp_input_lines, ["pair_style", "deepmd"], raise_not_found=False + lmp_input_lines, ["pair_style", "nvnmd"], raise_not_found=False ) if idx is None: return new_line_split = lmp_input_lines[idx].split() - match_first = -1 - match_last = -1 - pattern = model_name_match_pattern - for sidx, ii in enumerate(new_line_split): - if re.fullmatch(pattern, ii) is not None: - if match_first == -1: - match_first = sidx - else: - if match_first != -1: - match_last = sidx - break - if match_first == -1: - raise RuntimeError( - f"cannot file model pattern {pattern} in line " f" {lmp_input_lines[idx]}" - ) - if match_last == -1: + match_idx = find_only_one_key(new_line_split, ['model.pb'], raise_not_found=False) + if match_idx is None: raise RuntimeError(f"last matching index should not be -1, terribly wrong ") - for ii in range(match_last, len(new_line_split)): - if re.fullmatch(pattern, new_line_split[ii]) is not None: - raise RuntimeError( - f"unexpected matching of model pattern {pattern} " - f"in line {lmp_input_lines[idx]}" - ) - new_line_split[match_first:match_last] = model_names - lmp_input_lines[idx] = " ".join(new_line_split) + "\n" - - with open(lmp_input_name, "w", encoding="utf8") as f: - f.write("".join(lmp_input_lines)) - - -def find_only_one_key(lmp_lines, key, raise_not_found=True): - found = [] - for idx in range(len(lmp_lines)): - words = lmp_lines[idx].split() - nkey = len(key) - if len(words) >= nkey and words[:nkey] == key: - found.append(idx) - if len(found) > 1: - raise RuntimeError("found %d keywords %s" % (len(found), key)) - if len(found) == 0: - if raise_not_found: - raise RuntimeError("failed to find keyword %s" % (key)) - else: - return None - return found[0] - - -def 
get_ele_temp(lmp_log_name): - with open(lmp_log_name, encoding="utf8") as f: - lmp_log_lines = f.readlines() - - for line in lmp_log_lines: - fields = line.split() - if fields[:2] == ["pair_style", "deepmd"]: - if "fparam" in fields: - # for rendering variables - try: - return float(fields[fields.index("fparam") + 1]) - except Exception: - pass - if "aparam" in fields: - try: - return float(fields[fields.index("aparam") + 1]) - except Exception: - pass + + for ii, model_name in enumerate(model_names): + new_line_split[match_idx] = model_name + + lmp_input_lines[idx] = " ".join(new_line_split) + "\n" - return None - - -def freeze_model(input_model, frozen_model, head=None): - freeze_args = "-o %s" % frozen_model - if head is not None: - freeze_args += " --head %s" % head - freeze_cmd = "dp --pt freeze -c %s %s" % (input_model, freeze_args) - ret, out, err = run_command(freeze_cmd, shell=True) - if ret != 0: - logging.error( - "".join( - ( - "freeze failed\n", - "command was", - freeze_cmd, - "out msg", - out, - "\n", - "err msg", - err, - "\n", - ) - ) - ) - raise TransientError("freeze failed") + with open(lmp_input_name + ".%d"%(ii), "w", encoding="utf8") as f: + f.write("".join(lmp_input_lines)) def merge_pimd_files(): @@ -423,40 +271,72 @@ def merge_pimd_files(): with open(model_devi_file, "r") as f2: f.write(f2.read()) + def calc_model_devi( traj_files, - fname = "model_devi.out", + fname="model_devi.out", ): - + + from ase.io import read # type: ignore trajectories = [] for f in traj_files: - traj = read(f, format='lammps-dump-text', index=':', order=True) + traj = read(f, format="lammps-dump-text", index=":", order=True) trajectories.append(traj) - + num_frames = len(trajectories[0]) for traj in trajectories: assert len(traj) == num_frames, "Not match" - + devi = [] for frame_idx in range(num_frames): frames = [traj[frame_idx] for traj in trajectories] - + all_forces = [atoms.get_forces() for atoms in frames] all_errors = [] - + for atom_idx in 
range(len(frames[0])): forces = [forces_arr[atom_idx] for forces_arr in all_forces] - + for a, b in itertools.combinations(forces, 2): error = np.linalg.norm(a - b) all_errors.append(error) - + max_error = np.max(all_errors) if all_errors else 0.0 min_error = np.min(all_errors) if all_errors else 0.0 - avg_error = np.mean(all_errors) if all_errors else 0.0 + avg_error = np.mean(all_errors) if all_errors else 0.0 # ase verion >= 3.26.0, please update ase using "pip install git+https://gitlab.com/ase/ase.git" - devi.append([trajectories[0][frame_idx].info['timestep'],0,0,0,max_error, min_error, avg_error,0]) - + devi.append( + [ + trajectories[0][frame_idx].info["timestep"], + 0, + 0, + 0, + max_error, + min_error, + avg_error, + 0, + ] + ) + devi = np.array(devi) - write_model_devi_out(devi, fname=fname) + write_model_devi_out(devi, fname=fname) + +def write_model_devi_out(devi: np.ndarray, fname: Union[str, Path], header: str = ""): + assert devi.shape[1] == 8 + header = "%s\n%10s" % (header, "step") + for item in "vf": + header += "%19s%19s%19s" % ( + f"max_devi_{item}", + f"min_devi_{item}", + f"avg_devi_{item}", + ) + with open(fname, "ab") as fp: + np.savetxt( + fp, + devi, + fmt=["%12d"] + ["%19.6e" for _ in range(devi.shape[1] - 1)], + delimiter="", + header=header, + ) + return devi \ No newline at end of file diff --git a/dpgen2/op/run_nvnmd_train.py b/dpgen2/op/run_nvnmd_train.py index 18acf117..2c0300f1 100644 --- a/dpgen2/op/run_nvnmd_train.py +++ b/dpgen2/op/run_nvnmd_train.py @@ -1,9 +1,9 @@ +import copy import glob import json import logging import os import shutil -import copy from pathlib import ( Path, ) @@ -35,9 +35,9 @@ ) from dpgen2.constants import ( - train_script_name, train_cnn_script_name, train_qnn_script_name, + train_script_name, train_task_pattern, ) from dpgen2.utils.chdir import ( @@ -46,6 +46,10 @@ from dpgen2.utils.run_command import ( run_command, ) +from dpgen2.op.run_dp_train import ( + RunDPTrain, + _expand_all_multi_sys_to_sys, 
+) def _make_train_command( @@ -55,42 +59,47 @@ def _make_train_command( init_model, train_args="", ): - # find checkpoint - if os.path.isfile("nvnmd_cnn/checkpoint") and not os.path.isfile("nvnmd_cnn/frozen_model.pb"): + if os.path.isfile("nvnmd_cnn/checkpoint") and not os.path.isfile( + "nvnmd_cnn/frozen_model.pb" + ): checkpoint = "nvnmd_cnn/model.ckpt" else: checkpoint = None - + # case of restart if checkpoint is not None: - command = dp_command + ["train-nvnmd", "--restart", checkpoint, train_script_name] + command = dp_command + [ + "train-nvnmd", + "--restart", + checkpoint, + train_script_name, + ] return command - + # case of init model assert checkpoint is None case_init_model = do_init_model if case_init_model: - - if isinstance(init_model, list): # initialize from model.ckpt - #init_model = ".".join(str(init_model[0]).split('.')[:-1]) + if isinstance(init_model, list): # initialize from model.ckpt + # init_model = ".".join(str(init_model[0]).split('.')[:-1]) for i in init_model: - if(os.path.exists(i)): - shutil.copy(i, '.') + if os.path.exists(i): + shutil.copy(i, ".") init_model = "model.ckpt" init_flag = "--init-model" - else: # initialize from frozen model + else: # initialize from frozen model init_flag = "--init-frz-model" - + command = dp_command + [ - "train-nvnmd", - init_flag, - str(init_model), - train_script_name, - ] + "train-nvnmd", + init_flag, + str(init_model), + train_script_name, + ] else: - command = dp_command + ["train-nvnmd", train_script_name] - + command = dp_command + ["train-nvnmd", train_script_name] + command += train_args.split() return command @@ -121,9 +130,6 @@ def get_input_sign(cls): ), "task_path": Artifact(Path), "init_model": Artifact(Path, optional=True), - "init_model_ckpt_meta": Artifact(Path, optional=True), - "init_model_ckpt_data": Artifact(Path, optional=True), - "init_model_ckpt_index": Artifact(Path, optional=True), "init_data": Artifact(NestedDict[Path]), "iter_data": Artifact(List[Path]), "valid_data": 
Artifact(NestedDict[Path], optional=True), @@ -136,11 +142,7 @@ def get_output_sign(cls): return OPIOSign( { "script": Artifact(Path), - "cnn_model": Artifact(Path), - "qnn_model": Artifact(Path), - "model_ckpt_data": Artifact(Path), - "model_ckpt_meta": Artifact(Path), - "model_ckpt_index": Artifact(Path), + "model": Artifact(Path), "lcurve": Artifact(Path), "log": Artifact(Path), } @@ -158,28 +160,19 @@ def execute( ip : dict Input dict with components: - - `config`: (`dict`) The config of training task. Check `RunNvNMDTrain.training_args` for definitions. + - `config`: (`dict`) The config of training task. Check `RunDPTrain.training_args` for definitions. - `task_name`: (`str`) The name of training task. - `task_path`: (`Artifact(Path)`) The path that contains all input files prepareed by `PrepDPTrain`. - - `init_model`: (`Artifact(Path)`) A frozen model to initialize the training. - - `init_model_ckpt_meta`: (`Artifact(Path)`, optional) The meta file of the frozen model. - - `init_model_ckpt_data`: (`Artifact(Path)`, optional) The data file of the frozen model. - - `init_model_ckpt_index`: (`Artifact(Path)`, optional) The index file of the frozen model. + - `init_model`: (`Artifact(Path)`) The checkpoint and frozen model to initialize the training. - `init_data`: (`Artifact(NestedDict[Path])`) Initial training data. - `iter_data`: (`Artifact(List[Path])`) Training data generated in the DPGEN iterations. - - `valid_data`: (`Artifact(NestedDict[Path])`, optional) Validation data. - - `optional_files`: (`Artifact(List[Path])`, optional) Optional files to be copied to the working directory. Returns ------- Any Output dict with components: - `script`: (`Artifact(Path)`) The training script. - - `cnn_model`: (`Artifact(Path)`) The trained continuous frozen model. - - `qnn_model`: (`Artifact(Path)`) The trained quantized frozen model. - - `model_ckpt_data`: (`Artifact(Path)`) The data file of the trained model. 
- - `model_ckpt_meta`: (`Artifact(Path)`) The meta file of the trained model. - - `model_ckpt_index`: (`Artifact(Path)`) The index file of the trained model. + - `model`: (`Artifact(Path)`) The trained continuous and quantized frozen model, the checkpoint model. - `lcurve`: (`Artifact(Path)`) The learning curve file. - `log`: (`Artifact(Path)`) The log file of training. @@ -192,13 +185,14 @@ def execute( config = ip["config"] if ip["config"] is not None else {} dp_command = ip["config"].get("command", "dp").split() train_args = config.get("train_args", "") - config = RunNvNMDTrain.normalize_config(config) + config = RunDPTrain.normalize_config(config) task_name = ip["task_name"] task_path = ip["task_path"] init_model = ip["init_model"] - init_model_ckpt_data = ip["init_model_ckpt_data"] - init_model_ckpt_meta = ip["init_model_ckpt_meta"] - init_model_ckpt_index = ip["init_model_ckpt_index"] + init_frz_model = ip["init_model"] / "frozen_model.pb" if init_model else None + init_model_ckpt_data = ip["init_model"] / "model.ckpt.data-00000-of-00001" if init_model else None + init_model_ckpt_meta = ip["init_model"] / "model.ckpt.meta" if init_model else None + init_model_ckpt_index = ip["init_model"] / "model.ckpt.index" if init_model else None init_data = ip["init_data"] iter_data = ip["iter_data"] valid_data = ip["valid_data"] @@ -217,10 +211,14 @@ def execute( major_version = "2" # auto prob style - init_model_ckpt = [init_model_ckpt_meta, init_model_ckpt_data, init_model_ckpt_index] - do_init_model = RunNvNMDTrain.decide_init_model( + init_model_ckpt = [ + init_model_ckpt_meta, + init_model_ckpt_data, + init_model_ckpt_index, + ] + do_init_model = RunDPTrain.decide_init_model( config, - init_model_ckpt if init_model_ckpt_data is not None else init_model, + init_model_ckpt if init_model_ckpt_data is not None else init_frz_model, init_data, iter_data, mixed_type=mixed_type, @@ -234,7 +232,7 @@ def execute( auto_prob_str = f"prob_sys_size; 0:{numb_old}:{old_ratio}; 
{numb_old}:{numb_new}:{1.-old_ratio:g}" # update the input dict - train_dict = RunNvNMDTrain.write_data_to_input_script( + train_dict = RunDPTrain.write_data_to_input_script( train_dict, config, init_data, @@ -243,11 +241,19 @@ def execute( major_version, valid_data, ) - train_cnn_dict = RunNvNMDTrain.write_other_to_input_script( - train_dict, config, do_init_model, False, major_version, + train_cnn_dict = RunDPTrain.write_other_to_input_script( + train_dict, + config, + do_init_model, + major_version, + False ) - train_qnn_dict = RunNvNMDTrain.write_other_to_input_script( - train_dict, config, do_init_model, True, major_version, + train_qnn_dict = RunDPTrain.write_other_to_input_script( + train_dict, + config, + do_init_model, + major_version, + True, ) with set_directory(work_dir): @@ -258,9 +264,13 @@ def clean_before_quit(): fplog.close() # dump train script - with open(train_cnn_script_name, "w") as fp: + + with open(train_script_name, "w") as fp: json.dump(train_cnn_dict, fp, indent=4) + with open(train_cnn_script_name, "w") as fp: + json.dump(train_cnn_dict, fp, indent=4) + with open(train_qnn_script_name, "w") as fp: json.dump(train_qnn_dict, fp, indent=4) @@ -274,11 +284,11 @@ def clean_before_quit(): train_cnn_script_name, do_init_model, init_model_ckpt if init_model_ckpt_data is not None else init_model, - train_args = "-s s1", + train_args="-s s1", ) - if not RunNvNMDTrain.skip_training( - work_dir, train_dict, init_model, iter_data + if not RunDPTrain.skip_training( + work_dir, train_dict, init_model, iter_data, None ): ret, out, err = run_command(command) if ret != 0: @@ -297,31 +307,39 @@ def clean_before_quit(): ) ) raise FatalError("dp train-nvnmd -s s1 failed") - fplog.write("#=================== train_cnn std out ===================\n") + fplog.write( + "#=================== train_cnn std out ===================\n" + ) fplog.write(out) - fplog.write("#=================== train_cnn std err ===================\n") + fplog.write( + 
"#=================== train_cnn std err ===================\n" + ) fplog.write(err) - + cnn_model_file = "nvnmd_cnn/frozen_model.pb" model_ckpt_data_file = "nvnmd_cnn/model.ckpt.data-00000-of-00001" model_ckpt_index_file = "nvnmd_cnn/model.ckpt.index" model_ckpt_meta_file = "nvnmd_cnn/model.ckpt.meta" lcurve_file = "nvnmd_cnn/lcurve.out" - + + if os.path.exists("input_v2_compat.json"): + shutil.copy2("input_v2_compat.json", train_script_name) + + else: cnn_model_file = init_model model_ckpt_data_file = "" model_ckpt_index_file = "" model_ckpt_meta_file = "" lcurve_file = "nvnmd_qnn/lcurve.out" - + # train qnn model command = _make_train_command( dp_command, train_qnn_script_name, do_init_model, init_model_ckpt if init_model_ckpt_data is not None else init_model, - train_args = "-s s2", + train_args="-s s2", ) ret, out, err = run_command(command) @@ -345,305 +363,34 @@ def clean_before_quit(): fplog.write(out) fplog.write("#=================== train_qnn std err ===================\n") fplog.write(err) - + qnn_model_file = "nvnmd_qnn/model.pb" - if os.path.exists("input_v2_compat.json"): - shutil.copy2("input_v2_compat.json", train_script_name) - clean_before_quit() + + # copy all models files to the output directory + os.makedirs("nvnmd_models", exist_ok=True) + if os.path.exists(cnn_model_file): + shutil.copy(cnn_model_file, "nvnmd_models") + if os.path.exists(qnn_model_file): + shutil.copy(qnn_model_file, "nvnmd_models") + if os.path.exists(model_ckpt_meta_file): + shutil.copy(model_ckpt_meta_file, "nvnmd_models") + if os.path.exists(model_ckpt_data_file): + shutil.copy(model_ckpt_data_file, "nvnmd_models") + if os.path.exists(model_ckpt_index_file): + shutil.copy(model_ckpt_index_file, "nvnmd_models") + + model_files = "nvnmd_models" return OPIO( { - "script": work_dir / train_cnn_script_name, - "cnn_model": work_dir / cnn_model_file, - "qnn_model": work_dir / qnn_model_file, - "model_ckpt_data": work_dir / model_ckpt_data_file, - "model_ckpt_meta": work_dir / 
model_ckpt_meta_file, - "model_ckpt_index": work_dir / model_ckpt_index_file, + "script": work_dir / train_script_name, + "model": work_dir / model_files, "lcurve": work_dir / lcurve_file, "log": work_dir / "train.log", } ) - @staticmethod - def write_data_to_input_script( - idict: dict, - config, - init_data: Union[List[Path], Dict[str, List[Path]]], - iter_data: List[Path], - auto_prob_str: str = "prob_sys_size", - major_version: str = "2", - valid_data: Optional[Union[List[Path], Dict[str, List[Path]]]] = None, - ): - odict = idict.copy() - - data_list = [str(ii) for ii in init_data] + [str(ii) for ii in iter_data] - if major_version == "1": - # v1 behavior - odict["training"]["systems"] = data_list - odict["training"].setdefault("batch_size", "auto") - odict["training"]["auto_prob_style"] = auto_prob_str - if valid_data is not None: - odict["training"]["validation_data"] = { - "systems": [str(ii) for ii in valid_data], - "batch_size": 1, - } - elif major_version == "2": - # v2 behavior - odict["training"]["training_data"]["systems"] = data_list - odict["training"]["training_data"].setdefault("batch_size", "auto") - odict["training"]["training_data"]["auto_prob"] = auto_prob_str - if valid_data is None: - odict["training"].pop("validation_data", None) - else: - odict["training"]["validation_data"] = { - "systems": [str(ii) for ii in valid_data], - "batch_size": 1, - } - else: - raise RuntimeError("unsupported DeePMD-kit major version", major_version) - return odict - - @staticmethod - def write_other_to_input_script( - idict, - config, - do_init_model, - train_qnn_model: bool = False, - major_version: str = "1", - ): - odict = copy.deepcopy(idict) - odict["training"]["disp_file"] = "lcurve.out" - odict["training"]["save_ckpt"] = "model.ckpt" - if do_init_model: - odict["learning_rate"]["start_lr"] = config["init_model_start_lr"] - if "loss_dict" in odict: - for v in odict["loss_dict"].values(): - if isinstance(v, dict): - v["start_pref_e"] = 
config["init_model_start_pref_e"] - v["start_pref_f"] = config["init_model_start_pref_f"] - v["start_pref_v"] = config["init_model_start_pref_v"] - else: - odict["loss"]["start_pref_e"] = config["init_model_start_pref_e"] - odict["loss"]["start_pref_f"] = config["init_model_start_pref_f"] - odict["loss"]["start_pref_v"] = config["init_model_start_pref_v"] - if major_version == "1": - odict["training"]["stop_batch"] = config["init_model_numb_steps"] - elif major_version == "2": - odict["training"]["numb_steps"] = config["init_model_numb_steps"] - else: - raise RuntimeError( - "unsupported DeePMD-kit major version", major_version - ) - if train_qnn_model: - odict["learning_rate"]["start_lr"] = config["init_model_start_lr"] - if "loss_dict" in odict: - for v in odict["loss_dict"].values(): - if isinstance(v, dict): - v["start_pref_e"] = 1 - v["start_pref_f"] = 1 - v["start_pref_v"] = 1 - if major_version == "1": - odict["training"]["stop_batch"] = 0 - elif major_version == "2": - odict["training"]["numb_steps"] = 0 - return odict - - @staticmethod - def skip_training( - work_dir, - train_dict, - init_model, - iter_data, - ): - # we have init model and no iter data, skip training - if (init_model is not None) and (iter_data is None or len(iter_data) == 0): - with set_directory(work_dir): - with open(train_script_name, "w") as fp: - json.dump(train_dict, fp, indent=4) - Path("train.log").write_text( - f"We have init model {init_model} and " - f"no iteration training data. 
" - f"The training is skipped.\n" - ) - Path("lcurve.out").touch() - return True - else: - return False - - @staticmethod - def decide_init_model( - config, - init_model, - init_data, - iter_data, - mixed_type=False, - ): - do_init_model = False - # decide if we do init-model - ## cases we do definitely not - if init_model is None or iter_data is None or len(iter_data) == 0: - do_init_model = False - ## cases controlled by the policy - else: - if config["init_model_policy"] == "no": - do_init_model = False - elif config["init_model_policy"] == "yes": - do_init_model = True - elif "old_data_larger_than" in config["init_model_policy"]: - old_data_size_level = int(config["init_model_policy"].split(":")[-1]) - if isinstance(init_data, dict): - init_data_size = _get_data_size_of_all_systems( - sum(init_data.values(), []) - ) - else: - init_data_size = _get_data_size_of_all_systems(init_data) - iter_data_old_size = _get_data_size_of_all_mult_sys( - iter_data[:-1], mixed_type=mixed_type - ) - old_data_size = init_data_size + iter_data_old_size - if old_data_size > old_data_size_level: - do_init_model = True - return do_init_model - - @staticmethod - def training_args(): - doc_command = "The command for DP, 'dp' for default" - doc_init_model_policy = "The policy of init-model training. It can be\n\n\ - - 'no': No init-model training. Traing from scratch.\n\n\ - - 'yes': Do init-model training.\n\n\ - - 'old_data_larger_than:XXX': Do init-model if the training data size of the previous model is larger than XXX. XXX is an int number." 
- doc_init_model_old_ratio = "The frequency ratio of old data over new data" - doc_init_model_numb_steps = "The number of training steps when init-model" - doc_init_model_start_lr = "The start learning rate when init-model" - doc_init_model_start_pref_e = ( - "The start energy prefactor in loss when init-model" - ) - doc_init_model_start_pref_f = ( - "The start force prefactor in loss when init-model" - ) - doc_init_model_start_pref_v = ( - "The start virial prefactor in loss when init-model" - ) - doc_train_args = "Extra arguments for dp train" - return [ - Argument( - "command", - str, - optional=True, - default="dp", - doc=doc_command, - ), - Argument( - "init_model_policy", - str, - optional=True, - default="no", - doc=doc_init_model_policy, - ), - Argument( - "init_model_old_ratio", - float, - optional=True, - default=0.9, - doc=doc_init_model_old_ratio, - ), - Argument( - "init_model_numb_steps", - int, - optional=True, - default=400000, - doc=doc_init_model_numb_steps, - alias=["init_model_stop_batch"], - ), - Argument( - "init_model_start_lr", - float, - optional=True, - default=1e-4, - doc=doc_init_model_start_lr, - ), - Argument( - "init_model_start_pref_e", - float, - optional=True, - default=0.1, - doc=doc_init_model_start_pref_e, - ), - Argument( - "init_model_start_pref_f", - float, - optional=True, - default=100, - doc=doc_init_model_start_pref_f, - ), - Argument( - "init_model_start_pref_v", - float, - optional=True, - default=0.0, - doc=doc_init_model_start_pref_v, - ), - Argument( - "train_args", - str, - optional=True, - default="", - doc=doc_train_args, - ), - ] - - @staticmethod - def normalize_config(data={}): - ta = RunNvNMDTrain.training_args() - - base = Argument("base", dict, ta) - data = base.normalize_value(data, trim_pattern="_*") - base.check_value(data, strict=True) - - return data - - -def _get_data_size_of_system(data_dir): - ss = dpdata.System(data_dir, fmt="deepmd/npy") - return ss.get_nframes() - - -def 
_get_data_size_of_all_systems(data_dirs): - count = 0 - for ii in data_dirs: - count += _get_data_size_of_system(ii) - return count - - -def _get_data_size_of_mult_sys(data_dir, mixed_type=False): - ms = dpdata.MultiSystems() - if mixed_type: - ms.from_deepmd_npy_mixed(data_dir) # type: ignore - else: - ms.from_deepmd_npy(data_dir) # type: ignore - return ms.get_nframes() - - -def _get_data_size_of_all_mult_sys(data_dirs, mixed_type=False): - count = 0 - for ii in data_dirs: - count += _get_data_size_of_mult_sys(ii, mixed_type) - return count - - -def _expand_multi_sys_to_sys(multi_sys_dir): - all_type_raws = sorted(glob.glob(os.path.join(multi_sys_dir, "*", "type.raw"))) - all_sys_dirs = [str(Path(ii).parent) for ii in all_type_raws] - return all_sys_dirs - - -def _expand_all_multi_sys_to_sys(list_multi_sys): - all_sys_dirs = [] - for ii in list_multi_sys: - all_sys_dirs = all_sys_dirs + _expand_multi_sys_to_sys(ii) - return all_sys_dirs - -config_args = RunNvNMDTrain.training_args +config_args = RunDPTrain.training_args diff --git a/dpgen2/superop/__init__.py b/dpgen2/superop/__init__.py index cfddabdd..0223605f 100644 --- a/dpgen2/superop/__init__.py +++ b/dpgen2/superop/__init__.py @@ -10,9 +10,6 @@ from .prep_run_dp_train import ( PrepRunDPTrain, ) -from .prep_run_nvnmd_train import ( - PrepRunNvNMDTrain, -) from .prep_run_fp import ( PrepRunFp, ) diff --git a/dpgen2/superop/block.py b/dpgen2/superop/block.py index c8940b92..0e39ab38 100644 --- a/dpgen2/superop/block.py +++ b/dpgen2/superop/block.py @@ -58,9 +58,6 @@ from .prep_run_dp_train import ( PrepRunDPTrain, ) -from .prep_run_nvnmd_train import ( - PrepRunNvNMDTrain, -) from .prep_run_fp import ( PrepRunFp, ) @@ -91,7 +88,7 @@ class ConcurrentLearningBlock(Steps): def __init__( self, name: str, - prep_run_dp_train_op: Union[PrepRunDPTrain, PrepRunNvNMDTrain], + prep_run_dp_train_op: PrepRunDPTrain, prep_run_explore_op: Union[PrepRunLmp, PrepRunCaly, PrepRunDiffCSP], select_confs_op: Type[OP], 
prep_run_fp_op: PrepRunFp, @@ -116,9 +113,6 @@ def __init__( } self._input_artifacts = { "init_models": InputArtifact(optional=True), - "init_models_ckpt_index": InputArtifact(optional=True), - "init_models_ckpt_data": InputArtifact(optional=True), - "init_models_ckpt_meta": InputArtifact(optional=True), "init_data": InputArtifact(), "iter_data": InputArtifact(), } @@ -127,9 +121,6 @@ def __init__( } self._output_artifacts = { "models": OutputArtifact(), - "models_ckpt_index": OutputArtifact(optional=True), - "models_ckpt_data": OutputArtifact(optional=True), - "models_ckpt_meta": OutputArtifact(optional=True), "iter_data": OutputArtifact(), "trajs": OutputArtifact(), } @@ -221,55 +212,25 @@ def _block_cl( block_steps.inputs.parameters["optional_parameter"] ) - if isinstance(prep_run_dp_train_op, PrepRunNvNMDTrain): - prep_run_dp_train = Step( - name + "-prep-run-nvnmd-train", - template=prep_run_dp_train_op, - parameters={ - "block_id": block_steps.inputs.parameters["block_id"], - "train_config": block_steps.inputs.parameters["train_config"], - "numb_models": block_steps.inputs.parameters["numb_models"], - "template_script": block_steps.inputs.parameters["template_script"], - "run_optional_parameter": run_dp_train_optional_parameter, - }, - artifacts={ - "init_models": block_steps.inputs.artifacts["init_models"], - "init_models_ckpt_index": block_steps.inputs.artifacts[ - "init_models_ckpt_index" - ], - "init_models_ckpt_data": block_steps.inputs.artifacts[ - "init_models_ckpt_data" - ], - "init_models_ckpt_meta": block_steps.inputs.artifacts[ - "init_models_ckpt_meta" - ], - "init_data": block_steps.inputs.artifacts["init_data"], - "iter_data": block_steps.inputs.artifacts["iter_data"], - }, - key="--".join( - ["%s" % block_steps.inputs.parameters["block_id"], "prep-run-train"] - ), - ) - else: - prep_run_dp_train = Step( - name + "-prep-run-dp-train", - template=prep_run_dp_train_op, - parameters={ - "block_id": block_steps.inputs.parameters["block_id"], - 
"train_config": block_steps.inputs.parameters["train_config"], - "numb_models": block_steps.inputs.parameters["numb_models"], - "template_script": block_steps.inputs.parameters["template_script"], - "run_optional_parameter": run_dp_train_optional_parameter, - }, - artifacts={ - "init_models": block_steps.inputs.artifacts["init_models"], - "init_data": block_steps.inputs.artifacts["init_data"], - "iter_data": block_steps.inputs.artifacts["iter_data"], - }, - key="--".join( - ["%s" % block_steps.inputs.parameters["block_id"], "prep-run-train"] - ), - ) + prep_run_dp_train = Step( + name + "-prep-run-dp-train", + template=prep_run_dp_train_op, + parameters={ + "block_id": block_steps.inputs.parameters["block_id"], + "train_config": block_steps.inputs.parameters["train_config"], + "numb_models": block_steps.inputs.parameters["numb_models"], + "template_script": block_steps.inputs.parameters["template_script"], + "run_optional_parameter": run_dp_train_optional_parameter, + }, + artifacts={ + "init_models": block_steps.inputs.artifacts["init_models"], + "init_data": block_steps.inputs.artifacts["init_data"], + "iter_data": block_steps.inputs.artifacts["iter_data"], + }, + key="--".join( + ["%s" % block_steps.inputs.parameters["block_id"], "prep-run-train"] + ), + ) block_steps.add(prep_run_dp_train) prep_run_explore = Step( @@ -282,9 +243,7 @@ def _block_cl( "type_map": block_steps.inputs.parameters["type_map"], }, artifacts={ - "models": prep_run_dp_train.outputs.artifacts["nvnmodels"] - if isinstance(prep_run_dp_train_op, PrepRunNvNMDTrain) - else prep_run_dp_train.outputs.artifacts["models"] + "models": prep_run_dp_train.outputs.artifacts["models"], }, key="--".join( ["%s" % block_steps.inputs.parameters["block_id"], "prep-run-explore"] @@ -363,10 +322,6 @@ def _block_cl( block_steps.outputs.artifacts["models"]._from = prep_run_dp_train.outputs.artifacts[ "models" ] - if isinstance(prep_run_dp_train_op, PrepRunNvNMDTrain): - 
block_steps.outputs.artifacts["models_ckpt_meta"]._from = prep_run_dp_train.outputs.artifacts["models_ckpt_meta"] - block_steps.outputs.artifacts["models_ckpt_data"]._from = prep_run_dp_train.outputs.artifacts["models_ckpt_data"] - block_steps.outputs.artifacts["models_ckpt_index"]._from = prep_run_dp_train.outputs.artifacts["models_ckpt_index"] block_steps.outputs.artifacts["iter_data"]._from = collect_data.outputs.artifacts[ "iter_data" ] diff --git a/dpgen2/superop/prep_run_dp_train.py b/dpgen2/superop/prep_run_dp_train.py index 0fd988e4..72a60b41 100644 --- a/dpgen2/superop/prep_run_dp_train.py +++ b/dpgen2/superop/prep_run_dp_train.py @@ -47,6 +47,7 @@ ) from dpgen2.op import ( RunDPTrain, + RunNvNMDTrain, ) from dpgen2.utils.step_config import ( init_executor, @@ -59,7 +60,7 @@ def __init__( self, name: str, prep_train_op: Type[OP], - run_train_op: Type[RunDPTrain], + run_train_op: Type[OP], prep_config: Optional[dict] = None, run_config: Optional[dict] = None, upload_python_packages: Optional[List[os.PathLike]] = None, @@ -150,7 +151,7 @@ def _prep_run_dp_train( train_steps, step_keys, prep_train_op: Type[OP], - run_train_op: Type[RunDPTrain], + run_train_op: Type[OP], prep_config: dict = normalize_step_dict({}), run_config: dict = normalize_step_dict({}), upload_python_packages: Optional[List[os.PathLike]] = None, diff --git a/dpgen2/superop/prep_run_nvnmd_train.py b/dpgen2/superop/prep_run_nvnmd_train.py deleted file mode 100644 index 21bd376c..00000000 --- a/dpgen2/superop/prep_run_nvnmd_train.py +++ /dev/null @@ -1,253 +0,0 @@ -import json -import os -from copy import ( - deepcopy, -) -from pathlib import ( - Path, -) -from typing import ( - List, - Optional, - Set, - Type, -) - -from dflow import ( - InputArtifact, - InputParameter, - Inputs, - OutputArtifact, - OutputParameter, - Outputs, - S3Artifact, - Step, - Steps, - Workflow, - argo_len, - argo_range, - argo_sequence, - download_artifact, - upload_artifact, -) -from dflow.python import ( - OP, - 
OPIO, - Artifact, - BigParameter, - OPIOSign, - PythonOPTemplate, - Slices, -) - -from dpgen2.constants import ( - train_index_pattern, - train_script_name, - train_task_pattern, -) -from dpgen2.op import ( - RunNvNMDTrain, -) -from dpgen2.utils.step_config import ( - init_executor, -) -from dpgen2.utils.step_config import normalize as normalize_step_dict - - -class PrepRunNvNMDTrain(Steps): - def __init__( - self, - name: str, - prep_train_op: Type[OP], - run_train_op: Type[RunNvNMDTrain], - prep_config: Optional[dict] = None, - run_config: Optional[dict] = None, - upload_python_packages: Optional[List[os.PathLike]] = None, - valid_data: Optional[S3Artifact] = None, - optional_files: Optional[List[str]] = None, - ): - prep_config = normalize_step_dict({}) if prep_config is None else prep_config - run_config = normalize_step_dict({}) if run_config is None else run_config - self._input_parameters = { - "block_id": InputParameter(type=str, value=""), - "numb_models": InputParameter(type=int), - "template_script": InputParameter(), - "train_config": InputParameter(), - "run_optional_parameter": InputParameter( - type=dict, value=run_train_op.default_optional_parameter - ), - } - self._input_artifacts = { - "init_models": InputArtifact(optional=True), - "init_models_ckpt_data": InputArtifact(optional=True), - "init_models_ckpt_index": InputArtifact(optional=True), - "init_models_ckpt_meta": InputArtifact(optional=True), - "init_data": InputArtifact(), - "iter_data": InputArtifact(), - } - self._output_parameters = { - "template_script": OutputParameter(), - } - self._output_artifacts = { - "scripts": OutputArtifact(), - "models": OutputArtifact(), - "nvnmodels": OutputArtifact(), - "models_ckpt_meta": OutputArtifact(optional=True), - "models_ckpt_data": OutputArtifact(optional=True), - "models_ckpt_index": OutputArtifact(optional=True), - "logs": OutputArtifact(), - "lcurves": OutputArtifact(), - } - - super().__init__( - name=name, - inputs=Inputs( - 
parameters=self._input_parameters, - artifacts=self._input_artifacts, - ), - outputs=Outputs( - parameters=self._output_parameters, - artifacts=self._output_artifacts, - ), - ) - - self._keys = ["prep-train", "run-train"] - self.step_keys = {} - ii = "prep-train" - self.step_keys[ii] = "--".join(["%s" % self.inputs.parameters["block_id"], ii]) - ii = "run-train" - self.step_keys[ii] = "--".join( - ["%s" % self.inputs.parameters["block_id"], ii + "-{{item}}"] - ) - - self = _prep_run_nvnmd_train( - self, - self.step_keys, - prep_train_op, - run_train_op, - prep_config=prep_config, - run_config=run_config, - upload_python_packages=upload_python_packages, - valid_data=valid_data, - optional_files=optional_files, - ) - - @property - def input_parameters(self): - return self._input_parameters - - @property - def input_artifacts(self): - return self._input_artifacts - - @property - def output_parameters(self): - return self._output_parameters - - @property - def output_artifacts(self): - return self._output_artifacts - - @property - def keys(self): - return self._keys - - -def _prep_run_nvnmd_train( - train_steps, - step_keys, - prep_train_op: Type[OP], - run_train_op: Type[RunNvNMDTrain], - prep_config: dict = normalize_step_dict({}), - run_config: dict = normalize_step_dict({}), - upload_python_packages: Optional[List[os.PathLike]] = None, - valid_data: Optional[S3Artifact] = None, - optional_files: Optional[List[str]] = None, -): - prep_config = deepcopy(prep_config) - run_config = deepcopy(run_config) - prep_template_config = prep_config.pop("template_config") - run_template_config = run_config.pop("template_config") - prep_executor = init_executor(prep_config.pop("executor")) - run_executor = init_executor(run_config.pop("executor")) - template_slice_config = run_config.pop("template_slice_config", {}) - - prep_train = Step( - "prep-train", - template=PythonOPTemplate( - prep_train_op, - output_artifact_archive={"task_paths": None}, - 
python_packages=upload_python_packages, - **prep_template_config, - ), - parameters={ - "numb_models": train_steps.inputs.parameters["numb_models"], - "template_script": train_steps.inputs.parameters["template_script"], - }, - artifacts={}, - key=step_keys["prep-train"], - executor=prep_executor, - **prep_config, - ) - train_steps.add(prep_train) - - run_train = Step( - "run-train", - template=PythonOPTemplate( - run_train_op, - slices=Slices( - "int('{{item}}')", - input_parameter=["task_name"], - input_artifact=["task_path", "init_model", "init_model_ckpt_data", "init_model_ckpt_index", "init_model_ckpt_meta"], - output_artifact=["cnn_model", "qnn_model", "model_ckpt_meta", "model_ckpt_data", "model_ckpt_index", "lcurve", "log", "script"], - **template_slice_config, - ), - python_packages=upload_python_packages, - **run_template_config, - ), - parameters={ - "config": train_steps.inputs.parameters["train_config"], - "task_name": prep_train.outputs.parameters["task_names"], - "optional_parameter": train_steps.inputs.parameters[ - "run_optional_parameter" - ], - }, - artifacts={ - "task_path": prep_train.outputs.artifacts["task_paths"], - "init_model": train_steps.inputs.artifacts["init_models"], - "init_model_ckpt_meta": train_steps.inputs.artifacts["init_models_ckpt_meta"], - "init_model_ckpt_data": train_steps.inputs.artifacts["init_models_ckpt_data"], - "init_model_ckpt_index": train_steps.inputs.artifacts["init_models_ckpt_index"], - "init_data": train_steps.inputs.artifacts["init_data"], - "iter_data": train_steps.inputs.artifacts["iter_data"], - "valid_data": valid_data, - "optional_files": upload_artifact(optional_files) - if optional_files is not None - else None, - }, - with_sequence=argo_sequence( - argo_len(prep_train.outputs.parameters["task_names"]), - format=train_index_pattern, - ), - # with_param=argo_range(train_steps.inputs.parameters["numb_models"]), - key=step_keys["run-train"], - executor=run_executor, - **run_config, - ) - 
train_steps.add(run_train) - - train_steps.outputs.parameters[ - "template_script" - ].value_from_parameter = train_steps.inputs.parameters["template_script"] - train_steps.outputs.artifacts["scripts"]._from = run_train.outputs.artifacts[ - "script" - ] - train_steps.outputs.artifacts["models"]._from = run_train.outputs.artifacts["cnn_model"] - train_steps.outputs.artifacts["nvnmodels"]._from = run_train.outputs.artifacts["qnn_model"] - train_steps.outputs.artifacts["models_ckpt_meta"]._from = run_train.outputs.artifacts["model_ckpt_meta"] - train_steps.outputs.artifacts["models_ckpt_data"]._from = run_train.outputs.artifacts["model_ckpt_data"] - train_steps.outputs.artifacts["models_ckpt_index"]._from = run_train.outputs.artifacts["model_ckpt_index"] - train_steps.outputs.artifacts["logs"]._from = run_train.outputs.artifacts["log"] - train_steps.outputs.artifacts["lcurves"]._from = run_train.outputs.artifacts["lcurve"] - - return train_steps diff --git a/dpgen2/utils/download_dpgen2_artifacts.py b/dpgen2/utils/download_dpgen2_artifacts.py index db4b7e6c..8ab4cb61 100644 --- a/dpgen2/utils/download_dpgen2_artifacts.py +++ b/dpgen2/utils/download_dpgen2_artifacts.py @@ -54,16 +54,10 @@ def add_output( op_download_setting = { "prep-run-train": DownloadDefinition() .add_input("init_models") - .add_input("init_models_ckpt_meta") - .add_input("init_models_ckpt_data") - .add_input("init_models_ckpt_index") .add_input("init_data") .add_input("iter_data") .add_output("scripts") .add_output("models") - .add_output("models_ckpt_meta") - .add_output("models_ckpt_data") - .add_output("models_ckpt_index") .add_output("logs") .add_output("lcurves"), "prep-run-explore": DownloadDefinition() @@ -365,9 +359,9 @@ def _dl_step_item( [step_key, io, name] = item.split(global_step_def_split) pref = _item_path(prefix, item) if io in ["input"]: - target = step.inputs.artifacts[name] + target = step.inputs.artifacts.get(name) elif io in ["output"]: - target = step.outputs.artifacts[name] + 
target = step.outputs.artifacts.get(name) else: raise RuntimeError("unknown io style {io}") try: diff --git a/tests/mocked_ops.py b/tests/mocked_ops.py index 543e69d7..9bec0e29 100644 --- a/tests/mocked_ops.py +++ b/tests/mocked_ops.py @@ -41,10 +41,7 @@ lmp_task_pattern, lmp_traj_name, model_name_pattern, - model_ckpt_pattern, - model_ckpt_meta_pattern, - model_ckpt_data_pattern, - model_ckpt_index_pattern, + nvnmd_model_name_pattern, train_log_name, train_script_name, train_task_pattern, @@ -96,15 +93,15 @@ from dpgen2.op.run_dp_train import ( RunDPTrain, ) -from dpgen2.op.run_nvnmd_train import ( - RunNvNMDTrain, -) from dpgen2.op.run_lmp import ( RunLmp, ) from dpgen2.op.run_nvnmd import ( RunNvNMD, ) +from dpgen2.op.run_nvnmd_train import ( + RunNvNMDTrain, +) from dpgen2.op.select_confs import ( SelectConfs, ) @@ -125,19 +122,16 @@ def make_mocked_init_models(numb_models): return tmp_models -def make_mocked_init_models_ckpt(numb_models): - tmp_models_ckpt = [] +def make_mocked_init_nvnmd_models(numb_models): + tmp_models = [] for ii in range(numb_models): - dir = Path(model_ckpt_pattern %ii) - dir.mkdir(exist_ok=True, parents=True) - ff_meta = Path(model_ckpt_meta_pattern % ii) - ff_meta.write_text(f"This is init model ckpt meta {ii}") - ff_data = Path(model_ckpt_data_pattern % ii) - ff_data.write_text(f"This is init model ckpt data {ii}") - ff_index = Path(model_ckpt_index_pattern % ii) - ff_index.write_text(f"This is init model ckpt index {ii}") - tmp_models_ckpt.append(dir) - return tmp_models_ckpt + nvnmd_models_dir = Path(nvnmd_model_name_pattern % ii) + nvnmd_models_dir.mkdir(exist_ok=True, parents=True) + for jj in ("frozen_model.pb", "model.ckpt.meta", "model.ckpt.data", "model.ckpt.index"): + ff = nvnmd_models_dir / jj + ff.write_text(f"This is init {jj} {ii}") + tmp_models.append(nvnmd_models_dir) + return tmp_models def make_mocked_init_data(): @@ -418,25 +412,26 @@ def execute( ) -> OPIO: work_dir = Path(ip["task_name"]) script = ip["task_path"] / 
"input.json" - init_model = Path(ip["init_model"]) - init_model_ckpt_meta = Path(ip["init_model_ckpt_meta"]) - init_model_ckpt_data = Path(ip["init_model_ckpt_data"]) - init_model_ckpt_index = Path(ip["init_model_ckpt_index"]) + init_model = ip["init_model"] init_data = ip["init_data"] iter_data = ip["iter_data"] assert script.is_file() assert ip["task_path"].is_dir() - assert init_model.is_file() - assert init_model_ckpt_meta.is_file() - assert init_model_ckpt_data.is_file() - assert init_model_ckpt_index.is_file() + assert ip["init_model"].is_dir() assert len(init_data) == 2 assert re.match("task.[0-9][0-9][0-9][0-9]", ip["task_name"]) task_id = int(ip["task_name"].split(".")[1]) assert ip["task_name"] in str(ip["task_path"]) - assert "model" in str(ip["init_model"]) - assert ".pb" in str(ip["init_model"]) + init_frz_model = ip["init_model"] / "frozen_model.pb" + init_model_ckpt_data = ip["init_model"] / "model.ckpt.data" + init_model_ckpt_meta = ip["init_model"] / "model.ckpt.meta" + init_model_ckpt_index = ip["init_model"] / "model.ckpt.index" + + assert ".pb" in str(init_frz_model) + assert "ckpt.meta" in str(init_model_ckpt_meta) + assert "ckpt.data" in str(init_model_ckpt_data) + assert "ckpt.index" in str(init_model_ckpt_index) list_init_data = sorted([str(ii) for ii in init_data]) assert "init_data/bar" in list_init_data[0] assert "init_data/foo" in list_init_data[1] @@ -445,9 +440,9 @@ def execute( script = Path(script).resolve() init_model = init_model.resolve() - init_model_str = str(init_model) - init_model_ckpt_meta = init_model_ckpt_meta.resolve() + init_frz_model = init_frz_model.resolve() init_model_ckpt_data = init_model_ckpt_data.resolve() + init_model_ckpt_meta = init_model_ckpt_meta.resolve() init_model_ckpt_index = init_model_ckpt_index.resolve() init_data = [ii.resolve() for ii in init_data] iter_data = [ii.resolve() for ii in iter_data] @@ -469,36 +464,32 @@ def execute( work_dir.mkdir(exist_ok=True, parents=True) os.chdir(work_dir) - 
oscript = Path("input.json") - if not oscript.exists(): - from shutil import ( - copyfile, - ) + for script_str in ["input.json", "input_cnn.json", "input_qnn.json"]: + oscript = Path(script_str) + if not oscript.exists(): + from shutil import ( + copyfile, + ) - copyfile(script, oscript) - + copyfile(script, oscript) + + oscript = Path("input.json") cnn_dir = Path("nvnmd_cnn") qnn_dir = Path("nvnmd_qnn") cnn_model = cnn_dir / Path("frozen_model.pb") qnn_model = qnn_dir / Path("model.pb") - model_ckpt_meta_file = cnn_dir / Path("model.ckpt.meta") - model_ckpt_data_file = cnn_dir / Path("model.ckpt.data-00000-of-00001") + model_ckpt_meta_file = cnn_dir / Path("model.ckpt.meta") + model_ckpt_data_file = cnn_dir / Path("model.ckpt.data-00000-of-00001") model_ckpt_index_file = cnn_dir / Path("model.ckpt.index") lcurve = cnn_dir / Path("lcurve.out") log = Path("log") - assert init_model.exists() - with log.open("w") as f: - f.write(f"init_model {str(init_model)} OK\n") + assert init_frz_model.exists() assert init_model_ckpt_meta.exists() - with log.open("a") as f: - f.write(f"init_model_ckpt_meta {str(init_model_ckpt_meta)} OK\n") assert init_model_ckpt_data.exists() - with log.open("a") as f: - f.write(f"init_model_ckpt_data {str(init_model_ckpt_data)} OK\n") assert init_model_ckpt_index.exists() - with log.open("a") as f: - f.write(f"init_model_ckpt_index {str(init_model_ckpt_index)} OK\n") + with log.open("w") as f: + f.write(f"init_model {str(init_model)} OK\n") for ii in jtmp["data"]: assert Path(ii).exists() assert (ii in init_data_str) or (ii in iter_data_str) @@ -508,38 +499,42 @@ def execute( with log.open("a") as f: f.write(f"script {str(script)} OK\n") - cnn_dir.mkdir(exist_ok=True, parents=True) with cnn_model.open("w") as f: f.write("read from init model: \n") - f.write(init_model.read_text() + "\n") + f.write(init_frz_model.read_text() + "\n") with model_ckpt_meta_file.open("w") as f: - f.write("read from init model ckpt: \n") + f.write("read from init 
model: \n") f.write(init_model_ckpt_meta.read_text() + "\n") with model_ckpt_data_file.open("w") as f: - f.write("read from init model ckpt: \n") + f.write("read from init model: \n") f.write(init_model_ckpt_data.read_text() + "\n") with model_ckpt_index_file.open("w") as f: - f.write("read from init model ckpt: \n") + f.write("read from init model: \n") f.write(init_model_ckpt_index.read_text() + "\n") + qnn_dir.mkdir(exist_ok=True, parents=True) with qnn_model.open("w") as f: f.write("read from init model: \n") - f.write(init_model.read_text() + "\n") + f.write(init_frz_model.read_text() + "\n") with lcurve.open("w") as f: f.write("read from train_script: \n") f.write(script.read_text() + "\n") + model_files = "nvnmd_models" + os.makedirs(model_files, exist_ok=True) + shutil.copy(cnn_model, "nvnmd_models") + shutil.copy(qnn_model, "nvnmd_models") + shutil.copy(model_ckpt_meta_file, "nvnmd_models") + shutil.copy(model_ckpt_data_file, "nvnmd_models") + shutil.copy(model_ckpt_index_file, "nvnmd_models") + os.chdir(cwd) return OPIO( { "script": work_dir / oscript, - "cnn_model": work_dir / cnn_model, - "qnn_model": work_dir / qnn_model, - "model_ckpt_data": work_dir / model_ckpt_data_file, - "model_ckpt_meta": work_dir / model_ckpt_meta_file, - "model_ckpt_index": work_dir / model_ckpt_index_file, + "model": work_dir / model_files, "lcurve": work_dir / lcurve, "log": work_dir / log, } @@ -573,12 +568,6 @@ def execute( script = ip["task_path"] / "input.json" if ip["init_model"] is not None: raise FatalError("init model is not None") - if ip["init_model_ckpt_meta"] is not None: - raise FatalError("init model ckpt meta is not None") - if ip["init_model_ckpt_data"] is not None: - raise FatalError("init model ckpt data is not None") - if ip["init_model_ckpt_index"] is not None: - raise FatalError("init model ckpt index is not None") init_data = ip["init_data"] iter_data = ip["iter_data"] @@ -615,20 +604,22 @@ def execute( work_dir.mkdir(exist_ok=True, parents=True) 
os.chdir(work_dir) - oscript = Path("input.json") - if not oscript.exists(): - from shutil import ( - copyfile, - ) + for script_str in ["input.json", "input_cnn.json", "input_qnn.json"]: + oscript = Path(script_str) + if not oscript.exists(): + from shutil import ( + copyfile, + ) - copyfile(script, oscript) - + copyfile(script, oscript) + + oscript = Path("input.json") cnn_dir = Path("nvnmd_cnn") qnn_dir = Path("nvnmd_qnn") cnn_model = cnn_dir / Path("frozen_model.pb") qnn_model = qnn_dir / Path("model.pb") - model_ckpt_meta_file = cnn_dir / Path("model.ckpt.meta") - model_ckpt_data_file = cnn_dir / Path("model.ckpt.data-00000-of-00001") + model_ckpt_meta_file = cnn_dir / Path("model.ckpt.meta") + model_ckpt_data_file = cnn_dir / Path("model.ckpt.data-00000-of-00001") model_ckpt_index_file = cnn_dir / Path("model.ckpt.index") lcurve = cnn_dir / Path("lcurve.out") log = Path("log") @@ -651,6 +642,7 @@ def execute( f.write("read from init model ckpt: \n") with model_ckpt_index_file.open("w") as f: f.write("read from init model ckpt: \n") + qnn_dir.mkdir(exist_ok=True, parents=True) with qnn_model.open("w") as f: f.write("read from init model: \n") @@ -658,16 +650,20 @@ def execute( f.write("read from train_script: \n") f.write(script.read_text() + "\n") + model_files = "nvnmd_models" + os.makedirs(model_files, exist_ok=True) + shutil.copy(cnn_model, "nvnmd_models") + shutil.copy(qnn_model, "nvnmd_models") + shutil.copy(model_ckpt_meta_file, "nvnmd_models") + shutil.copy(model_ckpt_data_file, "nvnmd_models") + shutil.copy(model_ckpt_index_file, "nvnmd_models") + os.chdir(cwd) return OPIO( { "script": work_dir / oscript, - "cnn_model": work_dir / cnn_model, - "qnn_model": work_dir / qnn_model, - "model_ckpt_data": work_dir / model_ckpt_meta_file, - "model_ckpt_meta": work_dir / model_ckpt_meta_file, - "model_ckpt_index": work_dir / model_ckpt_meta_file, + "model": work_dir / model_files, "lcurve": work_dir / lcurve, "log": work_dir / log, } diff --git 
a/tests/op/test_prep_dp_train.py b/tests/op/test_prep_dp_train.py index a380e221..5ac15f41 100644 --- a/tests/op/test_prep_dp_train.py +++ b/tests/op/test_prep_dp_train.py @@ -56,6 +56,26 @@ }, } +template_script_nvnmd_v0 = { + "nvnmd": {"version": 0, "seed": 1}, + "training": { + "systems": [], + "stop_batch": 2000, + "batch_size": "auto", + "seed": 1, + }, +} + +template_script_nvnmd_v1 = { + "nvnmd": {"version": 1, "seed": 1}, + "training": { + "systems": [], + "stop_batch": 2000, + "batch_size": "auto", + "seed": 1, + }, +} + class faked_rg: faked_random = -1 @@ -161,6 +181,48 @@ def test_template_list_hyb_sea(self): self.assertEqual(jdata["model"]["fitting_net"]["seed"], 4 * ii + 1) self.assertEqual(jdata["training"]["seed"], 4 * ii + 2) + def test_template_nvnmd_v1(self): + ip = OPIO( + { + "template_script": template_script_nvnmd_v1, + "numb_models": self.numb_models, + } + ) + + faked_rg.faked_random = -1 + with mock.patch("random.randrange", faked_rg.randrange): + op = self.ptrain.execute(ip) + + self._check_output_dir_and_file_exist(op, self.numb_models) + + for ii in range(self.numb_models): + with open(Path(train_task_pattern % ii) / train_script_name) as fp: + jdata = json.load(fp) + self.assertEqual(jdata["nvnmd"]["version"], 1) + self.assertEqual(jdata["nvnmd"]["seed"], 2 * ii + 0) + self.assertEqual(jdata["training"]["seed"], 2 * ii + 1) + + def test_template_nvnmd_v0(self): + ip = OPIO( + { + "template_script": template_script_nvnmd_v0, + "numb_models": self.numb_models, + } + ) + + faked_rg.faked_random = -1 + with mock.patch("random.randrange", faked_rg.randrange): + op = self.ptrain.execute(ip) + + self._check_output_dir_and_file_exist(op, self.numb_models) + + for ii in range(self.numb_models): + with open(Path(train_task_pattern % ii) / train_script_name) as fp: + jdata = json.load(fp) + self.assertEqual(jdata["nvnmd"]["version"], 0) + self.assertEqual(jdata["nvnmd"]["seed"], 2 * ii + 0) + self.assertEqual(jdata["training"]["seed"], 2 * ii + 
1) + def test_template_raise_wrong_list_length(self): ip = OPIO( { @@ -168,6 +230,8 @@ def test_template_raise_wrong_list_length(self): template_script_hybrid, template_script_hybrid, template_script_se_e2_a, + template_script_nvnmd_v1, + template_script_nvnmd_v0 ], "numb_models": self.numb_models, } diff --git a/tests/op/test_prep_nvnmd_train.py b/tests/op/test_prep_nvnmd_train.py deleted file mode 100644 index 0d5fe698..00000000 --- a/tests/op/test_prep_nvnmd_train.py +++ /dev/null @@ -1,173 +0,0 @@ -import json -import shutil -import unittest -from pathlib import ( - Path, -) - -import numpy as np -from dflow.python import ( - OP, - OPIO, - Artifact, - OPIOSign, -) -from mock import ( - mock, -) - -# isort: off -from .context import ( - dpgen2, -) -from dpgen2.constants import ( - train_script_name, - train_task_pattern, -) -from dpgen2.op.prep_nvnmd_train import ( - PrepNvNMDTrain, -) - -# isort: on - -template_script_nvnmd_v0 = { - "nvnmd": { - "version": 0, - "seed": 1 - }, - "training": { - "systems": [], - "stop_batch": 2000, - "batch_size": "auto", - "seed": 1, - }, -} - - -template_script_nvnmd_v1 = { - "nvnmd": { - "version": 1, - "seed": 1 - }, - "training": { - "systems": [], - "stop_batch": 2000, - "batch_size": "auto", - "seed": 1, - }, -} - - -class faked_rg: - faked_random = -1 - - @classmethod - def randrange(cls, xx): - cls.faked_random += 1 - return cls.faked_random - - -class TestPrepNvNMDTrain(unittest.TestCase): - def setUp(self): - self.numb_models = 2 - self.ptrain = PrepNvNMDTrain() - - def tearDown(self): - for ii in range(self.numb_models): - if Path(train_task_pattern % ii).exists(): - shutil.rmtree(train_task_pattern % ii) - - def _check_output_dir_and_file_exist(self, op, numb_models): - task_names = op["task_names"] - task_paths = op["task_paths"] - for ii in range(self.numb_models): - self.assertEqual(train_task_pattern % ii, task_names[ii]) - self.assertEqual(Path(train_task_pattern % ii), task_paths[ii]) - 
self.assertTrue(task_paths[ii].is_dir()) - self.assertTrue((task_paths[ii] / train_script_name).is_file()) - - def test_template_nvnmd_v1(self): - ip = OPIO( - {"template_script": template_script_nvnmd_v1, "numb_models": self.numb_models} - ) - - faked_rg.faked_random = -1 - with mock.patch("random.randrange", faked_rg.randrange): - op = self.ptrain.execute(ip) - - self._check_output_dir_and_file_exist(op, self.numb_models) - - for ii in range(self.numb_models): - with open(Path(train_task_pattern % ii) / train_script_name) as fp: - jdata = json.load(fp) - self.assertEqual(jdata["nvnmd"]["version"], 1) - self.assertEqual(jdata["nvnmd"]["seed"], 2 * ii + 0) - self.assertEqual(jdata["training"]["seed"], 2 * ii + 1) - - def test_template_nvnmd_v0(self): - ip = OPIO( - { - "template_script": template_script_nvnmd_v0, - "numb_models": self.numb_models, - } - ) - - faked_rg.faked_random = -1 - with mock.patch("random.randrange", faked_rg.randrange): - op = self.ptrain.execute(ip) - - self._check_output_dir_and_file_exist(op, self.numb_models) - - for ii in range(self.numb_models): - with open(Path(train_task_pattern % ii) / train_script_name) as fp: - jdata = json.load(fp) - self.assertEqual(jdata["nvnmd"]["version"], 0) - self.assertEqual(jdata["nvnmd"]["seed"], 2 * ii + 0) - self.assertEqual(jdata["training"]["seed"], 2 * ii + 1) - - def test_template_list_nvnmd_v0_v1(self): - ip = OPIO( - { - "template_script": [template_script_nvnmd_v0, template_script_nvnmd_v1], - "numb_models": self.numb_models, - } - ) - - faked_rg.faked_random = -1 - with mock.patch("random.randrange", faked_rg.randrange): - op = self.ptrain.execute(ip) - - self._check_output_dir_and_file_exist(op, self.numb_models) - - ii = 0 - with open(Path(train_task_pattern % ii) / train_script_name) as fp: - jdata = json.load(fp) - self.assertEqual(jdata["nvnmd"]["version"], 0) - self.assertEqual(jdata["nvnmd"]["seed"], 2 * ii) - self.assertEqual(jdata["training"]["seed"], 2 * ii + 1) - ii = 1 - with 
open(Path(train_task_pattern % ii) / train_script_name) as fp: - jdata = json.load(fp) - self.assertEqual(jdata["nvnmd"]["version"], 1) - self.assertEqual(jdata["nvnmd"]["seed"], 2 * ii) - self.assertEqual(jdata["training"]["seed"], 2 * ii + 1) - - def test_template_raise_wrong_list_length(self): - ip = OPIO( - { - "template_script": [ - template_script_nvnmd_v1, - template_script_nvnmd_v0, - template_script_nvnmd_v1 - ], - "numb_models": self.numb_models, - } - ) - - with self.assertRaises(RuntimeError) as context: - faked_rg.faked_random = -1 - with mock.patch("random.randrange", faked_rg.randrange): - op = self.ptrain.execute(ip) - self.assertTrue( - "length of the template list should be equal to 2" in str(context.exception) - ) diff --git a/tests/op/test_run_dp_train.py b/tests/op/test_run_dp_train.py index 384e7a3b..ebe793a1 100644 --- a/tests/op/test_run_dp_train.py +++ b/tests/op/test_run_dp_train.py @@ -129,6 +129,7 @@ def setUp(self): "auto_prob": "prob_sys_size", }, "disp_file": "lcurve.out", + "save_ckpt": "model.ckpt", }, "learning_rate": { "start_lr": 1.0, @@ -155,6 +156,7 @@ def setUp(self): "auto_prob": "prob_sys_size; 0:4:0.9; 4:7:0.1", }, "disp_file": "lcurve.out", + "save_ckpt": "model.ckpt", "numb_steps": 400000, }, "learning_rate": { @@ -194,6 +196,7 @@ def setUp(self): "batch_size": "auto", "auto_prob_style": "prob_sys_size", "disp_file": "lcurve.out", + "save_ckpt": "model.ckpt" }, "learning_rate": { "start_lr": 1.0, @@ -218,6 +221,7 @@ def setUp(self): "batch_size": "auto", "auto_prob_style": "prob_sys_size; 0:4:0.9; 4:7:0.1", "disp_file": "lcurve.out", + "save_ckpt": "model.ckpt", "stop_batch": 400000, }, "learning_rate": { @@ -808,6 +812,7 @@ def setUp(self): "auto_prob": "prob_sys_size", }, "disp_file": "lcurve.out", + "save_ckpt": "model.ckpt", }, "learning_rate": { "start_lr": 1.0, diff --git a/tests/op/test_run_nvnmd.py b/tests/op/test_run_nvnmd.py index fedc164b..2340d2bb 100644 --- a/tests/op/test_run_nvnmd.py +++ 
b/tests/op/test_run_nvnmd.py @@ -33,11 +33,13 @@ lmp_traj_name, model_name_pattern, ) +from dpgen2.op.run_lmp import ( + get_ele_temp, + set_models +) from dpgen2.op.run_nvnmd import ( RunNvNMD, - get_ele_temp, merge_pimd_files, - set_models, ) from dpgen2.utils import ( BinaryFileInput, @@ -55,9 +57,11 @@ def setUp(self): (self.task_path / lmp_conf_name).write_text("foo") (self.task_path / lmp_input_name).write_text("bar") self.task_name = "task_000" - self.models = [self.model_path / Path(f"model_{ii}.pb") for ii in range(4)] + self.models = [self.model_path / Path(f"model_{ii}") for ii in range(4)] for idx, ii in enumerate(self.models): - ii.write_text(f"model{idx}") + ii.mkdir(parents=True, exist_ok=True) + model_file = ii / Path("model.pb") + model_file.write_text(f"model{idx}") def tearDown(self): if Path("task").is_dir(): @@ -69,7 +73,7 @@ def tearDown(self): @patch("dpgen2.op.run_nvnmd.run_command") def test_success(self, mocked_run): - mocked_run.side_effect = [(0, "foo\n", "")] + mocked_run.side_effect = [(0, "foo\n", "")] * 4 op = RunNvNMD() out = op.execute( OPIO( @@ -84,28 +88,27 @@ def test_success(self, mocked_run): work_dir = Path(self.task_name) # check output self.assertEqual(out["log"], work_dir / lmp_log_name) - self.assertEqual(out["traj"], work_dir / lmp_traj_name) + self.assertEqual(out["traj"], work_dir / ("0_%s"%lmp_traj_name)) self.assertEqual(out["model_devi"], work_dir / lmp_model_devi_name) # check call - models = ["models/path/model_%d.pb"%i for i in range(len(self.models))] + models = ["models/path/model_%d.pb" % i for i in range(len(self.models))] calls = [ call( - " ; ".join( + " ".join( [ - " ".join( - [ - "cp", model_name, "model.pb", "&&", - "mylmp", "-i", lmp_input_name, - "-log", lmp_log_name, - "-v", "rerun", "%d"%i, "&&", - "cp", lmp_traj_name, lmp_traj_name+".%d"%i - ] - ) - for i, model_name in enumerate(models) + "mylmp", + "-i", + "%d_%s" % (ii, lmp_input_name), + "-log", + "%d_%s" % (ii, lmp_log_name), + "-v", + "rerun", 
+ "%d" % ii ] ), shell=True, - ), + ) + for ii in range(len(models)) ] mocked_run.assert_has_calls(calls) # check input files are correctly linked @@ -132,131 +135,28 @@ def test_error(self, mocked_run): ) ) # check call - models = ["models/path/model_%d.pb"%i for i in range(len(self.models))] + models = ["models/path/model_%d.pb" % i for i in range(len(self.models))] calls = [ call( - " ; ".join( + " ".join( [ - " ".join( - [ - "cp", model_name, "model.pb", "&&", - "mylmp", "-i", lmp_input_name, - "-log", lmp_log_name, - "-v", "rerun", "%d"%i, "&&", - "cp", lmp_traj_name, lmp_traj_name+".%d"%i - ] - ) - for i, model_name in enumerate(models) + "mylmp", + "-i", + "%d_%s" % (ii, lmp_input_name), + "-log", + "%d_%s" % (ii, lmp_log_name), + "-v", + "rerun", + "%d" % ii ] ), shell=True, - ), + ) + for ii in range(1) ] mocked_run.assert_has_calls(calls) -class TestRunNvNMDDist(unittest.TestCase): - lmp_config = """variable NSTEPS equal 1000 - -units metal -boundary p p p -atom_style atomic - -neighbor 1.0 bin - -box tilt large -if "${restart} > 0" then "read_restart dpgen.restart.*" else "read_data conf.lmp" - -group target_element_1 type 4 -#set group other_element type/subset ${ELEMENT_TYPE_4} ${ELEMENT_NUMB_4} ${OUTER_RANDOM_SEED_4} - -change_box all triclinic -mass 6 26.980000 -pair_style deepmd model.000.pb out_freq 10 out_file model_devi.out -pair_coeff * * - -thermo_style custom step temp pe ke etotal press vol lx ly lz xy xz yz -thermo ${THERMO_FREQ} -#dump 1 all custom ${DUMP_FREQ} traj/*.lammpstrj id type x y z fx fy fz - -if "${restart} == 0" then "velocity all create 2754.34 709383" -fix 1 all npt temp 2754.34 2754.34 ${TAU_T} iso 1.0 1.0 ${TAU_P} -timestep 0.002000 -run 3000 upto -""" - - def setUp(self): - self.task_path = Path("task/path") - self.task_path.mkdir(parents=True, exist_ok=True) - self.model_path = Path("models/path") - self.model_path.mkdir(parents=True, exist_ok=True) - self.teacher_path = Path("models/teacher") - 
self.teacher_path.mkdir(parents=True, exist_ok=True) - - (self.task_path / lmp_conf_name).write_text("foo") - (self.task_path / lmp_input_name).write_text(TestRunNvNMDDist.lmp_config) - - self.task_name = "task_000" - self.models = [self.model_path / Path(f"model_{ii}.pb") for ii in range(1)] - for idx, ii in enumerate(self.models): - ii.write_text(f"model{idx}") - - (self.teacher_path / "teacher.pb").write_text("teacher model") - self.teacher_model = BinaryFileInput(self.teacher_path / "teacher.pb", "pb") - - self.maxDiff = None - - def tearDown(self): - if Path("task").is_dir(): - shutil.rmtree("task") - if Path("models").is_dir(): - shutil.rmtree("models") - if Path(self.task_name).is_dir(): - shutil.rmtree(self.task_name) - - @patch("dpgen2.op.run_nvnmd.run_command") - def test_success(self, mocked_run): - mocked_run.side_effect = [(0, "foo\n", "")] - op = RunNvNMD() - out = op.execute( - OPIO( - { - "config": { - "command": "mylmp", - "teacher_model_path": self.teacher_model, - }, - "task_name": self.task_name, - "task_path": self.task_path, - "models": self.models, - } - ) - ) - work_dir = Path(self.task_name) - - # check input files are correctly linked - self.assertEqual((work_dir / lmp_conf_name).read_text(), "foo") - - lmp_config = TestRunNvNMDDist.lmp_config.replace( - "pair_style deepmd model.000.pb", - "pair_style deepmd model.000.pb model.001.pb", - ) - self.assertEqual((work_dir / lmp_input_name).read_text(), lmp_config) - - # check if the teacher model is linked to model.000.pb - ii = 0 - self.assertEqual( - (work_dir / (model_name_pattern % ii)).read_text(), f"teacher model" - ) - - ii = 1 - self.assertEqual( - (work_dir / (model_name_pattern % ii)).read_text(), f"model{ii - 1}" - ) - - # The number of models have to be 2 in knowledge distillation - self.assertEqual(len(list((work_dir.glob("*.pb")))), 2) - - def swap_element(arg): bk = arg.copy() arg[1] = bk[0] @@ -266,21 +166,21 @@ def swap_element(arg): class TestSetModels(unittest.TestCase): def 
setUp(self): self.input_name = Path("lmp.input") - self.model_names = ["model.000.pth", "model.001.pb"] + self.model_names = ["model.000.pb", "model.001.pb"] def tearDown(self): os.remove(self.input_name) def test(self): - lmp_config = "pair_style deepmd model.000.pb model.001.pb out_freq 10 out_file model_devi.out\n" - expected_output = "pair_style deepmd model.000.pth model.001.pb out_freq 10 out_file model_devi.out\n" + lmp_config = "pair_style nvnmd model.000.pb\n" + expected_output = "pair_style nvnmd model.000.pb\n" input_name = self.input_name input_name.write_text(lmp_config) set_models(input_name, self.model_names) self.assertEqual(input_name.read_text(), expected_output) def test_failed(self): - lmp_config = "pair_style deepmd model.000.pb model.001.pb out_freq 10 out_file model_devi.out model.002.pb\n" + lmp_config = "pair_style deepmd model.000.pb\n" input_name = self.input_name input_name = Path("lmp.input") input_name.write_text(lmp_config) @@ -288,92 +188,9 @@ def test_failed(self): set_models(input_name, self.model_names) def test_failed_no_matching(self): - lmp_config = "pair_style deepmd out_freq 10 out_file model_devi.out\n" + lmp_config = "pair_style deepmd\n" input_name = self.input_name input_name = Path("lmp.input") input_name.write_text(lmp_config) with self.assertRaises(RuntimeError) as re: - set_models(input_name, self.model_names) - - -class TestGetEleTemp(unittest.TestCase): - def test_get_ele_temp_none(self): - with open("log", "w") as f: - f.write( - "pair_style deepmd model.000.pb model.001.pb model.002.pb model.003.pb model.004.pb out_freq 10 out_file model_devi.out" - ) - ele_temp = get_ele_temp("log") - self.assertIsNone(ele_temp) - - def test_get_ele_temp(self): - with open("log", "w") as f: - f.write( - "pair_style deepmd model.000.pb model.001.pb model.002.pb model.003.pb model.004.pb out_freq 10 out_file model_devi.out fparam 6.6" - ) - ele_temp = get_ele_temp("log") - self.assertEqual(ele_temp, 6.6) - - def tearDown(self): - 
if os.path.exists("log"): - os.remove("log") - - -class TestMergePIMDFiles(unittest.TestCase): - def test_merge_pimd_files(self): - for i in range(1, 3): - with open("traj.%s.dump" % i, "w") as f: - f.write( - """ITEM: TIMESTEP -0 -ITEM: NUMBER OF ATOMS -3 -ITEM: BOX BOUNDS xy xz yz pp pp pp -0.0000000000000000e+00 1.2444661140399999e+01 0.0000000000000000e+00 -0.0000000000000000e+00 1.2444661140399999e+01 0.0000000000000000e+00 -0.0000000000000000e+00 1.2444661140399999e+01 0.0000000000000000e+00 -ITEM: ATOMS id type x y z -1 8 7.23489 0.826309 4.61669 -2 1 8.04419 0.520382 5.14395 -3 1 6.48126 0.446895 4.99766 -ITEM: TIMESTEP -10 -ITEM: NUMBER OF ATOMS -3 -ITEM: BOX BOUNDS xy xz yz pp pp pp -0.0000000000000000e+00 1.2444661140399999e+01 0.0000000000000000e+00 -0.0000000000000000e+00 1.2444661140399999e+01 0.0000000000000000e+00 -0.0000000000000000e+00 1.2444661140399999e+01 0.0000000000000000e+00 -ITEM: ATOMS id type x y z -1 8 7.23103 0.814939 4.59892 -2 1 7.96453 0.61699 5.19158 -3 1 6.43661 0.370311 5.09854 -""" - ) - for i in range(1, 3): - with open("model_devi.%s.out" % i, "w") as f: - f.write( - """# step max_devi_v min_devi_v avg_devi_v max_devi_f min_devi_f avg_devi_f - 0 9.023897e-17 3.548771e-17 5.237314e-17 8.196123e-16 1.225653e-16 3.941002e-16 - 10 1.081667e-16 4.141596e-17 7.534462e-17 9.070597e-16 1.067947e-16 4.153524e-16 -""" - ) - - merge_pimd_files() - self.assertTrue(os.path.exists(lmp_traj_name)) - self.assertTrue(os.path.exists(lmp_model_devi_name)) - s = dpdata.System(lmp_traj_name, fmt="lammps/dump") - assert len(s) == 4 - model_devi = np.loadtxt(lmp_model_devi_name) - assert model_devi.shape[0] == 4 - - def tearDown(self): - for f in [ - lmp_traj_name, - "traj.1.dump", - "traj.2.dump", - lmp_model_devi_name, - "model_devi.1.out", - "model_devi.2.out", - ]: - if os.path.exists(f): - os.remove(f) + set_models(input_name, self.model_names) \ No newline at end of file diff --git a/tests/op/test_run_nvnmd_train.py 
b/tests/op/test_run_nvnmd_train.py index 2e1495d8..7a57cad8 100644 --- a/tests/op/test_run_nvnmd_train.py +++ b/tests/op/test_run_nvnmd_train.py @@ -35,9 +35,12 @@ train_qnn_script_name, train_task_pattern, ) +from dpgen2.op.run_dp_train import ( + RunDPTrain, + _get_data_size_of_all_mult_sys, +) from dpgen2.op.run_nvnmd_train import ( RunNvNMDTrain, - _get_data_size_of_all_mult_sys, _make_train_command, ) @@ -78,10 +81,11 @@ def setUp(self): self.init_data = [Path("init/data-0"), Path("init/data-1")] self.init_data = sorted(list(self.init_data)) - self.init_model = Path("bar.pb") - self.init_model_ckpt_meta = Path("model.ckpt.meta") - self.init_model_ckpt_data = Path("model.ckpt.data") - self.init_model_ckpt_index = Path("model.ckpt.index") + #self.init_model = Path("bar.pb") + #self.init_model_ckpt_meta = Path("model.ckpt.meta") + #self.init_model_ckpt_data = Path("model.ckpt.data") + #self.init_model_ckpt_index = Path("model.ckpt.index") + self.init_model = Path("nvnmd_models") self.config = { "init_model_policy": "no", @@ -92,7 +96,7 @@ def setUp(self): "init_model_start_pref_f": 100, "init_model_start_pref_v": 0.0, } - self.config = RunNvNMDTrain.normalize_config(self.config) + self.config = RunDPTrain.normalize_config(self.config) self.old_data_size = ( self.init_nframs_0 + self.init_nframs_1 + sum(self.nframes_0) @@ -134,7 +138,7 @@ def setUp(self): "auto_prob": "prob_sys_size", }, "disp_file": "lcurve.out", - "save_ckpt": "model.ckpt" + "save_ckpt": "model.ckpt", }, "learning_rate": { "start_lr": 1.0, @@ -173,6 +177,34 @@ def setUp(self): "start_pref_v": 0.0, }, } + self.expected_qnn_model_odict_v2 = { + "training": { + "training_data": { + "systems": [ + "init/data-0", + "init/data-1", + "data-0/foo3", + "data-0/foo4", + "data-1/foo2", + "data-1/foo3", + "data-1/foo5", + ], + "batch_size": "auto", + "auto_prob": "prob_sys_size; 0:4:0.9; 4:7:0.1", + }, + "disp_file": "lcurve.out", + "save_ckpt": "model.ckpt", + "numb_steps": 0, + }, + "learning_rate": { + 
"start_lr": 1e-4, + }, + "loss": { + "start_pref_e": 0.1, + "start_pref_f": 100, + "start_pref_v": 0.0, + }, + } self.idict_v1 = { "training": { @@ -201,7 +233,7 @@ def setUp(self): "batch_size": "auto", "auto_prob_style": "prob_sys_size", "disp_file": "lcurve.out", - "save_ckpt": "model.ckpt" + "save_ckpt": "model.ckpt", }, "learning_rate": { "start_lr": 1.0, @@ -238,6 +270,32 @@ def setUp(self): "start_pref_v": 0.0, }, } + self.expected_qnn_model_odict_v1 = { + "training": { + "systems": [ + "init/data-0", + "init/data-1", + "data-0/foo3", + "data-0/foo4", + "data-1/foo2", + "data-1/foo3", + "data-1/foo5", + ], + "batch_size": "auto", + "auto_prob_style": "prob_sys_size; 0:4:0.9; 4:7:0.1", + "disp_file": "lcurve.out", + "save_ckpt": "model.ckpt", + "stop_batch": 0, + }, + "learning_rate": { + "start_lr": 1e-4, + }, + "loss": { + "start_pref_e": 0.1, + "start_pref_f": 100, + "start_pref_v": 0.0, + }, + } def tearDown(self): for ii in [ @@ -262,68 +320,9 @@ def test_normalize_config(self): self.assertAlmostEqual(config["init_model_start_pref_f"], 100) self.assertAlmostEqual(config["init_model_start_pref_v"], 0.0) - def test_get_size_of_all_mult_sys(self): - cc = _get_data_size_of_all_mult_sys(self.iter_data) - self.assertEqual(cc, sum(self.nframes_0) + sum(self.nframes_1)) - cc = _get_data_size_of_all_mult_sys(self.mixed_iter_data, mixed_type=True) - self.assertEqual(cc, sum(self.nframes_0) + sum(self.nframes_1)) - # read the mixed type systems as if they were standard system, - # should give the correct estimate of the data size - cc = _get_data_size_of_all_mult_sys(self.mixed_iter_data, mixed_type=False) - self.assertEqual(cc, sum(self.nframes_0) + sum(self.nframes_1)) - - def test_decide_init_model_no_model(self): - do_init_model = RunNvNMDTrain.decide_init_model( - self.config, None, self.init_data, self.iter_data - ) - self.assertFalse(do_init_model) - - def test_decide_init_model_none_iter_data(self): - do_init_model = RunNvNMDTrain.decide_init_model( - 
self.config, self.init_model, self.init_data, None - ) - self.assertFalse(do_init_model) - - def test_decide_init_model_no_iter_data(self): - do_init_model = RunNvNMDTrain.decide_init_model( - self.config, self.init_model, self.init_data, [] - ) - self.assertFalse(do_init_model) - - def test_decide_init_model_config_no(self): - config = self.config.copy() - config["init_model_policy"] = "no" - do_init_model = RunNvNMDTrain.decide_init_model( - config, self.init_model, self.init_data, self.iter_data - ) - self.assertFalse(do_init_model) - - def test_decide_init_model_config_yes(self): - config = self.config.copy() - config["init_model_policy"] = "yes" - do_init_model = RunNvNMDTrain.decide_init_model( - config, self.init_model, self.init_data, self.iter_data - ) - self.assertTrue(do_init_model) - - def test_decide_init_model_config_larger_than_no(self): - config = self.config.copy() - config["init_model_policy"] = f"old_data_larger_than:{self.old_data_size}" - do_init_model = RunNvNMDTrain.decide_init_model( - config, self.init_model, self.init_data, self.iter_data - ) - self.assertFalse(do_init_model) - - def test_decide_init_model_config_larger_than_yes(self): - config = self.config.copy() - config["init_model_policy"] = f"old_data_larger_than:{self.old_data_size-1}" - do_init_model = RunNvNMDTrain.decide_init_model( - config, self.init_model, self.init_data, self.iter_data - ) - self.assertTrue(do_init_model) def test_update_input_dict_v1_init_model(self): - odict = RunNvNMDTrain.write_data_to_input_script( + odict = RunDPTrain.write_data_to_input_script( self.idict_v1, self.config, self.init_data, @@ -333,13 +332,17 @@ def test_update_input_dict_v1_init_model(self): ) config = self.config.copy() config["init_model_policy"] = "yes" - odict = RunNvNMDTrain.write_other_to_input_script( - odict, config, True, False, major_version="1" + odict = RunDPTrain.write_other_to_input_script( + odict, config, True, major_version="1", do_quantized=False ) 
self.assertDictEqual(odict, self.expected_init_model_odict_v1) + odict = RunDPTrain.write_other_to_input_script( + odict, config, True, major_version="1", do_quantized=True + ) + self.assertDictEqual(odict, self.expected_qnn_model_odict_v1) def test_update_input_dict_v1(self): - odict = RunNvNMDTrain.write_data_to_input_script( + odict = RunDPTrain.write_data_to_input_script( self.idict_v1, self.config, self.init_data, @@ -349,14 +352,14 @@ def test_update_input_dict_v1(self): ) config = self.config.copy() config["init_model_policy"] = "no" - odict = RunNvNMDTrain.write_other_to_input_script( - odict, config, False, False, major_version="1" + odict = RunDPTrain.write_other_to_input_script( + odict, config, False, major_version="1", do_quantized=False ) self.assertDictEqual(odict, self.expected_odict_v1) def test_update_input_dict_v2_init_model(self): idict = self.idict_v2 - odict = RunNvNMDTrain.write_data_to_input_script( + odict = RunDPTrain.write_data_to_input_script( idict, self.config, self.init_data, @@ -366,14 +369,18 @@ def test_update_input_dict_v2_init_model(self): ) config = self.config.copy() config["init_model_policy"] = "yes" - odict = RunNvNMDTrain.write_other_to_input_script( - odict, config, True, False, major_version="2" + odict = RunDPTrain.write_other_to_input_script( + odict, config, True, major_version="2", do_quantized=False ) self.assertDictEqual(odict, self.expected_init_model_odict_v2) + odict = RunDPTrain.write_other_to_input_script( + odict, config, True, major_version="2", do_quantized=True + ) + self.assertDictEqual(odict, self.expected_qnn_model_odict_v2) def test_update_input_dict_v2(self): idict = self.idict_v2 - odict = RunNvNMDTrain.write_data_to_input_script( + odict = RunDPTrain.write_data_to_input_script( idict, self.config, self.init_data, @@ -383,8 +390,8 @@ def test_update_input_dict_v2(self): ) config = self.config.copy() config["init_model_policy"] = "no" - odict = RunNvNMDTrain.write_other_to_input_script( - odict, 
config, False, False,major_version="2" + odict = RunDPTrain.write_other_to_input_script( + odict, config, False, major_version="2", do_quantized=False ) self.assertDictEqual(odict, self.expected_odict_v2) @@ -410,26 +417,47 @@ def test_exec_v1(self, mocked_run): "task_name": task_name, "task_path": Path(task_path), "init_model": Path(self.init_model), - "init_model_ckpt_meta": Path(self.init_model_ckpt_meta), - "init_model_ckpt_data": Path(self.init_model_ckpt_data), - "init_model_ckpt_index": Path(self.init_model_ckpt_index), "init_data": [Path(ii) for ii in self.init_data], "iter_data": [Path(ii) for ii in self.iter_data], } ) ) - self.assertEqual(out["script"], work_dir / train_cnn_script_name) - self.assertEqual(out["cnn_model"], work_dir / "nvnmd_cnn/frozen_model.pb") - self.assertEqual(out["qnn_model"], work_dir / "nvnmd_qnn/model.pb") - self.assertEqual(out["model_ckpt_data"], work_dir / "nvnmd_cnn/model.ckpt.data-00000-of-00001") - self.assertEqual(out["model_ckpt_meta"], work_dir / "nvnmd_cnn/model.ckpt.meta") - self.assertEqual(out["model_ckpt_index"], work_dir / "nvnmd_cnn/model.ckpt.index") - self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") - self.assertEqual(out["log"], work_dir / "train.log") + self.assertEqual( + out["script"], + work_dir / train_script_name + ) + self.assertEqual( + out["model"] / "frozen_model.pb", + work_dir / "nvnmd_models/frozen_model.pb", + ) + self.assertEqual( + out["model"] / "model.pb", + work_dir / "nvnmd_models/model.pb", + ) + self.assertEqual( + out["model"] / "model.ckpt.data-00000-of-00001", + work_dir / "nvnmd_models/model.ckpt.data-00000-of-00001", + ) + self.assertEqual( + out["model"] / "model.ckpt.meta", + work_dir / "nvnmd_models/model.ckpt.meta", + ) + self.assertEqual( + out["model"] / "model.ckpt.index", + work_dir / "nvnmd_models/model.ckpt.index", + ) + self.assertEqual( + out["lcurve"], + work_dir / "nvnmd_cnn/lcurve.out", + ) + self.assertEqual( + out["log"], + work_dir / "train.log", + 
) calls = [ call(["dp", "train-nvnmd", train_cnn_script_name, "-s", "s1"]), - call(["dp", "train-nvnmd", train_qnn_script_name, "-s", "s2"]) + call(["dp", "train-nvnmd", train_qnn_script_name, "-s", "s2"]), ] mocked_run.assert_has_calls(calls) @@ -470,26 +498,41 @@ def test_exec_v2(self, mocked_run): "task_name": task_name, "task_path": Path(task_path), "init_model": Path(self.init_model), - "init_model_ckpt_meta": Path(self.init_model_ckpt_meta), - "init_model_ckpt_data": Path(self.init_model_ckpt_data), - "init_model_ckpt_index": Path(self.init_model_ckpt_index), "init_data": [Path(ii) for ii in self.init_data], "iter_data": [Path(ii) for ii in self.iter_data], } ) ) - self.assertEqual(out["script"], work_dir / train_cnn_script_name) - self.assertEqual(out["cnn_model"], work_dir / "nvnmd_cnn/frozen_model.pb") - self.assertEqual(out["qnn_model"], work_dir / "nvnmd_qnn/model.pb") - self.assertEqual(out["model_ckpt_data"], work_dir / "nvnmd_cnn/model.ckpt.data-00000-of-00001") - self.assertEqual(out["model_ckpt_meta"], work_dir / "nvnmd_cnn/model.ckpt.meta") - self.assertEqual(out["model_ckpt_index"], work_dir / "nvnmd_cnn/model.ckpt.index") + self.assertEqual( + out["script"], + work_dir / train_script_name + ) + self.assertEqual( + out["model"] / "frozen_model.pb", + work_dir / "nvnmd_models/frozen_model.pb", + ) + self.assertEqual( + out["model"] / "model.pb", + work_dir / "nvnmd_models/model.pb", + ) + self.assertEqual( + out["model"] / "model.ckpt.data-00000-of-00001", + work_dir / "nvnmd_models/model.ckpt.data-00000-of-00001", + ) + self.assertEqual( + out["model"] / "model.ckpt.meta", + work_dir / "nvnmd_models/model.ckpt.meta", + ) + self.assertEqual( + out["model"] / "model.ckpt.index", + work_dir / "nvnmd_models/model.ckpt.index", + ) self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") self.assertEqual(out["log"], work_dir / "train.log") calls = [ call(["dp", "train-nvnmd", train_cnn_script_name, "-s", "s1"]), - call(["dp", "train-nvnmd", 
train_qnn_script_name, "-s", "s2"]) + call(["dp", "train-nvnmd", train_qnn_script_name, "-s", "s2"]), ] mocked_run.assert_has_calls(calls) @@ -530,20 +573,35 @@ def test_exec_v2_init_model(self, mocked_run): "task_name": task_name, "task_path": Path(task_path), "init_model": Path(self.init_model), - "init_model_ckpt_meta": Path(self.init_model_ckpt_meta), - "init_model_ckpt_data": Path(self.init_model_ckpt_data), - "init_model_ckpt_index": Path(self.init_model_ckpt_index), "init_data": [Path(ii) for ii in self.init_data], "iter_data": [Path(ii) for ii in self.iter_data], } ) ) - self.assertEqual(out["script"], work_dir / train_cnn_script_name) - self.assertEqual(out["cnn_model"], work_dir / "nvnmd_cnn/frozen_model.pb") - self.assertEqual(out["qnn_model"], work_dir / "nvnmd_qnn/model.pb") - self.assertEqual(out["model_ckpt_data"], work_dir / "nvnmd_cnn/model.ckpt.data-00000-of-00001") - self.assertEqual(out["model_ckpt_meta"], work_dir / "nvnmd_cnn/model.ckpt.meta") - self.assertEqual(out["model_ckpt_index"], work_dir / "nvnmd_cnn/model.ckpt.index") + self.assertEqual( + out["script"], + work_dir / train_script_name + ) + self.assertEqual( + out["model"] / "frozen_model.pb", + work_dir / "nvnmd_models/frozen_model.pb", + ) + self.assertEqual( + out["model"] / "model.pb", + work_dir / "nvnmd_models/model.pb", + ) + self.assertEqual( + out["model"] / "model.ckpt.data-00000-of-00001", + work_dir / "nvnmd_models/model.ckpt.data-00000-of-00001", + ) + self.assertEqual( + out["model"] / "model.ckpt.meta", + work_dir / "nvnmd_models/model.ckpt.meta", + ) + self.assertEqual( + out["model"] / "model.ckpt.index", + work_dir / "nvnmd_models/model.ckpt.index", + ) self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") self.assertEqual(out["log"], work_dir / "train.log") @@ -556,7 +614,7 @@ def test_exec_v2_init_model(self, mocked_run): "model.ckpt", train_cnn_script_name, "-s", - "s1" + "s1", ] ) ] @@ -600,9 +658,6 @@ def test_exec_v2_train_error(self, mocked_run): 
"task_name": task_name, "task_path": Path(task_path), "init_model": Path(self.init_model), - "init_model_ckpt_meta": Path(self.init_model_ckpt_meta), - "init_model_ckpt_data": Path(self.init_model_ckpt_data), - "init_model_ckpt_index": Path(self.init_model_ckpt_index), "init_data": [Path(ii) for ii in self.init_data], "iter_data": [Path(ii) for ii in self.iter_data], } @@ -615,10 +670,11 @@ def test_exec_v2_train_error(self, mocked_run): mocked_run.assert_has_calls(calls) self.assertTrue(work_dir.is_dir()) - with open(work_dir / train_cnn_script_name) as fp: + with open(work_dir / train_script_name) as fp: jdata = json.load(fp) self.assertDictEqual(jdata, self.expected_odict_v2) + class TestRunNvNMDTrainNullIterData(unittest.TestCase): def setUp(self): self.atom_name = "foo" @@ -637,6 +693,7 @@ def setUp(self): self.init_model_ckpt_meta = Path("model.ckpt.meta") self.init_model_ckpt_data = Path("model.ckpt.data") self.init_model_ckpt_index = Path("model.ckpt.index") + self.init_model = Path("nvnmd_models") self.config = { "init_model_policy": "no", @@ -647,7 +704,7 @@ def setUp(self): "init_model_start_pref_f": 100, "init_model_start_pref_v": 0.0, } - self.config = RunNvNMDTrain.normalize_config(self.config) + self.config = RunDPTrain.normalize_config(self.config) self.task_name = "task-000" self.task_path = "input-000" @@ -678,7 +735,7 @@ def setUp(self): "auto_prob": "prob_sys_size", }, "disp_file": "lcurve.out", - "save_ckpt": "model.ckpt" + "save_ckpt": "model.ckpt", }, "learning_rate": { "start_lr": 1.0, @@ -697,7 +754,7 @@ def tearDown(self): def test_update_input_dict_v2_empty_list(self): idict = self.idict_v2 - odict = RunNvNMDTrain.write_data_to_input_script( + odict = RunDPTrain.write_data_to_input_script( idict, self.config, self.init_data, @@ -707,8 +764,8 @@ def test_update_input_dict_v2_empty_list(self): ) config = self.config.copy() config["init_model_policy"] = "no" - odict = RunNvNMDTrain.write_other_to_input_script( - odict, config, False, False, 
major_version="2" + odict = RunDPTrain.write_other_to_input_script( + odict, config, False, major_version="2", do_quantized=False ) self.assertDictEqual(odict, self.expected_odict_v2) @@ -736,26 +793,41 @@ def test_exec_v2_empty_dir(self, mocked_run): "task_name": task_name, "task_path": Path(task_path), "init_model": Path(self.init_model), - "init_model_ckpt_meta": Path(self.init_model_ckpt_meta), - "init_model_ckpt_data": Path(self.init_model_ckpt_data), - "init_model_ckpt_index": Path(self.init_model_ckpt_index), "init_data": [Path(ii) for ii in self.init_data], "iter_data": [empty_data], } ) ) - self.assertEqual(out["script"], work_dir / train_cnn_script_name) - self.assertEqual(out["cnn_model"], work_dir / "nvnmd_cnn/frozen_model.pb") - self.assertEqual(out["qnn_model"], work_dir / "nvnmd_qnn/model.pb") - self.assertEqual(out["model_ckpt_data"], work_dir / "nvnmd_cnn/model.ckpt.data-00000-of-00001") - self.assertEqual(out["model_ckpt_meta"], work_dir / "nvnmd_cnn/model.ckpt.meta") - self.assertEqual(out["model_ckpt_index"], work_dir / "nvnmd_cnn/model.ckpt.index") + self.assertEqual( + out["script"], + work_dir / train_script_name + ) + self.assertEqual( + out["model"] / "frozen_model.pb", + work_dir / "nvnmd_models/frozen_model.pb", + ) + self.assertEqual( + out["model"] / "model.pb", + work_dir / "nvnmd_models/model.pb", + ) + self.assertEqual( + out["model"] / "model.ckpt.data-00000-of-00001", + work_dir / "nvnmd_models/model.ckpt.data-00000-of-00001", + ) + self.assertEqual( + out["model"] / "model.ckpt.meta", + work_dir / "nvnmd_models/model.ckpt.meta", + ) + self.assertEqual( + out["model"] / "model.ckpt.index", + work_dir / "nvnmd_models/model.ckpt.index", + ) self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") self.assertEqual(out["log"], work_dir / "train.log") calls = [ call(["dp", "train-nvnmd", train_cnn_script_name, "-s", "s1"]), - call(["dp", "train-nvnmd", train_qnn_script_name, "-s", "s2"]) + call(["dp", "train-nvnmd", 
train_qnn_script_name, "-s", "s2"]), ] mocked_run.assert_has_calls(calls) diff --git a/tests/test_prep_run_dp_train.py b/tests/test_prep_run_dp_train.py index 536ca4b9..808d570a 100644 --- a/tests/test_prep_run_dp_train.py +++ b/tests/test_prep_run_dp_train.py @@ -53,8 +53,12 @@ MockedPrepDPTrain, MockedRunDPTrain, MockedRunDPTrainNoneInitModel, + MockedRunNvNMDTrain, + MockedRunNvNMDTrainCheckOptParam, + MockedRunNvNMDTrainNoneInitModel, make_mocked_init_data, make_mocked_init_models, + make_mocked_init_nvnmd_models, mocked_numb_models, mocked_template_script, ) @@ -144,6 +148,7 @@ def _check_lcurve( mlines = fp.read().strip().split("\n") tcase.assertEqual(flines[0], "read from train_script: ") for ii in range(len(mlines)): + print(flines[ii + 1], mlines[ii]) tcase.assertEqual(flines[ii + 1], mlines[ii]) @@ -173,6 +178,36 @@ def check_run_train_dp_output( os.chdir(cwd) +def check_run_train_nvnmd_output( + tcase, + work_dir, + script, + init_model, + init_data, + iter_data, + only_check_name=False, +): + cwd = os.getcwd() + os.chdir(work_dir) + _check_log( + tcase, + "log", + cwd, + script, + init_model, + init_data, + iter_data, + only_check_name=only_check_name, + ) + _check_model(tcase, "nvnmd_models/frozen_model.pb", cwd, init_model / "frozen_model.pb") + _check_model(tcase, "nvnmd_models/model.pb", cwd , init_model / "frozen_model.pb") + _check_model(tcase, "nvnmd_models/model.ckpt.meta", cwd, init_model / "model.ckpt.meta") + _check_model(tcase, "nvnmd_models/model.ckpt.data-00000-of-00001", cwd, init_model / "model.ckpt.data") + _check_model(tcase, "nvnmd_models/model.ckpt.index", cwd, init_model / "model.ckpt.index") + _check_lcurve(tcase, "nvnmd_cnn/lcurve.out", cwd, script) + os.chdir(cwd) + + class TestMockedPrepDPTrain(unittest.TestCase): def setUp(self): self.numb_models = mocked_numb_models @@ -270,6 +305,77 @@ def test(self): ) +class TestMockedRunNvNMDTrain(unittest.TestCase): + def setUp(self): + self.numb_models = mocked_numb_models + + 
self.init_models = make_mocked_init_nvnmd_models(self.numb_models) + + tmp_init_data = make_mocked_init_data() + self.init_data = tmp_init_data + + tmp_iter_data = [Path("iter_data/foo"), Path("iter_data/bar")] + for ii in tmp_iter_data: + ii.mkdir(exist_ok=True, parents=True) + (ii / "a").write_text("data a") + (ii / "b").write_text("data b") + self.iter_data = tmp_iter_data + + self.template_script = mocked_template_script.copy() + + self.task_names = ["task.0000", "task.0001", "task.0002"] + self.task_paths = [Path(ii) for ii in self.task_names] + self.train_scripts = [ + Path("task.0000/input.json"), + Path("task.0001/input.json"), + Path("task.0002/input.json"), + ] + + for ii in range(3): + Path(self.task_names[ii]).mkdir(exist_ok=True, parents=True) + Path(self.train_scripts[ii]).write_text("{}") + + def tearDown(self): + for ii in ["init_data", "iter_data"] + self.task_names: + if Path(ii).exists(): + shutil.rmtree(str(ii)) + for ii in self.init_models: + if Path(ii).exists(): + shutil.rmtree(ii) + + def test(self): + for ii in range(3): + run = MockedRunNvNMDTrain() + ip = OPIO( + { + "config": {}, + "task_name": self.task_names[ii], + "task_path": self.task_paths[ii], + "init_model": self.init_models[ii], + "init_data": self.init_data, + "iter_data": self.iter_data, + } + ) + op = run.execute(ip) + self.assertEqual(op["script"], Path(train_task_pattern % ii) / "input.json") + self.assertTrue(op["script"].is_file()) + self.assertEqual(op["model"] / "frozen_model.pb", Path(train_task_pattern % ii) / "nvnmd_models/frozen_model.pb") + self.assertEqual(op["model"] / "model.pb", Path(train_task_pattern % ii) / "nvnmd_models/model.pb") + self.assertEqual(op["model"] / "model.ckpt.meta", Path(train_task_pattern % ii) / "nvnmd_models/model.ckpt.meta") + self.assertEqual(op["model"] / "model.ckpt.data-00000-of-00001", Path(train_task_pattern % ii) / "nvnmd_models/model.ckpt.data-00000-of-00001") + self.assertEqual(op["model"] / "model.ckpt.index", 
Path(train_task_pattern % ii) / "nvnmd_models/model.ckpt.index") + self.assertEqual(op["log"], Path(train_task_pattern % ii) / "log") + self.assertEqual(op["lcurve"], Path(train_task_pattern % ii) / "nvnmd_cnn/lcurve.out") + check_run_train_nvnmd_output( + self, + self.task_names[ii], + self.train_scripts[ii], + self.init_models[ii], + self.init_data, + self.iter_data, + ) + + @unittest.skipIf(skip_ut_with_dflow, skip_ut_with_dflow_reason) class TestTrainDp(unittest.TestCase): def setUp(self): @@ -449,3 +555,127 @@ def test_finetune(self): self.path_iter_data, only_check_name=True, ) + + +@unittest.skipIf(skip_ut_with_dflow, skip_ut_with_dflow_reason) +class TestTrainNvNMD(unittest.TestCase): + def setUp(self): + self.numb_models = mocked_numb_models + + tmp_models = make_mocked_init_nvnmd_models(self.numb_models) + self.init_models = upload_artifact(tmp_models) + self.str_init_models = tmp_models + + tmp_init_data = make_mocked_init_data() + self.init_data = upload_artifact(tmp_init_data) + self.path_init_data = tmp_init_data + + tmp_iter_data = [Path("iter_data/foo"), Path("iter_data/bar")] + for ii in tmp_iter_data: + ii.mkdir(exist_ok=True, parents=True) + (ii / "a").write_text("data a") + (ii / "b").write_text("data b") + self.iter_data = upload_artifact(tmp_iter_data) + self.path_iter_data = tmp_iter_data + + self.template_script = mocked_template_script.copy() + + self.task_names = ["task.0000", "task.0001", "task.0002"] + self.task_paths = [Path(ii) for ii in self.task_names] + self.train_scripts = [ + Path("task.0000/input.json"), + Path("task.0001/input.json"), + Path("task.0002/input.json"), + ] + + def tearDown(self): + for ii in ["init_data", "iter_data"] + self.task_names: + if Path(ii).exists(): + shutil.rmtree(str(ii)) + for ii in self.str_init_models: + if Path(ii).exists(): + shutil.rmtree(ii) + + def test_train(self): + steps = PrepRunDPTrain( + "train-steps", + MockedPrepDPTrain, + MockedRunNvNMDTrain, + 
upload_python_packages=upload_python_packages, + prep_config=default_config, + run_config=default_config, + ) + train_step = Step( + "train-step", + template=steps, + parameters={ + "numb_models": self.numb_models, + "template_script": self.template_script, + "train_config": {}, + }, + artifacts={ + "init_models": self.init_models, + "init_data": self.init_data, + "iter_data": self.iter_data, + }, + ) + wf = Workflow(name="nvnmd-train", host=default_host) + wf.add(train_step) + wf.submit() + + while wf.query_status() in ["Pending", "Running"]: + time.sleep(4) + + self.assertEqual(wf.query_status(), "Succeeded") + step = wf.query_step(name="train-step")[0] + self.assertEqual(step.phase, "Succeeded") + + download_artifact(step.outputs.artifacts["scripts"]) + download_artifact(step.outputs.artifacts["models"]) + download_artifact(step.outputs.artifacts["logs"]) + download_artifact(step.outputs.artifacts["lcurves"]) + + for ii in range(3): + check_run_train_nvnmd_output( + self, + self.task_names[ii], + self.train_scripts[ii], + self.str_init_models[ii], + self.path_init_data, + self.path_iter_data, + only_check_name=True, + ) + + def test_train_no_init_model(self): + steps = PrepRunDPTrain( + "train-steps", + MockedPrepDPTrain, + MockedRunNvNMDTrainNoneInitModel, + upload_python_packages=upload_python_packages, + prep_config=default_config, + run_config=default_config, + ) + train_step = Step( + "train-step", + template=steps, + parameters={ + "numb_models": self.numb_models, + "template_script": self.template_script, + "train_config": {}, + }, + artifacts={ + "init_models": None, + "init_data": self.init_data, + "iter_data": self.iter_data, + }, + ) + wf = Workflow(name="nvnmd-train", host=default_host) + wf.add(train_step) + wf.submit() + + while wf.query_status() in ["Pending", "Running"]: + time.sleep(4) + + self.assertEqual(wf.query_status(), "Succeeded") + step = wf.query_step(name="train-step")[0] + self.assertEqual(step.phase, "Succeeded") diff --git 
a/tests/test_prep_run_lmp.py b/tests/test_prep_run_lmp.py index 3b350240..7923f025 100644 --- a/tests/test_prep_run_lmp.py +++ b/tests/test_prep_run_lmp.py @@ -53,6 +53,7 @@ ) from mocked_ops import ( MockedRunLmp, + MockedRunNvNMD, mocked_numb_models, ) @@ -218,6 +219,78 @@ def test(self): self.check_run_lmp_output(self.task_list_str[ii], self.model_list) +class TestMockedRunNvNMD(unittest.TestCase): + def setUp(self): + self.ntask = 2 + self.nmodels = 3 + self.task_list = [] + self.model_list = [] + for ii in range(self.ntask): + work_path = Path(lmp_task_pattern % ii) + work_path.mkdir(exist_ok=True, parents=True) + (work_path / lmp_conf_name).write_text(f"conf {ii}") + (work_path / lmp_input_name).write_text(f"input {ii}") + self.task_list.append(work_path) + for ii in range(self.nmodels): + model = Path(f"model{ii}.pb") + model.write_text(f"model {ii}") + self.model_list.append(model) + + def check_run_lmp_output( + self, + task_name: str, + models: List[Path], + ): + cwd = os.getcwd() + os.chdir(task_name) + fc = [] + for ii in [lmp_conf_name, lmp_input_name] + [ii.name for ii in models]: + fc.append(Path(ii).read_text()) + self.assertEqual(fc, Path(lmp_log_name).read_text().strip().split("\n")) + self.assertEqual( + f"traj of {task_name}", Path(lmp_traj_name).read_text().split("\n")[0] + ) + self.assertEqual( + f"model_devi of {task_name}", Path(lmp_model_devi_name).read_text() + ) + os.chdir(cwd) + + def tearDown(self): + for ii in range(self.ntask): + work_path = Path(lmp_task_pattern % ii) + if work_path.is_dir(): + shutil.rmtree(work_path) + for ii in range(self.nmodels): + model = Path(f"model{ii}.pb") + if model.is_file(): + os.remove(model) + + def test(self): + self.task_list_str = [str(ii) for ii in self.task_list] + self.model_list_str = [str(ii) for ii in self.model_list] + for ii in range(self.ntask): + ip = OPIO( + { + "task_name": self.task_list_str[ii], + "task_path": self.task_list[ii], + "models": self.model_list, + "config": {}, + } + ) + 
op = MockedRunNvNMD() + out = op.execute(ip) + self.assertEqual(out["log"], Path(f"task.{ii:06d}") / lmp_log_name) + self.assertEqual(out["traj"], Path(f"task.{ii:06d}") / lmp_traj_name) + self.assertEqual( + out["model_devi"], Path(f"task.{ii:06d}") / lmp_model_devi_name + ) + self.assertTrue(out["log"].is_file()) + self.assertTrue(out["traj"].is_file()) + self.assertTrue(out["model_devi"].is_file()) + self.check_run_lmp_output(self.task_list_str[ii], self.model_list) + + + # @unittest.skip("temp") @unittest.skipIf(skip_ut_with_dflow, skip_ut_with_dflow_reason) class TestPrepRunLmp(unittest.TestCase): diff --git a/tests/test_prep_run_nvnmd.py b/tests/test_prep_run_nvnmd.py deleted file mode 100644 index b64d1316..00000000 --- a/tests/test_prep_run_nvnmd.py +++ /dev/null @@ -1,307 +0,0 @@ -import json -import os -import pickle -import shutil -import time -import unittest -from pathlib import ( - Path, -) -from typing import ( - List, - Set, -) - -import jsonpickle -import numpy as np -from dflow import ( - InputArtifact, - InputParameter, - Inputs, - OutputArtifact, - OutputParameter, - Outputs, - S3Artifact, - Step, - Steps, - Workflow, - argo_range, - download_artifact, - upload_artifact, -) -from dflow.python import ( - OP, - OPIO, - Artifact, - OPIOSign, - PythonOPTemplate, -) - -try: - from context import ( - dpgen2, - ) -except ModuleNotFoundError: - # case of upload everything to argo, no context needed - pass -from context import ( - default_host, - default_image, - skip_ut_with_dflow, - skip_ut_with_dflow_reason, - upload_python_packages, -) -from mocked_ops import ( - MockedRunNvNMD, - mocked_numb_models, -) - -from dpgen2.constants import ( - lmp_conf_name, - lmp_input_name, - lmp_log_name, - lmp_model_devi_name, - lmp_task_pattern, - lmp_traj_name, - model_name_pattern, - train_log_name, - train_script_name, - train_task_pattern, -) -from dpgen2.exploration.task import ( - BaseExplorationTaskGroup, - ExplorationTask, -) -from dpgen2.op.prep_lmp import ( 
- PrepLmp, -) -from dpgen2.superop.prep_run_lmp import ( - PrepRunLmp, -) -from dpgen2.utils.step_config import normalize as normalize_step_dict - -default_config = normalize_step_dict( - { - "template_config": { - "image": default_image, - } - } -) - - -def make_task_group_list(ngrp, ntask_per_grp): - tgrp = BaseExplorationTaskGroup() - for ii in range(ngrp): - for jj in range(ntask_per_grp): - tt = ExplorationTask() - tt.add_file(lmp_conf_name, f"group{ii} task{jj} conf").add_file( - lmp_input_name, f"group{ii} task{jj} input" - ) - tgrp.add_task(tt) - return tgrp - - -def check_lmp_tasks(tcase, ngrp, ntask_per_grp): - cc = 0 - tdirs = [] - for ii in range(ngrp): - for jj in range(ntask_per_grp): - tdir = lmp_task_pattern % cc - tdirs.append(tdir) - tcase.assertTrue(Path(tdir).is_dir()) - fconf = Path(tdir) / lmp_conf_name - finpt = Path(tdir) / lmp_input_name - tcase.assertTrue(fconf.is_file()) - tcase.assertTrue(finpt.is_file()) - tcase.assertEqual(fconf.read_text(), f"group{ii} task{jj} conf") - tcase.assertEqual(finpt.read_text(), f"group{ii} task{jj} input") - cc += 1 - return tdirs - - -class TestPrepLmp(unittest.TestCase): - def setUp(self): - self.ngrp = 2 - self.ntask_per_grp = 3 - self.task_group_list = make_task_group_list(self.ngrp, self.ntask_per_grp) - - def tearDown(self): - for ii in range(self.ngrp * self.ntask_per_grp): - work_path = Path(lmp_task_pattern % ii) - if work_path.is_dir(): - shutil.rmtree(work_path) - - def test(self): - op = PrepLmp() - out = op.execute( - OPIO( - { - "lmp_task_grp": self.task_group_list, - } - ) - ) - tdirs = check_lmp_tasks(self, self.ngrp, self.ntask_per_grp) - tdirs = [str(ii) for ii in tdirs] - - self.assertEqual(tdirs, out["task_names"]) - self.assertEqual(tdirs, [str(ii) for ii in out["task_paths"]]) - - -class TestMockedRunNvNMD(unittest.TestCase): - def setUp(self): - self.ntask = 2 - self.nmodels = 3 - self.task_list = [] - self.model_list = [] - for ii in range(self.ntask): - work_path = 
Path(lmp_task_pattern % ii) - work_path.mkdir(exist_ok=True, parents=True) - (work_path / lmp_conf_name).write_text(f"conf {ii}") - (work_path / lmp_input_name).write_text(f"input {ii}") - self.task_list.append(work_path) - for ii in range(self.nmodels): - model = Path(f"model{ii}.pb") - model.write_text(f"model {ii}") - self.model_list.append(model) - - def check_run_lmp_output( - self, - task_name: str, - models: List[Path], - ): - cwd = os.getcwd() - os.chdir(task_name) - fc = [] - for ii in [lmp_conf_name, lmp_input_name] + [ii.name for ii in models]: - fc.append(Path(ii).read_text()) - self.assertEqual(fc, Path(lmp_log_name).read_text().strip().split("\n")) - self.assertEqual( - f"traj of {task_name}", Path(lmp_traj_name).read_text().split("\n")[0] - ) - self.assertEqual( - f"model_devi of {task_name}", Path(lmp_model_devi_name).read_text() - ) - os.chdir(cwd) - - def tearDown(self): - for ii in range(self.ntask): - work_path = Path(lmp_task_pattern % ii) - if work_path.is_dir(): - shutil.rmtree(work_path) - for ii in range(self.nmodels): - model = Path(f"model{ii}.pb") - if model.is_file(): - os.remove(model) - - def test(self): - self.task_list_str = [str(ii) for ii in self.task_list] - self.model_list_str = [str(ii) for ii in self.model_list] - for ii in range(self.ntask): - ip = OPIO( - { - "task_name": self.task_list_str[ii], - "task_path": self.task_list[ii], - "models": self.model_list, - "config": {}, - } - ) - op = MockedRunNvNMD() - out = op.execute(ip) - self.assertEqual(out["log"], Path(f"task.{ii:06d}") / lmp_log_name) - self.assertEqual(out["traj"], Path(f"task.{ii:06d}") / lmp_traj_name) - self.assertEqual( - out["model_devi"], Path(f"task.{ii:06d}") / lmp_model_devi_name - ) - self.assertTrue(out["log"].is_file()) - self.assertTrue(out["traj"].is_file()) - self.assertTrue(out["model_devi"].is_file()) - self.check_run_lmp_output(self.task_list_str[ii], self.model_list) - - -# @unittest.skip("temp") -@unittest.skipIf(skip_ut_with_dflow, 
skip_ut_with_dflow_reason) -class TestPrepRunNvNMD(unittest.TestCase): - def setUp(self): - self.ngrp = 2 - self.ntask_per_grp = 3 - self.task_group_list = make_task_group_list(self.ngrp, self.ntask_per_grp) - self.nmodels = mocked_numb_models - self.model_list = [] - for ii in range(self.nmodels): - model = Path(f"model{ii}.pb") - model.write_text(f"model {ii}") - self.model_list.append(model) - self.models = upload_artifact(self.model_list) - - def tearDown(self): - for ii in range(self.nmodels): - model = Path(f"model{ii}.pb") - if model.is_file(): - os.remove(model) - for ii in range(self.ngrp * self.ntask_per_grp): - work_path = Path(f"task.{ii:06d}") - if work_path.is_dir(): - shutil.rmtree(work_path) - - def check_run_lmp_output( - self, - task_name: str, - models: List[Path], - ): - cwd = os.getcwd() - os.chdir(task_name) - fc = [] - idx = int(task_name.split(".")[1]) - ii = idx // self.ntask_per_grp - jj = idx - ii * self.ntask_per_grp - fc.append(f"group{ii} task{jj} conf") - fc.append(f"group{ii} task{jj} input") - for ii in [ii.name for ii in models]: - fc.append((Path("..") / Path(ii)).read_text()) - self.assertEqual(fc, Path(lmp_log_name).read_text().strip().split("\n")) - self.assertEqual( - f"traj of {task_name}", Path(lmp_traj_name).read_text().split("\n")[0] - ) - self.assertEqual( - f"model_devi of {task_name}", Path(lmp_model_devi_name).read_text() - ) - os.chdir(cwd) - - def test(self): - steps = PrepRunLmp( - "prep-run-lmp", - PrepLmp, - MockedRunNvNMD, - upload_python_packages=upload_python_packages, - prep_config=default_config, - run_config=default_config, - ) - prep_run_step = Step( - "prep-run-step", - template=steps, - parameters={ - "explore_config": {}, - "expl_task_grp": self.task_group_list, - }, - artifacts={ - "models": self.models, - }, - ) - - wf = Workflow(name="dp-train", host=default_host) - wf.add(prep_run_step) - wf.submit() - - while wf.query_status() in ["Pending", "Running"]: - time.sleep(4) - - 
self.assertEqual(wf.query_status(), "Succeeded") - step = wf.query_step(name="prep-run-step")[0] - self.assertEqual(step.phase, "Succeeded") - - download_artifact(step.outputs.artifacts["model_devis"]) - download_artifact(step.outputs.artifacts["trajs"]) - download_artifact(step.outputs.artifacts["logs"]) - - for ii in step.outputs.parameters["task_names"].value: - self.check_run_lmp_output(ii, self.model_list) diff --git a/tests/test_prep_run_nvnmd_train.py b/tests/test_prep_run_nvnmd_train.py deleted file mode 100644 index 7defc199..00000000 --- a/tests/test_prep_run_nvnmd_train.py +++ /dev/null @@ -1,459 +0,0 @@ -import json -import os -import shutil -import time -import unittest -from pathlib import ( - Path, -) -from typing import ( - List, - Set, -) - -import numpy as np -from dflow import ( - InputArtifact, - InputParameter, - Inputs, - OutputArtifact, - OutputParameter, - Outputs, - S3Artifact, - Step, - Steps, - Workflow, - argo_range, - download_artifact, - upload_artifact, -) -from dflow.python import ( - OP, - OPIO, - Artifact, - OPIOSign, - PythonOPTemplate, -) - -try: - from context import ( - dpgen2, - ) -except ModuleNotFoundError: - # case of upload everything to argo, no context needed - pass -from context import ( - default_host, - default_image, - skip_ut_with_dflow, - skip_ut_with_dflow_reason, - upload_python_packages, -) -from mocked_ops import ( - MockedPrepNvNMDTrain, - MockedRunNvNMDTrain, - MockedRunNvNMDTrainNoneInitModel, - make_mocked_init_data, - make_mocked_init_models, - make_mocked_init_models_ckpt, - mocked_numb_models, - mocked_template_script, -) - -from dpgen2.constants import ( - train_task_pattern, -) -from dpgen2.superop.prep_run_nvnmd_train import ( - PrepRunNvNMDTrain, -) -from dpgen2.utils.step_config import normalize as normalize_step_dict - -default_config = normalize_step_dict( - { - "template_config": { - "image": default_image, - } - } -) - - -def _check_log( - tcase, fname, path, script, init_model, init_model_ckpt, 
init_data, iter_data, only_check_name=False -): - with open(fname) as fp: - lines_ = fp.read().strip().split("\n") - if only_check_name: - lines = [] - for ii in lines_: - ww = ii.split(" ") - ww[1] = str(Path(ww[1]).name) - lines.append(" ".join(ww)) - else: - lines = lines_ - revised_fname = lambda ff: Path(ff).name if only_check_name else Path(ff) - tcase.assertEqual( - lines[0].split(" "), - ["init_model", str(revised_fname(Path(path) / init_model)), "OK"], - ) - tcase.assertEqual( - lines[1].split(" "), - ["init_model_ckpt_meta", str(revised_fname(Path(path) / init_model_ckpt / "model.ckpt.meta")), "OK"], - ) - tcase.assertEqual( - lines[2].split(" "), - ["init_model_ckpt_data", str(revised_fname(Path(path) / init_model_ckpt / "model.ckpt.data")), "OK"], - ) - tcase.assertEqual( - lines[3].split(" "), - ["init_model_ckpt_index", str(revised_fname(Path(path) / init_model_ckpt / "model.ckpt.index")), "OK"], - ) - for ii in range(2): - tcase.assertEqual( - lines[4 + ii].split(" "), - [ - "data", - str(revised_fname(Path(path) / sorted(list(init_data))[ii])), - "OK", - ], - ) - for ii in range(2): - tcase.assertEqual( - lines[6 + ii].split(" "), - [ - "data", - str(revised_fname(Path(path) / sorted(list(iter_data))[ii])), - "OK", - ], - ) - tcase.assertEqual( - lines[8].split(" "), ["script", str(revised_fname(Path(path) / script)), "OK"] - ) - - -def _check_model( - tcase, - fname, - path, - model, -): - with open(fname) as fp: - flines = fp.read().strip().split("\n") - with open(Path(path) / model) as fp: - mlines = fp.read().strip().split("\n") - tcase.assertEqual(flines[0], "read from init model: ") - for ii in range(len(mlines)): - tcase.assertEqual(flines[ii + 1], mlines[ii]) - - -def _check_model_ckpt( - tcase, - fname, - path, - model, -): - with open(fname) as fp: - flines = fp.read().strip().split("\n") - with open(Path(path) / model) as fp: - mlines = fp.read().strip().split("\n") - tcase.assertEqual(flines[0], "read from init model ckpt: ") - for ii in 
range(len(mlines)): - tcase.assertEqual(flines[ii + 1], mlines[ii]) - - -def _check_lcurve( - tcase, - fname, - path, - script, -): - with open(fname) as fp: - flines = fp.read().strip().split("\n") - with open(Path(path) / script) as fp: - mlines = fp.read().strip().split("\n") - tcase.assertEqual(flines[0], "read from train_script: ") - for ii in range(len(mlines)): - tcase.assertEqual(flines[ii + 1], mlines[ii]) - - -def check_run_train_nvnmd_output( - tcase, - work_dir, - script, - init_model, - init_model_ckpt, - init_data, - iter_data, - only_check_name=False, -): - cwd = os.getcwd() - os.chdir(work_dir) - _check_log( - tcase, - "log", - cwd, - script, - init_model, - init_model_ckpt, - init_data, - iter_data, - only_check_name=only_check_name, - ) - _check_model(tcase, "nvnmd_cnn/frozen_model.pb", cwd, init_model) - _check_model(tcase, "nvnmd_qnn/model.pb", cwd, init_model) - _check_model_ckpt(tcase, "nvnmd_cnn/model.ckpt.meta", cwd, init_model_ckpt / "model.ckpt.meta") - _check_model_ckpt(tcase, "nvnmd_cnn/model.ckpt.data-00000-of-00001", cwd, init_model_ckpt / "model.ckpt.data") - _check_model_ckpt(tcase, "nvnmd_cnn/model.ckpt.index", cwd, init_model_ckpt / "model.ckpt.index") - _check_lcurve(tcase, "nvnmd_cnn/lcurve.out", cwd, script) - os.chdir(cwd) - - -class TestMockedPrepNvNMDTrain(unittest.TestCase): - def setUp(self): - self.numb_models = mocked_numb_models - self.template_script = mocked_template_script.copy() - self.expected_subdirs = ["task.0000", "task.0001", "task.0002"] - self.expected_train_scripts = [ - Path("task.0000/input.json"), - Path("task.0001/input.json"), - Path("task.0002/input.json"), - ] - - def tearDown(self): - for ii in self.expected_subdirs: - if Path(ii).exists(): - shutil.rmtree(ii) - - def test(self): - prep = MockedPrepNvNMDTrain() - ip = OPIO( - { - "template_script": self.template_script, - "numb_models": self.numb_models, - } - ) - op = prep.execute(ip) - # self.assertEqual(self.expected_train_scripts, 
op["train_scripts"]) - self.assertEqual(self.expected_subdirs, op["task_names"]) - self.assertEqual([Path(ii) for ii in self.expected_subdirs], op["task_paths"]) - - -class TestMockedRunNvNMDTrain(unittest.TestCase): - def setUp(self): - self.numb_models = mocked_numb_models - - self.init_models = make_mocked_init_models(self.numb_models) - self.init_models_ckpt = make_mocked_init_models_ckpt(self.numb_models) - - tmp_init_data = make_mocked_init_data() - self.init_data = tmp_init_data - - tmp_iter_data = [Path("iter_data/foo"), Path("iter_data/bar")] - for ii in tmp_iter_data: - ii.mkdir(exist_ok=True, parents=True) - (ii / "a").write_text("data a") - (ii / "b").write_text("data b") - self.iter_data = tmp_iter_data - - self.template_script = mocked_template_script.copy() - - self.task_names = ["task.0000", "task.0001", "task.0002"] - self.task_paths = [Path(ii) for ii in self.task_names] - self.train_scripts = [ - Path("task.0000/input.json"), - Path("task.0001/input.json"), - Path("task.0002/input.json"), - ] - - for ii in range(3): - Path(self.task_names[ii]).mkdir(exist_ok=True, parents=True) - Path(self.train_scripts[ii]).write_text("{}") - - def tearDown(self): - for ii in ["init_data", "iter_data"] + self.task_names: - if Path(ii).exists(): - shutil.rmtree(str(ii)) - for ii in self.init_models: - if Path(ii).exists(): - os.remove(ii) - for ii in self.init_models_ckpt: - if Path(ii).exists(): - shutil.rmtree(ii) - - def test(self): - for ii in range(3): - run = MockedRunNvNMDTrain() - ip = OPIO( - { - "config": {}, - "task_name": self.task_names[ii], - "task_path": self.task_paths[ii], - "init_model": self.init_models[ii], - "init_model_ckpt_meta": self.init_models_ckpt[ii] / "model.ckpt.meta", - "init_model_ckpt_data": self.init_models_ckpt[ii] / "model.ckpt.data", - "init_model_ckpt_index": self.init_models_ckpt[ii] / "model.ckpt.index", - "init_data": self.init_data, - "iter_data": self.iter_data, - } - ) - op = run.execute(ip) - 
self.assertEqual(op["script"], Path(train_task_pattern % ii) / "input.json") - self.assertTrue(op["script"].is_file()) - self.assertEqual(op["cnn_model"], Path(train_task_pattern % ii) / "nvnmd_cnn" / "frozen_model.pb") - self.assertEqual(op["qnn_model"], Path(train_task_pattern % ii) / "nvnmd_qnn" / "model.pb") - self.assertEqual(op["model_ckpt_data"], Path(train_task_pattern % ii) / "nvnmd_cnn" / "model.ckpt.data-00000-of-00001") - self.assertEqual(op["model_ckpt_meta"], Path(train_task_pattern % ii) / "nvnmd_cnn" /"model.ckpt.meta") - self.assertEqual(op["model_ckpt_index"], Path(train_task_pattern % ii) / "nvnmd_cnn" /"model.ckpt.index") - self.assertEqual(op["log"], Path(train_task_pattern % ii) / "log") - self.assertEqual(op["lcurve"], Path(train_task_pattern % ii) / "nvnmd_cnn" / "lcurve.out") - check_run_train_nvnmd_output( - self, - self.task_names[ii], - self.train_scripts[ii], - self.init_models[ii], - self.init_models_ckpt[ii], - self.init_data, - self.iter_data, - ) - -@unittest.skipIf(skip_ut_with_dflow, skip_ut_with_dflow_reason) -class TestTrainNvNMD(unittest.TestCase): - def setUp(self): - self.numb_models = mocked_numb_models - - tmp_models = make_mocked_init_models(self.numb_models) - self.init_models = upload_artifact(tmp_models) - self.str_init_models = tmp_models - - tmp_models_ckpt = make_mocked_init_models_ckpt(self.numb_models) - self.init_models_ckpt_meta = upload_artifact([dir / "model.ckpt.meta" for dir in tmp_models_ckpt]) - self.init_models_ckpt_data = upload_artifact([dir / "model.ckpt.data" for dir in tmp_models_ckpt]) - self.init_models_ckpt_index = upload_artifact([dir / "model.ckpt.index" for dir in tmp_models_ckpt]) - self.str_init_models_ckpt = tmp_models_ckpt - - tmp_init_data = make_mocked_init_data() - self.init_data = upload_artifact(tmp_init_data) - self.path_init_data = tmp_init_data - - tmp_iter_data = [Path("iter_data/foo"), Path("iter_data/bar")] - for ii in tmp_iter_data: - ii.mkdir(exist_ok=True, parents=True) - (ii / 
"a").write_text("data a") - (ii / "b").write_text("data b") - self.iter_data = upload_artifact(tmp_iter_data) - self.path_iter_data = tmp_iter_data - - self.template_script = mocked_template_script.copy() - - self.task_names = ["task.0000", "task.0001", "task.0002"] - self.task_paths = [Path(ii) for ii in self.task_names] - self.train_scripts = [ - Path("task.0000/input.json"), - Path("task.0001/input.json"), - Path("task.0002/input.json"), - ] - - def tearDown(self): - for ii in ["init_data", "iter_data"] + self.task_names: - if Path(ii).exists(): - shutil.rmtree(str(ii)) - for ii in self.str_init_models: - if Path(ii).exists(): - os.remove(ii) - for ii in self.str_init_models_ckpt: - if Path(ii).exists(): - shutil.rmtree(ii) - - def test_train(self): - steps = PrepRunNvNMDTrain( - "train-steps", - MockedPrepNvNMDTrain, - MockedRunNvNMDTrain, - upload_python_packages=upload_python_packages, - prep_config=default_config, - run_config=default_config, - ) - train_step = Step( - "train-step", - template=steps, - parameters={ - "numb_models": self.numb_models, - "template_script": self.template_script, - "train_config": {}, - }, - artifacts={ - "init_models": self.init_models, - "init_models_ckpt_meta": self.init_models_ckpt_meta, - "init_models_ckpt_data": self.init_models_ckpt_data, - "init_models_ckpt_index": self.init_models_ckpt_index, - "init_data": self.init_data, - "iter_data": self.iter_data, - }, - ) - wf = Workflow(name="nvnmd-train", host=default_host) - wf.add(train_step) - wf.submit() - - while wf.query_status() in ["Pending", "Running"]: - time.sleep(4) - - self.assertEqual(wf.query_status(), "Succeeded") - step = wf.query_step(name="train-step")[0] - self.assertEqual(step.phase, "Succeeded") - - download_artifact(step.outputs.artifacts["scripts"]) - download_artifact(step.outputs.artifacts["models"]) - download_artifact(step.outputs.artifacts["models_ckpt_meta"]) - download_artifact(step.outputs.artifacts["models_ckpt_data"]) - 
download_artifact(step.outputs.artifacts["models_ckpt_index"]) - download_artifact(step.outputs.artifacts["nvnmodels"]) - download_artifact(step.outputs.artifacts["logs"]) - download_artifact(step.outputs.artifacts["lcurves"]) - - for ii in range(3): - check_run_train_nvnmd_output( - self, - self.task_names[ii], - self.train_scripts[ii], - self.str_init_models[ii], - self.str_init_models_ckpt[ii], - self.path_init_data, - self.path_iter_data, - only_check_name=True, - ) - - def test_train_no_init_model(self): - steps = PrepRunNvNMDTrain( - "train-steps", - MockedPrepNvNMDTrain, - MockedRunNvNMDTrainNoneInitModel, - upload_python_packages=upload_python_packages, - prep_config=default_config, - run_config=default_config, - ) - train_step = Step( - "train-step", - template=steps, - parameters={ - "numb_models": self.numb_models, - "template_script": self.template_script, - "train_config": {}, - }, - artifacts={ - "init_models": None, - "init_models_ckpt_meta": None, - "init_models_ckpt_data": None, - "init_models_ckpt_index": None, - "init_data": self.init_data, - "iter_data": self.iter_data, - }, - ) - wf = Workflow(name="nvnmd-train", host=default_host) - wf.add(train_step) - wf.submit() - - while wf.query_status() in ["Pending", "Running"]: - time.sleep(4) - - self.assertEqual(wf.query_status(), "Succeeded") - step = wf.query_step(name="train-step")[0] - self.assertEqual(step.phase, "Succeeded") From 38c50ea0644234e1e40afb79e3e6e126a4a0f0d8 Mon Sep 17 00:00:00 2001 From: jiongwalai <843497845@qq.com> Date: Sun, 22 Jun 2025 18:52:06 +0800 Subject: [PATCH 37/49] fix download testunit --- dpgen2/superop/prep_run_dp_train.py | 3 +- tests/utils/test_dl_dpgen2_arti.py | 60 ----------------------------- 2 files changed, 2 insertions(+), 61 deletions(-) diff --git a/dpgen2/superop/prep_run_dp_train.py b/dpgen2/superop/prep_run_dp_train.py index 72a60b41..752a152a 100644 --- a/dpgen2/superop/prep_run_dp_train.py +++ b/dpgen2/superop/prep_run_dp_train.py @@ -11,6 +11,7 @@ 
Optional, Set, Type, + Union, ) from dflow import ( @@ -60,7 +61,7 @@ def __init__( self, name: str, prep_train_op: Type[OP], - run_train_op: Type[OP], + run_train_op: Type[Union[RunDPTrain, RunNvNMDTrain]], prep_config: Optional[dict] = None, run_config: Optional[dict] = None, upload_python_packages: Optional[List[os.PathLike]] = None, diff --git a/tests/utils/test_dl_dpgen2_arti.py b/tests/utils/test_dl_dpgen2_arti.py index b9b85575..c1166678 100644 --- a/tests/utils/test_dl_dpgen2_arti.py +++ b/tests/utils/test_dl_dpgen2_arti.py @@ -66,21 +66,6 @@ def test_train_download(self, mocked_dl): path=Path("foo/iter-000000/prep-run-train/inputs"), skip_exists=True, ), - mock.call( - "arti-init_models_ckpt_meta", - path=Path("foo/iter-000000/prep-run-train/inputs"), - skip_exists=True, - ), - mock.call( - "arti-init_models_ckpt_data", - path=Path("foo/iter-000000/prep-run-train/inputs"), - skip_exists=True, - ), - mock.call( - "arti-init_models_ckpt_index", - path=Path("foo/iter-000000/prep-run-train/inputs"), - skip_exists=True, - ), mock.call( "arti-init_data", path=Path("foo/iter-000000/prep-run-train/inputs"), @@ -101,21 +86,6 @@ def test_train_download(self, mocked_dl): path=Path("foo/iter-000000/prep-run-train/outputs"), skip_exists=True, ), - mock.call( - "arti-models_ckpt_meta", - path=Path("foo/iter-000000/prep-run-train/outputs"), - skip_exists=True, - ), - mock.call( - "arti-models_ckpt_data", - path=Path("foo/iter-000000/prep-run-train/outputs"), - skip_exists=True, - ), - mock.call( - "arti-models_ckpt_index", - path=Path("foo/iter-000000/prep-run-train/outputs"), - skip_exists=True, - ), mock.call( "arti-logs", path=Path("foo/iter-000000/prep-run-train/outputs"), @@ -297,21 +267,6 @@ def test_update_finished_steps_none_steps(self, mocked_dl): path=Path("iter-000000/prep-run-train/inputs"), skip_exists=True, ), - mock.call( - "arti-init_models_ckpt_meta", - path=Path("iter-000000/prep-run-train/inputs"), - skip_exists=True, - ), - mock.call( - 
"arti-init_models_ckpt_data", - path=Path("iter-000000/prep-run-train/inputs"), - skip_exists=True, - ), - mock.call( - "arti-init_models_ckpt_index", - path=Path("iter-000000/prep-run-train/inputs"), - skip_exists=True, - ), mock.call( "arti-init_data", path=Path("iter-000000/prep-run-train/inputs"), @@ -332,21 +287,6 @@ def test_update_finished_steps_none_steps(self, mocked_dl): path=Path("iter-000000/prep-run-train/outputs"), skip_exists=True, ), - mock.call( - "arti-models_ckpt_meta", - path=Path("iter-000000/prep-run-train/outputs"), - skip_exists=True, - ), - mock.call( - "arti-models_ckpt_data", - path=Path("iter-000000/prep-run-train/outputs"), - skip_exists=True, - ), - mock.call( - "arti-models_ckpt_index", - path=Path("iter-000000/prep-run-train/outputs"), - skip_exists=True, - ), mock.call( "arti-logs", path=Path("iter-000000/prep-run-train/outputs"), From d33ce44512419f0d5a1a1f771362955ace4731db Mon Sep 17 00:00:00 2001 From: jiongwalai <843497845@qq.com> Date: Sun, 22 Jun 2025 19:53:57 +0800 Subject: [PATCH 38/49] fix download testunit --- tests/utils/test_dl_dpgen2_arti.py | 3 +++ tests/utils/test_dl_dpgen2_arti_by_def.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/tests/utils/test_dl_dpgen2_arti.py b/tests/utils/test_dl_dpgen2_arti.py index c1166678..a9f251ac 100644 --- a/tests/utils/test_dl_dpgen2_arti.py +++ b/tests/utils/test_dl_dpgen2_arti.py @@ -28,6 +28,9 @@ class MockedArti: + def get(self, key): + return self.__getitem__(key) + def __getitem__( self, key, diff --git a/tests/utils/test_dl_dpgen2_arti_by_def.py b/tests/utils/test_dl_dpgen2_arti_by_def.py index 91d35e93..e6a30a32 100644 --- a/tests/utils/test_dl_dpgen2_arti_by_def.py +++ b/tests/utils/test_dl_dpgen2_arti_by_def.py @@ -30,6 +30,9 @@ class MockedArti: + def get(self, key): + return self.__getitem__(key) + def __getitem__( self, key, From d7eac9ae31cf08ebcdabca1d98d7a5fd61e9a08c Mon Sep 17 00:00:00 2001 From: jiongwalai <843497845@qq.com> Date: Sun, 22 Jun 2025 20:30:16 
+0800 Subject: [PATCH 39/49] fix lmp template task group in lmp-nvnmd --- dpgen2/exploration/task/lmp_template_task_group.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpgen2/exploration/task/lmp_template_task_group.py b/dpgen2/exploration/task/lmp_template_task_group.py index 27075695..81288c5d 100644 --- a/dpgen2/exploration/task/lmp_template_task_group.py +++ b/dpgen2/exploration/task/lmp_template_task_group.py @@ -223,7 +223,7 @@ def revise_lmp_input_plm(lmp_lines, in_plm, out_plm="output.plumed"): def revise_lmp_input_rerun(lmp_lines): lmp_lines.append("jump SELF end") lmp_lines.append("label rerun") - lmp_lines.append(f"rerun {lmp_traj_name}.0 dump x y z fx fy fz add yes") + lmp_lines.append(f"rerun {lmp_traj_name}_0 dump x y z fx fy fz add yes") lmp_lines.append("label end") return lmp_lines From ed4cbbe75d3639b1730c0bdf2607982b11ac6556 Mon Sep 17 00:00:00 2001 From: jiongwalai <843497845@qq.com> Date: Mon, 23 Jun 2025 01:52:08 +0800 Subject: [PATCH 40/49] fix lmp-nvnmd ipnut file --- dpgen2/exploration/task/lmp/lmp_input.py | 8 ++++---- dpgen2/exploration/task/lmp_template_task_group.py | 4 ++-- dpgen2/op/run_nvnmd.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dpgen2/exploration/task/lmp/lmp_input.py b/dpgen2/exploration/task/lmp/lmp_input.py index 3a08bbbc..969ac5aa 100644 --- a/dpgen2/exploration/task/lmp/lmp_input.py +++ b/dpgen2/exploration/task/lmp/lmp_input.py @@ -141,7 +141,7 @@ def make_lmp_input( if nvnmd_version is None: ret += "dump 1 all custom ${DUMP_FREQ} traj/*.lammpstrj id type x y z fx fy fz\n" else: - ret += "dump 1 all custom ${DUMP_FREQ} traj_${rerun}/*.lammpstrj id type x y z fx fy fz\n" + ret += "dump 1 all custom ${DUMP_FREQ} ${rerun}_traj/*.lammpstrj id type x y z fx fy fz\n" else: lmp_traj_file_name = ( lmp_pimd_traj_name % pimd_bead if pimd_bead is not None else lmp_traj_name @@ -153,7 +153,7 @@ def make_lmp_input( ) else: ret += ( - "dump 1 all custom ${DUMP_FREQ} %s_${rerun} id 
type x y z fx fy fz\n" + "dump 1 all custom ${DUMP_FREQ} ${rerun}_%s id type x y z fx fy fz\n" % lmp_traj_file_name ) ret += "restart 10000 dpgen.restart\n" @@ -211,8 +211,8 @@ def make_lmp_input( ret += "jump SELF end\n" ret += "label rerun\n" if trj_seperate_files: - ret += "rerun traj_0/*.lammpstrj dump x y z fx fy fz add yes\n" + ret += "rerun 0_traj/*.lammpstrj dump x y z fx fy fz add yes\n" else: - ret += "rerun %s_0 dump x y z fx fy fz add yes\n" % lmp_traj_name + ret += "rerun 0_%s dump x y z fx fy fz add yes\n" % lmp_traj_name ret += "label end\n" return ret diff --git a/dpgen2/exploration/task/lmp_template_task_group.py b/dpgen2/exploration/task/lmp_template_task_group.py index 81288c5d..486b0f5d 100644 --- a/dpgen2/exploration/task/lmp_template_task_group.py +++ b/dpgen2/exploration/task/lmp_template_task_group.py @@ -206,7 +206,7 @@ def revise_lmp_input_dump(lmp_lines, trj_freq, pimd_bead=None, nvnmd_version=Non else: lmp_lines[ idx - ] = f"dump dpgen_dump all custom {trj_freq} {lmp_traj_file_name} id type x y z fx fy fz" + ] = "dump dpgen_dump all custom %s ${rerun}_%s id type x y z fx fy fz" % (trj_freq, lmp_traj_file_name) lmp_lines.insert(idx + 1, 'if "${rerun} > 0" then "jump SELF rerun"') return lmp_lines @@ -223,7 +223,7 @@ def revise_lmp_input_plm(lmp_lines, in_plm, out_plm="output.plumed"): def revise_lmp_input_rerun(lmp_lines): lmp_lines.append("jump SELF end") lmp_lines.append("label rerun") - lmp_lines.append(f"rerun {lmp_traj_name}_0 dump x y z fx fy fz add yes") + lmp_lines.append(f"rerun 0_{lmp_traj_name} dump x y z fx fy fz add yes") lmp_lines.append("label end") return lmp_lines diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index 064ac8f8..2eef4060 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -215,7 +215,7 @@ def execute( ret_dict = { - "log": work_dir / lmp_log_name, + "log": work_dir / ("%d_%s"%(0, lmp_log_name)), "traj": work_dir / ("%d_%s" % (0, lmp_traj_name)), "model_devi": 
self.get_model_devi(work_dir / lmp_model_devi_name), } @@ -253,7 +253,7 @@ def set_lmp_models(lmp_input_name: str, model_names: List[str]): lmp_input_lines[idx] = " ".join(new_line_split) + "\n" - with open(lmp_input_name + ".%d"%(ii), "w", encoding="utf8") as f: + with open("%d_%s"%(ii,lmp_input_name), "w", encoding="utf8") as f: f.write("".join(lmp_input_lines)) From 179fa31ff0d53da09a7bb45d0e164cd4cd28ca28 Mon Sep 17 00:00:00 2001 From: jiongwalai <843497845@qq.com> Date: Mon, 23 Jun 2025 09:27:06 +0800 Subject: [PATCH 41/49] fix lmp-nvnmd testunit --- tests/op/test_run_nvnmd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/op/test_run_nvnmd.py b/tests/op/test_run_nvnmd.py index 2340d2bb..04a02059 100644 --- a/tests/op/test_run_nvnmd.py +++ b/tests/op/test_run_nvnmd.py @@ -87,7 +87,7 @@ def test_success(self, mocked_run): ) work_dir = Path(self.task_name) # check output - self.assertEqual(out["log"], work_dir / lmp_log_name) + self.assertEqual(out["log"], work_dir / ("0_%s"%lmp_log_name)) self.assertEqual(out["traj"], work_dir / ("0_%s"%lmp_traj_name)) self.assertEqual(out["model_devi"], work_dir / lmp_model_devi_name) # check call From a232ae1fab64e921e5b8945fef7bc559cbcd9180 Mon Sep 17 00:00:00 2001 From: jiongwalai <843497845@qq.com> Date: Fri, 27 Jun 2025 00:27:29 +0800 Subject: [PATCH 42/49] delete info --- dpgen2/op/run_nvnmd.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index 2eef4060..73994b45 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -174,7 +174,6 @@ def execute( set_lmp_models(lmp_input_name, model_names) # run lmp - #for ii in range(1): for ii in range(len(model_names)): commands = " ".join( [ From 7a5924db04246812030de81c2f6a6767fb718646 Mon Sep 17 00:00:00 2001 From: jiongwalai <843497845@qq.com> Date: Fri, 27 Jun 2025 09:03:21 +0800 Subject: [PATCH 43/49] remove preprunnvnmd superop --- dpgen2/entrypoint/submit.py | 2 -- dpgen2/op/__init__.py | 
3 --- dpgen2/superop/block.py | 4 +--- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/dpgen2/entrypoint/submit.py b/dpgen2/entrypoint/submit.py index 7726eb68..a39d6cf8 100644 --- a/dpgen2/entrypoint/submit.py +++ b/dpgen2/entrypoint/submit.py @@ -105,7 +105,6 @@ PrepCalyModelDevi, PrepDPTrain, PrepLmp, - PrepNvNMDTrain, PrepRelax, RunCalyDPOptim, RunCalyModelDevi, @@ -128,7 +127,6 @@ PrepRunDPTrain, PrepRunFp, PrepRunLmp, - PrepRunNvNMDTrain, ) from dpgen2.superop.caly_evo_step import ( CalyEvoStep, diff --git a/dpgen2/op/__init__.py b/dpgen2/op/__init__.py index a2d43f58..fa1c020c 100644 --- a/dpgen2/op/__init__.py +++ b/dpgen2/op/__init__.py @@ -22,9 +22,6 @@ from .prep_lmp import ( PrepLmp, ) -from .prep_nvnmd_train import ( - PrepNvNMDTrain, -) from .prep_relax import ( PrepRelax, ) diff --git a/dpgen2/superop/block.py b/dpgen2/superop/block.py index ad553c35..922e3a5e 100644 --- a/dpgen2/superop/block.py +++ b/dpgen2/superop/block.py @@ -64,9 +64,7 @@ from .prep_run_lmp import ( PrepRunLmp, ) -from .prep_run_nvnmd_train import ( - PrepRunNvNMDTrain, -) + block_default_optional_parameter = { "data_mixed_type": False, From ede835ad17e8fc62206ace495ab44ccda2f43050 Mon Sep 17 00:00:00 2001 From: jiongwalai <843497845@qq.com> Date: Fri, 27 Jun 2025 09:06:06 +0800 Subject: [PATCH 44/49] remove prep_run_nvnmd_train superop --- dpgen2/superop/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dpgen2/superop/__init__.py b/dpgen2/superop/__init__.py index 50e0a5d7..0223605f 100644 --- a/dpgen2/superop/__init__.py +++ b/dpgen2/superop/__init__.py @@ -16,6 +16,3 @@ from .prep_run_lmp import ( PrepRunLmp, ) -from .prep_run_nvnmd_train import ( - PrepRunNvNMDTrain, -) From 2b14439fe8aa2069a37beb2e8b9f8a350a5628a0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 27 Jun 2025 01:38:47 +0000 Subject: [PATCH 45/49] [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci --- .../exploration/render/traj_render_lammps.py | 4 +- .../task/lmp_template_task_group.py | 7 +- dpgen2/op/prep_dp_train.py | 4 +- dpgen2/op/run_dp_train.py | 7 +- dpgen2/op/run_nvnmd.py | 26 +++---- dpgen2/op/run_nvnmd_train.py | 39 +++++----- dpgen2/superop/block.py | 1 - tests/mocked_ops.py | 28 ++++---- tests/op/test_prep_dp_train.py | 2 +- tests/op/test_run_dp_train.py | 2 +- tests/op/test_run_nvnmd.py | 23 +++--- tests/op/test_run_nvnmd_train.py | 71 ++++++++----------- tests/test_prep_run_dp_train.py | 57 +++++++++++---- tests/test_prep_run_lmp.py | 1 - 14 files changed, 145 insertions(+), 127 deletions(-) diff --git a/dpgen2/exploration/render/traj_render_lammps.py b/dpgen2/exploration/render/traj_render_lammps.py index 0b1c36f9..2f4e56d9 100644 --- a/dpgen2/exploration/render/traj_render_lammps.py +++ b/dpgen2/exploration/render/traj_render_lammps.py @@ -109,10 +109,10 @@ def get_confs( conf_filters: Optional["ConfFilters"] = None, optional_outputs: Optional[List[Path]] = None, ) -> dpdata.MultiSystems: - - from ase.io import( # type: ignore + from ase.io import ( # type: ignore read, ) + ntraj = len(trajs) ele_temp = None if optional_outputs: diff --git a/dpgen2/exploration/task/lmp_template_task_group.py b/dpgen2/exploration/task/lmp_template_task_group.py index 486b0f5d..03eb9a66 100644 --- a/dpgen2/exploration/task/lmp_template_task_group.py +++ b/dpgen2/exploration/task/lmp_template_task_group.py @@ -204,9 +204,10 @@ def revise_lmp_input_dump(lmp_lines, trj_freq, pimd_bead=None, nvnmd_version=Non idx ] = f"dump dpgen_dump all custom {trj_freq} {lmp_traj_file_name} id type x y z" else: - lmp_lines[ - idx - ] = "dump dpgen_dump all custom %s ${rerun}_%s id type x y z fx fy fz" % (trj_freq, lmp_traj_file_name) + lmp_lines[idx] = ( + "dump dpgen_dump all custom %s ${rerun}_%s id type x y z fx fy fz" + % (trj_freq, lmp_traj_file_name) + ) lmp_lines.insert(idx + 1, 'if "${rerun} > 0" then "jump SELF rerun"') return 
lmp_lines diff --git a/dpgen2/op/prep_dp_train.py b/dpgen2/op/prep_dp_train.py index f5767bcd..10bd0674 100644 --- a/dpgen2/op/prep_dp_train.py +++ b/dpgen2/op/prep_dp_train.py @@ -123,7 +123,9 @@ def _script_rand_seed( if "model_dict" in jtmp["model"]: for d in jtmp["model"]["model_dict"].values(): if isinstance(d["descriptor"], str): - self._set_desc_seed(jtmp["model"]["shared_dict"][d["descriptor"]]) + self._set_desc_seed( + jtmp["model"]["shared_dict"][d["descriptor"]] + ) d["fitting_net"]["seed"] = random.randrange(sys.maxsize) % (2**32) else: self._set_desc_seed(jtmp["model"]["descriptor"]) diff --git a/dpgen2/op/run_dp_train.py b/dpgen2/op/run_dp_train.py index 57797a81..4c271a7d 100644 --- a/dpgen2/op/run_dp_train.py +++ b/dpgen2/op/run_dp_train.py @@ -1,3 +1,4 @@ +import copy import glob import json import logging @@ -5,7 +6,6 @@ import os import random import shutil -import copy from pathlib import ( Path, ) @@ -440,15 +440,14 @@ def write_other_to_input_script( raise RuntimeError( "unsupported DeePMD-kit major version", major_version ) - + if do_quantized: if major_version == "1": odict["training"]["stop_batch"] = 0 elif major_version == "2": odict["training"]["numb_steps"] = 0 - - return odict + return odict @staticmethod def skip_training( diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index 73994b45..e6a7bab5 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -11,10 +11,10 @@ ) from typing import ( List, - Union, Optional, Set, Tuple, + Union, ) import numpy as np @@ -141,7 +141,7 @@ def execute( # link input files for ii in input_files: iname = ii.name - #Path(iname).symlink_to(ii) + # Path(iname).symlink_to(ii) try: Path(iname).symlink_to(ii) except: @@ -153,7 +153,7 @@ def execute( ext = os.path.splitext(mm)[-1] if ext == ".pb": mname = model_name_pattern % (idx) - #Path(mname).symlink_to(mm) + # Path(mname).symlink_to(mm) try: Path(mname).symlink_to(mm) except: @@ -208,13 +208,12 @@ def execute( merge_pimd_files() - 
traj_files = glob.glob("*_%s"%lmp_traj_name) + traj_files = glob.glob("*_%s" % lmp_traj_name) if len(traj_files) > 1: calc_model_devi(traj_files, lmp_model_devi_name) - ret_dict = { - "log": work_dir / ("%d_%s"%(0, lmp_log_name)), + "log": work_dir / ("%d_%s" % (0, lmp_log_name)), "traj": work_dir / ("%d_%s" % (0, lmp_traj_name)), "model_devi": self.get_model_devi(work_dir / lmp_model_devi_name), } @@ -243,16 +242,16 @@ def set_lmp_models(lmp_input_name: str, model_names: List[str]): if idx is None: return new_line_split = lmp_input_lines[idx].split() - match_idx = find_only_one_key(new_line_split, ['model.pb'], raise_not_found=False) + match_idx = find_only_one_key(new_line_split, ["model.pb"], raise_not_found=False) if match_idx is None: raise RuntimeError(f"last matching index should not be -1, terribly wrong ") - + for ii, model_name in enumerate(model_names): new_line_split[match_idx] = model_name - + lmp_input_lines[idx] = " ".join(new_line_split) + "\n" - with open("%d_%s"%(ii,lmp_input_name), "w", encoding="utf8") as f: + with open("%d_%s" % (ii, lmp_input_name), "w", encoding="utf8") as f: f.write("".join(lmp_input_lines)) @@ -275,8 +274,8 @@ def calc_model_devi( traj_files, fname="model_devi.out", ): - - from ase.io import read # type: ignore + from ase.io import read # type: ignore + trajectories = [] for f in traj_files: traj = read(f, format="lammps-dump-text", index=":", order=True) @@ -321,6 +320,7 @@ def calc_model_devi( devi = np.array(devi) write_model_devi_out(devi, fname=fname) + def write_model_devi_out(devi: np.ndarray, fname: Union[str, Path], header: str = ""): assert devi.shape[1] == 8 header = "%s\n%10s" % (header, "step") @@ -338,4 +338,4 @@ def write_model_devi_out(devi: np.ndarray, fname: Union[str, Path], header: str delimiter="", header=header, ) - return devi \ No newline at end of file + return devi diff --git a/dpgen2/op/run_nvnmd_train.py b/dpgen2/op/run_nvnmd_train.py index 2c0300f1..fa00ff74 100644 --- 
a/dpgen2/op/run_nvnmd_train.py +++ b/dpgen2/op/run_nvnmd_train.py @@ -40,16 +40,16 @@ train_script_name, train_task_pattern, ) +from dpgen2.op.run_dp_train import ( + RunDPTrain, + _expand_all_multi_sys_to_sys, +) from dpgen2.utils.chdir import ( set_directory, ) from dpgen2.utils.run_command import ( run_command, ) -from dpgen2.op.run_dp_train import ( - RunDPTrain, - _expand_all_multi_sys_to_sys, -) def _make_train_command( @@ -190,9 +190,15 @@ def execute( task_path = ip["task_path"] init_model = ip["init_model"] init_frz_model = ip["init_model"] / "frozen_model.pb" if init_model else None - init_model_ckpt_data = ip["init_model"] / "model.ckpt.data-00000-of-00001" if init_model else None - init_model_ckpt_meta = ip["init_model"] / "model.ckpt.meta" if init_model else None - init_model_ckpt_index = ip["init_model"] / "model.ckpt.index" if init_model else None + init_model_ckpt_data = ( + ip["init_model"] / "model.ckpt.data-00000-of-00001" if init_model else None + ) + init_model_ckpt_meta = ( + ip["init_model"] / "model.ckpt.meta" if init_model else None + ) + init_model_ckpt_index = ( + ip["init_model"] / "model.ckpt.index" if init_model else None + ) init_data = ip["init_data"] iter_data = ip["iter_data"] valid_data = ip["valid_data"] @@ -242,11 +248,7 @@ def execute( valid_data, ) train_cnn_dict = RunDPTrain.write_other_to_input_script( - train_dict, - config, - do_init_model, - major_version, - False + train_dict, config, do_init_model, major_version, False ) train_qnn_dict = RunDPTrain.write_other_to_input_script( train_dict, @@ -264,10 +266,10 @@ def clean_before_quit(): fplog.close() # dump train script - + with open(train_script_name, "w") as fp: json.dump(train_cnn_dict, fp, indent=4) - + with open(train_cnn_script_name, "w") as fp: json.dump(train_cnn_dict, fp, indent=4) @@ -321,11 +323,10 @@ def clean_before_quit(): model_ckpt_index_file = "nvnmd_cnn/model.ckpt.index" model_ckpt_meta_file = "nvnmd_cnn/model.ckpt.meta" lcurve_file = 
"nvnmd_cnn/lcurve.out" - + if os.path.exists("input_v2_compat.json"): shutil.copy2("input_v2_compat.json", train_script_name) - - + else: cnn_model_file = init_model model_ckpt_data_file = "" @@ -367,7 +368,7 @@ def clean_before_quit(): qnn_model_file = "nvnmd_qnn/model.pb" clean_before_quit() - + # copy all models files to the output directory os.makedirs("nvnmd_models", exist_ok=True) if os.path.exists(cnn_model_file): @@ -380,7 +381,7 @@ def clean_before_quit(): shutil.copy(model_ckpt_data_file, "nvnmd_models") if os.path.exists(model_ckpt_index_file): shutil.copy(model_ckpt_index_file, "nvnmd_models") - + model_files = "nvnmd_models" return OPIO( diff --git a/dpgen2/superop/block.py b/dpgen2/superop/block.py index 922e3a5e..0e39ab38 100644 --- a/dpgen2/superop/block.py +++ b/dpgen2/superop/block.py @@ -65,7 +65,6 @@ PrepRunLmp, ) - block_default_optional_parameter = { "data_mixed_type": False, "finetune_mode": "no", diff --git a/tests/mocked_ops.py b/tests/mocked_ops.py index 12527cff..87bacdcd 100644 --- a/tests/mocked_ops.py +++ b/tests/mocked_ops.py @@ -127,11 +127,16 @@ def make_mocked_init_nvnmd_models(numb_models): for ii in range(numb_models): nvnmd_models_dir = Path(nvnmd_model_name_pattern % ii) nvnmd_models_dir.mkdir(exist_ok=True, parents=True) - for jj in ("frozen_model.pb", "model.ckpt.meta", "model.ckpt.data", "model.ckpt.index"): + for jj in ( + "frozen_model.pb", + "model.ckpt.meta", + "model.ckpt.data", + "model.ckpt.index", + ): ff = nvnmd_models_dir / jj ff.write_text(f"This is init {jj} {ii}") tmp_models.append(nvnmd_models_dir) - return tmp_models + return tmp_models def make_mocked_init_data(): @@ -423,11 +428,11 @@ def execute( assert re.match("task.[0-9][0-9][0-9][0-9]", ip["task_name"]) task_id = int(ip["task_name"].split(".")[1]) assert ip["task_name"] in str(ip["task_path"]) - init_frz_model = ip["init_model"] / "frozen_model.pb" - init_model_ckpt_data = ip["init_model"] / "model.ckpt.data" - init_model_ckpt_meta = ip["init_model"] / 
"model.ckpt.meta" - init_model_ckpt_index = ip["init_model"] / "model.ckpt.index" - + init_frz_model = ip["init_model"] / "frozen_model.pb" + init_model_ckpt_data = ip["init_model"] / "model.ckpt.data" + init_model_ckpt_meta = ip["init_model"] / "model.ckpt.meta" + init_model_ckpt_index = ip["init_model"] / "model.ckpt.index" + assert ".pb" in str(init_frz_model) assert "ckpt.meta" in str(init_model_ckpt_meta) assert "ckpt.data" in str(init_model_ckpt_data) @@ -499,7 +504,6 @@ def execute( with log.open("a") as f: f.write(f"script {str(script)} OK\n") - cnn_dir.mkdir(exist_ok=True, parents=True) with cnn_model.open("w") as f: f.write("read from init model: \n") @@ -513,7 +517,7 @@ def execute( with model_ckpt_index_file.open("w") as f: f.write("read from init model: \n") f.write(init_model_ckpt_index.read_text() + "\n") - + qnn_dir.mkdir(exist_ok=True, parents=True) with qnn_model.open("w") as f: f.write("read from init model: \n") @@ -529,7 +533,7 @@ def execute( shutil.copy(model_ckpt_meta_file, "nvnmd_models") shutil.copy(model_ckpt_data_file, "nvnmd_models") shutil.copy(model_ckpt_index_file, "nvnmd_models") - + os.chdir(cwd) return OPIO( @@ -643,7 +647,7 @@ def execute( f.write("read from init model ckpt: \n") with model_ckpt_index_file.open("w") as f: f.write("read from init model ckpt: \n") - + qnn_dir.mkdir(exist_ok=True, parents=True) with qnn_model.open("w") as f: f.write("read from init model: \n") @@ -658,7 +662,7 @@ def execute( shutil.copy(model_ckpt_meta_file, "nvnmd_models") shutil.copy(model_ckpt_data_file, "nvnmd_models") shutil.copy(model_ckpt_index_file, "nvnmd_models") - + os.chdir(cwd) return OPIO( diff --git a/tests/op/test_prep_dp_train.py b/tests/op/test_prep_dp_train.py index 5ac15f41..427a8332 100644 --- a/tests/op/test_prep_dp_train.py +++ b/tests/op/test_prep_dp_train.py @@ -231,7 +231,7 @@ def test_template_raise_wrong_list_length(self): template_script_hybrid, template_script_se_e2_a, template_script_nvnmd_v1, - 
template_script_nvnmd_v0 + template_script_nvnmd_v0, ], "numb_models": self.numb_models, } diff --git a/tests/op/test_run_dp_train.py b/tests/op/test_run_dp_train.py index d7386a3c..a64c0878 100644 --- a/tests/op/test_run_dp_train.py +++ b/tests/op/test_run_dp_train.py @@ -198,7 +198,7 @@ def setUp(self): "batch_size": "auto", "auto_prob_style": "prob_sys_size", "disp_file": "lcurve.out", - "save_ckpt": "model.ckpt" + "save_ckpt": "model.ckpt", }, "learning_rate": { "start_lr": 1.0, diff --git a/tests/op/test_run_nvnmd.py b/tests/op/test_run_nvnmd.py index 8b26d42b..e09bb03a 100644 --- a/tests/op/test_run_nvnmd.py +++ b/tests/op/test_run_nvnmd.py @@ -33,10 +33,7 @@ lmp_traj_name, model_name_pattern, ) -from dpgen2.op.run_lmp import ( - get_ele_temp, - set_models -) +from dpgen2.op.run_lmp import get_ele_temp, set_models from dpgen2.op.run_nvnmd import ( RunNvNMD, merge_pimd_files, @@ -87,8 +84,8 @@ def test_success(self, mocked_run): ) work_dir = Path(self.task_name) # check output - self.assertEqual(out["log"], work_dir / ("0_%s"%lmp_log_name)) - self.assertEqual(out["traj"], work_dir / ("0_%s"%lmp_traj_name)) + self.assertEqual(out["log"], work_dir / ("0_%s" % lmp_log_name)) + self.assertEqual(out["traj"], work_dir / ("0_%s" % lmp_traj_name)) self.assertEqual(out["model_devi"], work_dir / lmp_model_devi_name) # check call models = ["models/path/model_%d.pb" % i for i in range(len(self.models))] @@ -104,12 +101,12 @@ def test_success(self, mocked_run): "%d_%s" % (ii, lmp_log_name), "-v", "rerun", - "%d" % ii + "%d" % ii, ] ), shell=True, - ) - for ii in range(len(models)) + ) + for ii in range(len(models)) ] mocked_run.assert_has_calls(calls) # check input files are correctly linked @@ -148,12 +145,12 @@ def test_error(self, mocked_run): "%d_%s" % (ii, lmp_log_name), "-v", "rerun", - "%d" % ii + "%d" % ii, ] ), shell=True, - ) - for ii in range(1) + ) + for ii in range(1) ] mocked_run.assert_has_calls(calls) @@ -194,4 +191,4 @@ def test_failed_no_matching(self): 
input_name = Path("lmp.input") input_name.write_text(lmp_config) with self.assertRaises(RuntimeError) as re: - set_models(input_name, self.model_names) \ No newline at end of file + set_models(input_name, self.model_names) diff --git a/tests/op/test_run_nvnmd_train.py b/tests/op/test_run_nvnmd_train.py index 7a57cad8..729bea0f 100644 --- a/tests/op/test_run_nvnmd_train.py +++ b/tests/op/test_run_nvnmd_train.py @@ -81,10 +81,10 @@ def setUp(self): self.init_data = [Path("init/data-0"), Path("init/data-1")] self.init_data = sorted(list(self.init_data)) - #self.init_model = Path("bar.pb") - #self.init_model_ckpt_meta = Path("model.ckpt.meta") - #self.init_model_ckpt_data = Path("model.ckpt.data") - #self.init_model_ckpt_index = Path("model.ckpt.index") + # self.init_model = Path("bar.pb") + # self.init_model_ckpt_meta = Path("model.ckpt.meta") + # self.init_model_ckpt_data = Path("model.ckpt.data") + # self.init_model_ckpt_index = Path("model.ckpt.index") self.init_model = Path("nvnmd_models") self.config = { @@ -203,7 +203,7 @@ def setUp(self): "start_pref_e": 0.1, "start_pref_f": 100, "start_pref_v": 0.0, - }, + }, } self.idict_v1 = { @@ -320,7 +320,6 @@ def test_normalize_config(self): self.assertAlmostEqual(config["init_model_start_pref_f"], 100) self.assertAlmostEqual(config["init_model_start_pref_v"], 0.0) - def test_update_input_dict_v1_init_model(self): odict = RunDPTrain.write_data_to_input_script( self.idict_v1, @@ -422,28 +421,25 @@ def test_exec_v1(self, mocked_run): } ) ) + self.assertEqual(out["script"], work_dir / train_script_name) self.assertEqual( - out["script"], - work_dir / train_script_name - ) - self.assertEqual( - out["model"] / "frozen_model.pb", + out["model"] / "frozen_model.pb", work_dir / "nvnmd_models/frozen_model.pb", ) self.assertEqual( - out["model"] / "model.pb", + out["model"] / "model.pb", work_dir / "nvnmd_models/model.pb", ) self.assertEqual( - out["model"] / "model.ckpt.data-00000-of-00001", + out["model"] / 
"model.ckpt.data-00000-of-00001", work_dir / "nvnmd_models/model.ckpt.data-00000-of-00001", ) self.assertEqual( - out["model"] / "model.ckpt.meta", + out["model"] / "model.ckpt.meta", work_dir / "nvnmd_models/model.ckpt.meta", ) self.assertEqual( - out["model"] / "model.ckpt.index", + out["model"] / "model.ckpt.index", work_dir / "nvnmd_models/model.ckpt.index", ) self.assertEqual( @@ -503,28 +499,25 @@ def test_exec_v2(self, mocked_run): } ) ) + self.assertEqual(out["script"], work_dir / train_script_name) self.assertEqual( - out["script"], - work_dir / train_script_name - ) - self.assertEqual( - out["model"] / "frozen_model.pb", + out["model"] / "frozen_model.pb", work_dir / "nvnmd_models/frozen_model.pb", ) self.assertEqual( - out["model"] / "model.pb", + out["model"] / "model.pb", work_dir / "nvnmd_models/model.pb", ) self.assertEqual( - out["model"] / "model.ckpt.data-00000-of-00001", + out["model"] / "model.ckpt.data-00000-of-00001", work_dir / "nvnmd_models/model.ckpt.data-00000-of-00001", ) self.assertEqual( - out["model"] / "model.ckpt.meta", + out["model"] / "model.ckpt.meta", work_dir / "nvnmd_models/model.ckpt.meta", ) self.assertEqual( - out["model"] / "model.ckpt.index", + out["model"] / "model.ckpt.index", work_dir / "nvnmd_models/model.ckpt.index", ) self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") @@ -578,28 +571,25 @@ def test_exec_v2_init_model(self, mocked_run): } ) ) + self.assertEqual(out["script"], work_dir / train_script_name) self.assertEqual( - out["script"], - work_dir / train_script_name - ) - self.assertEqual( - out["model"] / "frozen_model.pb", + out["model"] / "frozen_model.pb", work_dir / "nvnmd_models/frozen_model.pb", ) self.assertEqual( - out["model"] / "model.pb", + out["model"] / "model.pb", work_dir / "nvnmd_models/model.pb", ) self.assertEqual( - out["model"] / "model.ckpt.data-00000-of-00001", + out["model"] / "model.ckpt.data-00000-of-00001", work_dir / "nvnmd_models/model.ckpt.data-00000-of-00001", ) 
self.assertEqual( - out["model"] / "model.ckpt.meta", + out["model"] / "model.ckpt.meta", work_dir / "nvnmd_models/model.ckpt.meta", ) self.assertEqual( - out["model"] / "model.ckpt.index", + out["model"] / "model.ckpt.index", work_dir / "nvnmd_models/model.ckpt.index", ) self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") @@ -798,28 +788,25 @@ def test_exec_v2_empty_dir(self, mocked_run): } ) ) + self.assertEqual(out["script"], work_dir / train_script_name) self.assertEqual( - out["script"], - work_dir / train_script_name - ) - self.assertEqual( - out["model"] / "frozen_model.pb", + out["model"] / "frozen_model.pb", work_dir / "nvnmd_models/frozen_model.pb", ) self.assertEqual( - out["model"] / "model.pb", + out["model"] / "model.pb", work_dir / "nvnmd_models/model.pb", ) self.assertEqual( - out["model"] / "model.ckpt.data-00000-of-00001", + out["model"] / "model.ckpt.data-00000-of-00001", work_dir / "nvnmd_models/model.ckpt.data-00000-of-00001", ) self.assertEqual( - out["model"] / "model.ckpt.meta", + out["model"] / "model.ckpt.meta", work_dir / "nvnmd_models/model.ckpt.meta", ) self.assertEqual( - out["model"] / "model.ckpt.index", + out["model"] / "model.ckpt.index", work_dir / "nvnmd_models/model.ckpt.index", ) self.assertEqual(out["lcurve"], work_dir / "nvnmd_cnn/lcurve.out") diff --git a/tests/test_prep_run_dp_train.py b/tests/test_prep_run_dp_train.py index 808d570a..e4d3a28a 100644 --- a/tests/test_prep_run_dp_train.py +++ b/tests/test_prep_run_dp_train.py @@ -199,14 +199,25 @@ def check_run_train_nvnmd_output( iter_data, only_check_name=only_check_name, ) - _check_model(tcase, "nvnmd_models/frozen_model.pb", cwd, init_model / "frozen_model.pb") - _check_model(tcase, "nvnmd_models/model.pb", cwd , init_model / "frozen_model.pb") - _check_model(tcase, "nvnmd_models/model.ckpt.meta", cwd, init_model / "model.ckpt.meta") - _check_model(tcase, "nvnmd_models/model.ckpt.data-00000-of-00001", cwd, init_model / "model.ckpt.data") - 
_check_model(tcase, "nvnmd_models/model.ckpt.index", cwd, init_model / "model.ckpt.index") + _check_model( + tcase, "nvnmd_models/frozen_model.pb", cwd, init_model / "frozen_model.pb" + ) + _check_model(tcase, "nvnmd_models/model.pb", cwd, init_model / "frozen_model.pb") + _check_model( + tcase, "nvnmd_models/model.ckpt.meta", cwd, init_model / "model.ckpt.meta" + ) + _check_model( + tcase, + "nvnmd_models/model.ckpt.data-00000-of-00001", + cwd, + init_model / "model.ckpt.data", + ) + _check_model( + tcase, "nvnmd_models/model.ckpt.index", cwd, init_model / "model.ckpt.index" + ) _check_lcurve(tcase, "nvnmd_cnn/lcurve.out", cwd, script) os.chdir(cwd) - + class TestMockedPrepDPTrain(unittest.TestCase): def setUp(self): @@ -342,7 +353,7 @@ def tearDown(self): for ii in self.init_models: if Path(ii).exists(): shutil.rmtree(ii) - + def test(self): for ii in range(3): run = MockedRunNvNMDTrain() @@ -359,13 +370,31 @@ def test(self): op = run.execute(ip) self.assertEqual(op["script"], Path(train_task_pattern % ii) / "input.json") self.assertTrue(op["script"].is_file()) - self.assertEqual(op["model"] / "frozen_model.pb", Path(train_task_pattern % ii) / "nvnmd_models/frozen_model.pb") - self.assertEqual(op["model"] / "model.pb", Path(train_task_pattern % ii) / "nvnmd_models/model.pb") - self.assertEqual(op["model"] / "model.ckpt.meta", Path(train_task_pattern % ii) / "nvnmd_models/model.ckpt.meta") - self.assertEqual(op["model"] / "model.ckpt.data-00000-of-00001", Path(train_task_pattern % ii) / "nvnmd_models/model.ckpt.data-00000-of-00001") - self.assertEqual(op["model"] / "model.ckpt.index", Path(train_task_pattern % ii) / "nvnmd_models/model.ckpt.index") + self.assertEqual( + op["model"] / "frozen_model.pb", + Path(train_task_pattern % ii) / "nvnmd_models/frozen_model.pb", + ) + self.assertEqual( + op["model"] / "model.pb", + Path(train_task_pattern % ii) / "nvnmd_models/model.pb", + ) + self.assertEqual( + op["model"] / "model.ckpt.meta", + Path(train_task_pattern % 
ii) / "nvnmd_models/model.ckpt.meta", + ) + self.assertEqual( + op["model"] / "model.ckpt.data-00000-of-00001", + Path(train_task_pattern % ii) + / "nvnmd_models/model.ckpt.data-00000-of-00001", + ) + self.assertEqual( + op["model"] / "model.ckpt.index", + Path(train_task_pattern % ii) / "nvnmd_models/model.ckpt.index", + ) self.assertEqual(op["log"], Path(train_task_pattern % ii) / "log") - self.assertEqual(op["lcurve"], Path(train_task_pattern % ii) / "nvnmd_cnn/lcurve.out") + self.assertEqual( + op["lcurve"], Path(train_task_pattern % ii) / "nvnmd_cnn/lcurve.out" + ) check_run_train_nvnmd_output( self, self.task_names[ii], @@ -373,7 +402,7 @@ def test(self): self.init_models[ii], self.init_data, self.iter_data, - ) + ) @unittest.skipIf(skip_ut_with_dflow, skip_ut_with_dflow_reason) diff --git a/tests/test_prep_run_lmp.py b/tests/test_prep_run_lmp.py index 7923f025..db3ae06c 100644 --- a/tests/test_prep_run_lmp.py +++ b/tests/test_prep_run_lmp.py @@ -290,7 +290,6 @@ def test(self): self.check_run_lmp_output(self.task_list_str[ii], self.model_list) - # @unittest.skip("temp") @unittest.skipIf(skip_ut_with_dflow, skip_ut_with_dflow_reason) class TestPrepRunLmp(unittest.TestCase): From 2a25efe2db83d53570fa86faf8926677e03cd1b3 Mon Sep 17 00:00:00 2001 From: jiongwalai Date: Sat, 28 Jun 2025 10:14:34 +0800 Subject: [PATCH 46/49] remove prep_run_nvnmd_train superop && accept suggestion --- dpgen2/entrypoint/args.py | 5 +---- dpgen2/op/run_nvnmd.py | 3 +-- tests/test_prep_run_dp_train.py | 1 - 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/dpgen2/entrypoint/args.py b/dpgen2/entrypoint/args.py index 70492ab9..7d24924c 100644 --- a/dpgen2/entrypoint/args.py +++ b/dpgen2/entrypoint/args.py @@ -31,9 +31,6 @@ from dpgen2.op.run_lmp import ( RunLmp, ) -from dpgen2.op.run_nvnmd_train import ( - RunNvNMDTrain, -) from dpgen2.utils import ( normalize_step_dict, step_conf_args, @@ -145,7 +142,7 @@ def nvnmd_train_args(): RunDPTrain.training_args(), optional=True, 
default=RunDPTrain.normalize_config({}), - doc=doc_numb_models, + doc=doc_config, ), Argument("numb_models", int, optional=True, default=4, doc=doc_numb_models), Argument( diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index e6a7bab5..00170501 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -126,7 +126,6 @@ def execute( config = ip["config"] if ip["config"] is not None else {} config = RunLmp.normalize_config(config) command = config["command"] - teacher_model: Optional[BinaryFileInput] = config["teacher_model_path"] shuffle_models: Optional[bool] = config["shuffle_models"] task_name = ip["task_name"] task_path = ip["task_path"] @@ -283,7 +282,7 @@ def calc_model_devi( num_frames = len(trajectories[0]) for traj in trajectories: - assert len(traj) == num_frames, "Not match" + assert len(traj) == num_frames, f"Trajectory length mismatch: expected {num_frames}, got {len(traj)} frames" devi = [] for frame_idx in range(num_frames): diff --git a/tests/test_prep_run_dp_train.py b/tests/test_prep_run_dp_train.py index e4d3a28a..414839a1 100644 --- a/tests/test_prep_run_dp_train.py +++ b/tests/test_prep_run_dp_train.py @@ -54,7 +54,6 @@ MockedRunDPTrain, MockedRunDPTrainNoneInitModel, MockedRunNvNMDTrain, - MockedRunNvNMDTrainCheckOptParam, MockedRunNvNMDTrainNoneInitModel, make_mocked_init_data, make_mocked_init_models, From 3c8982ce77194c4a3bba977ac8abbabd1e6e2326 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 28 Jun 2025 02:20:58 +0000 Subject: [PATCH 47/49] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpgen2/op/run_nvnmd.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index 00170501..7f4e0cc3 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -282,7 +282,9 @@ def calc_model_devi( num_frames = len(trajectories[0]) for 
traj in trajectories: - assert len(traj) == num_frames, f"Trajectory length mismatch: expected {num_frames}, got {len(traj)} frames" + assert ( + len(traj) == num_frames + ), f"Trajectory length mismatch: expected {num_frames}, got {len(traj)} frames" devi = [] for frame_idx in range(num_frames): From aa2e96b75a343d9040476bf0dae2f0374a6b9563 Mon Sep 17 00:00:00 2001 From: jiongwalai Date: Sat, 28 Jun 2025 11:06:33 +0800 Subject: [PATCH 48/49] add _check_nvnmd_model_files function and remove debug info --- dpgen2/op/run_nvnmd.py | 20 ++------------------ tests/test_prep_run_dp_train.py | 29 ++++++++++++----------------- 2 files changed, 14 insertions(+), 35 deletions(-) diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index 7f4e0cc3..0aa3af9f 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -1,57 +1,41 @@ import glob import itertools -import json import logging import os import random -import re -import shutil from pathlib import ( Path, ) from typing import ( List, Optional, - Set, - Tuple, Union, ) import numpy as np -from dargs import ( - Argument, - ArgumentEncoder, - Variant, - dargs, -) + from dflow.python import ( OP, OPIO, Artifact, BigParameter, - FatalError, - HDF5Datasets, OPIOSign, TransientError, ) from dpgen2.constants import ( - lmp_conf_name, lmp_input_name, lmp_log_name, lmp_model_devi_name, lmp_traj_name, - model_name_match_pattern, model_name_pattern, plm_output_name, - pytorch_model_name_pattern, ) from dpgen2.op.run_lmp import ( RunLmp, find_only_one_key, ) from dpgen2.utils import ( - BinaryFileInput, set_directory, ) from dpgen2.utils.run_command import ( @@ -243,7 +227,7 @@ def set_lmp_models(lmp_input_name: str, model_names: List[str]): new_line_split = lmp_input_lines[idx].split() match_idx = find_only_one_key(new_line_split, ["model.pb"], raise_not_found=False) if match_idx is None: - raise RuntimeError(f"last matching index should not be -1, terribly wrong ") + raise RuntimeError("last matching index should 
not be -1, terribly wrong ") for ii, model_name in enumerate(model_names): new_line_split[match_idx] = model_name diff --git a/tests/test_prep_run_dp_train.py b/tests/test_prep_run_dp_train.py index 414839a1..876bd09b 100644 --- a/tests/test_prep_run_dp_train.py +++ b/tests/test_prep_run_dp_train.py @@ -134,6 +134,17 @@ def _check_model( for ii in range(len(mlines)): tcase.assertEqual(flines[ii + 1], mlines[ii]) +def _check_nvnmd_model_files(tcase, cwd, init_model): + """Helper to check all nvnmd model files.""" + model_checks = [ + ("nvnmd_models/frozen_model.pb", init_model / "frozen_model.pb"), + ("nvnmd_models/model.pb", init_model / "frozen_model.pb"), + ("nvnmd_models/model.ckpt.meta", init_model / "model.ckpt.meta"), + ("nvnmd_models/model.ckpt.data-00000-of-00001", init_model / "model.ckpt.data"), + ("nvnmd_models/model.ckpt.index", init_model / "model.ckpt.index"), + ] + for output_file, expected_file in model_checks: + _check_model(tcase, output_file, cwd, expected_file) def _check_lcurve( tcase, @@ -147,7 +158,6 @@ def _check_lcurve( mlines = fp.read().strip().split("\n") tcase.assertEqual(flines[0], "read from train_script: ") for ii in range(len(mlines)): - print(flines[ii + 1], mlines[ii]) tcase.assertEqual(flines[ii + 1], mlines[ii]) @@ -198,22 +208,7 @@ def check_run_train_nvnmd_output( iter_data, only_check_name=only_check_name, ) - _check_model( - tcase, "nvnmd_models/frozen_model.pb", cwd, init_model / "frozen_model.pb" - ) - _check_model(tcase, "nvnmd_models/model.pb", cwd, init_model / "frozen_model.pb") - _check_model( - tcase, "nvnmd_models/model.ckpt.meta", cwd, init_model / "model.ckpt.meta" - ) - _check_model( - tcase, - "nvnmd_models/model.ckpt.data-00000-of-00001", - cwd, - init_model / "model.ckpt.data", - ) - _check_model( - tcase, "nvnmd_models/model.ckpt.index", cwd, init_model / "model.ckpt.index" - ) + _check_nvnmd_model_files(tcase, cwd, init_model) _check_lcurve(tcase, "nvnmd_cnn/lcurve.out", cwd, script) os.chdir(cwd) From 
c50b4119740cdee883829ddec4ad0972f6872791 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 28 Jun 2025 03:12:44 +0000 Subject: [PATCH 49/49] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpgen2/op/run_nvnmd.py | 1 - tests/test_prep_run_dp_train.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dpgen2/op/run_nvnmd.py b/dpgen2/op/run_nvnmd.py index 0aa3af9f..4d899088 100644 --- a/dpgen2/op/run_nvnmd.py +++ b/dpgen2/op/run_nvnmd.py @@ -13,7 +13,6 @@ ) import numpy as np - from dflow.python import ( OP, OPIO, diff --git a/tests/test_prep_run_dp_train.py b/tests/test_prep_run_dp_train.py index 876bd09b..b137070b 100644 --- a/tests/test_prep_run_dp_train.py +++ b/tests/test_prep_run_dp_train.py @@ -134,6 +134,7 @@ def _check_model( for ii in range(len(mlines)): tcase.assertEqual(flines[ii + 1], mlines[ii]) + def _check_nvnmd_model_files(tcase, cwd, init_model): """Helper to check all nvnmd model files.""" model_checks = [ @@ -146,6 +147,7 @@ def _check_nvnmd_model_files(tcase, cwd, init_model): for output_file, expected_file in model_checks: _check_model(tcase, output_file, cwd, expected_file) + def _check_lcurve( tcase, fname,