fix: standardize the deepmd/npy/mixed format (#425)

iProzd · pre-commit-ci[bot] · web-flow · commit 06c21b6068cd · 2023-02-25T14:28:40.000Z
This PR combines two commits:

1. Update the dpdata.MultiSystems() instance in place when the
from_deepmd_npy_mixed method is called.

Previously, dpdata.MultiSystems().from_deepmd_npy_mixed only returned the
loaded result and did not modify the instance it was called on. This
commit fixes that, making it consistent with the other from_* methods.
(Another bug is also fixed: data.copy() is a shallow copy, so without an
explicit .copy() of data["atom_names"] every system loaded from the same
folder shared one atom_names list, which could cause errors when manually
changing the type_map of a system. Unit tests are added in the next
commit.)


2. Allow multiple sets in mixed-type format;

The limits are now a maximum of 50000 frames per system and 2000 frames
per set. I chose 2000 rather than 5000 frames per set because I expect
sets filled to the maximum size to be much more common in the mixed-type
format than in the other formats; 2000 frames is enough for large batches
and is more memory-friendly.

Add unit tests for changing the type_map and for the mixed_type directory check.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/dpdata/deepmd/mixed.py b/dpdata/deepmd/mixed.py
@@ -54,60 +54,80 @@ def to_system_data(folder, type_map=None, labels=True):
     if os.path.isfile(os.path.join(folder, "nopbc")):
         data["nopbc"] = True
     sets = sorted(glob.glob(os.path.join(folder, "set.*")))
-    assert len(sets) == 1, "Mixed type must have only one set!"
-    cells, coords, eners, forces, virs, real_atom_types = _load_set(
-        sets[0], data.get("nopbc", False)
-    )
-    nframes = np.reshape(cells, [-1, 3, 3]).shape[0]
-    cells = np.reshape(cells, [nframes, 3, 3])
-    coords = np.reshape(coords, [nframes, -1, 3])
-    real_atom_types = np.reshape(real_atom_types, [nframes, -1])
-    natom = real_atom_types.shape[1]
-    if labels:
-        if eners is not None and eners.size > 0:
+    all_cells = []
+    all_coords = []
+    all_eners = []
+    all_forces = []
+    all_virs = []
+    all_real_atom_types = []
+    for ii in sets:
+        cells, coords, eners, forces, virs, real_atom_types = _load_set(
+            ii, data.get("nopbc", False)
+        )
+        nframes = np.reshape(cells, [-1, 3, 3]).shape[0]
+        all_cells.append(np.reshape(cells, [nframes, 3, 3]))
+        all_coords.append(np.reshape(coords, [nframes, -1, 3]))
+        all_real_atom_types.append(np.reshape(real_atom_types, [nframes, -1]))
+        if eners is not None:
             eners = np.reshape(eners, [nframes])
-        if forces is not None and forces.size > 0:
-            forces = np.reshape(forces, [nframes, -1, 3])
-        if virs is not None and virs.size > 0:
-            virs = np.reshape(virs, [nframes, 3, 3])
+        if labels:
+            if eners is not None and eners.size > 0:
+                all_eners.append(np.reshape(eners, [nframes]))
+            if forces is not None and forces.size > 0:
+                all_forces.append(np.reshape(forces, [nframes, -1, 3]))
+            if virs is not None and virs.size > 0:
+                all_virs.append(np.reshape(virs, [nframes, 3, 3]))
+    all_cells_concat = np.concatenate(all_cells, axis=0)
+    all_coords_concat = np.concatenate(all_coords, axis=0)
+    all_real_atom_types_concat = np.concatenate(all_real_atom_types, axis=0)
+    all_eners_concat = None
+    all_forces_concat = None
+    all_virs_concat = None
+    if len(all_eners) > 0:
+        all_eners_concat = np.concatenate(all_eners, axis=0)
+    if len(all_forces) > 0:
+        all_forces_concat = np.concatenate(all_forces, axis=0)
+    if len(all_virs) > 0:
+        all_virs_concat = np.concatenate(all_virs, axis=0)
     data_list = []
     while True:
-        if real_atom_types.size == 0:
+        if all_real_atom_types_concat.size == 0:
             break
         temp_atom_numbs = [
-            np.count_nonzero(real_atom_types[0] == i)
+            np.count_nonzero(all_real_atom_types_concat[0] == i)
             for i in range(len(data["atom_names"]))
         ]
         # temp_formula = formula(data['atom_names'], temp_atom_numbs)
-        temp_idx = np.arange(real_atom_types.shape[0])[
-            (real_atom_types == real_atom_types[0]).all(-1)
+        temp_idx = np.arange(all_real_atom_types_concat.shape[0])[
+            (all_real_atom_types_concat == all_real_atom_types_concat[0]).all(-1)
         ]
-        rest_idx = np.arange(real_atom_types.shape[0])[
-            (real_atom_types != real_atom_types[0]).any(-1)
+        rest_idx = np.arange(all_real_atom_types_concat.shape[0])[
+            (all_real_atom_types_concat != all_real_atom_types_concat[0]).any(-1)
         ]
         temp_data = data.copy()
+        temp_data["atom_names"] = data["atom_names"].copy()
         temp_data["atom_numbs"] = temp_atom_numbs
-        temp_data["atom_types"] = real_atom_types[0]
-        real_atom_types = real_atom_types[rest_idx]
-        temp_data["cells"] = cells[temp_idx]
-        cells = cells[rest_idx]
-        temp_data["coords"] = coords[temp_idx]
-        coords = coords[rest_idx]
+        temp_data["atom_types"] = all_real_atom_types_concat[0]
+        all_real_atom_types_concat = all_real_atom_types_concat[rest_idx]
+        temp_data["cells"] = all_cells_concat[temp_idx]
+        all_cells_concat = all_cells_concat[rest_idx]
+        temp_data["coords"] = all_coords_concat[temp_idx]
+        all_coords_concat = all_coords_concat[rest_idx]
         if labels:
-            if eners is not None and eners.size > 0:
-                temp_data["energies"] = eners[temp_idx]
-                eners = eners[rest_idx]
-            if forces is not None and forces.size > 0:
-                temp_data["forces"] = forces[temp_idx]
-                forces = forces[rest_idx]
-            if virs is not None and virs.size > 0:
-                temp_data["virials"] = virs[temp_idx]
-                virs = virs[rest_idx]
+            if all_eners_concat is not None and all_eners_concat.size > 0:
+                temp_data["energies"] = all_eners_concat[temp_idx]
+                all_eners_concat = all_eners_concat[rest_idx]
+            if all_forces_concat is not None and all_forces_concat.size > 0:
+                temp_data["forces"] = all_forces_concat[temp_idx]
+                all_forces_concat = all_forces_concat[rest_idx]
+            if all_virs_concat is not None and all_virs_concat.size > 0:
+                temp_data["virials"] = all_virs_concat[temp_idx]
+                all_virs_concat = all_virs_concat[rest_idx]
         data_list.append(temp_data)
     return data_list
 
 
-def dump(folder, data, comp_prec=np.float32, remove_sets=True):
+def dump(folder, data, set_size=2000, comp_prec=np.float32, remove_sets=True):
     os.makedirs(folder, exist_ok=True)
     sets = sorted(glob.glob(os.path.join(folder, "set.*")))
     if len(sets) > 0:
@@ -164,20 +184,29 @@ def dump(folder, data, comp_prec=np.float32, remove_sets=True):
             np.int64
         )
     # dump frame properties: cell, coord, energy, force and virial
-    set_folder = os.path.join(folder, "set.%03d" % 0)
-    os.makedirs(set_folder)
-    np.save(os.path.join(set_folder, "box"), cells)
-    np.save(os.path.join(set_folder, "coord"), coords)
-    if eners is not None:
-        np.save(os.path.join(set_folder, "energy"), eners)
-    if forces is not None:
-        np.save(os.path.join(set_folder, "force"), forces)
-    if virials is not None:
-        np.save(os.path.join(set_folder, "virial"), virials)
-    if real_atom_types is not None:
-        np.save(os.path.join(set_folder, "real_atom_types"), real_atom_types)
-    if "atom_pref" in data:
-        np.save(os.path.join(set_folder, "atom_pref"), atom_pref)
+    nsets = nframes // set_size
+    if set_size * nsets < nframes:
+        nsets += 1
+    for ii in range(nsets):
+        set_stt = ii * set_size
+        set_end = (ii + 1) * set_size
+        set_folder = os.path.join(folder, "set.%06d" % ii)
+        os.makedirs(set_folder)
+        np.save(os.path.join(set_folder, "box"), cells[set_stt:set_end])
+        np.save(os.path.join(set_folder, "coord"), coords[set_stt:set_end])
+        if eners is not None:
+            np.save(os.path.join(set_folder, "energy"), eners[set_stt:set_end])
+        if forces is not None:
+            np.save(os.path.join(set_folder, "force"), forces[set_stt:set_end])
+        if virials is not None:
+            np.save(os.path.join(set_folder, "virial"), virials[set_stt:set_end])
+        if real_atom_types is not None:
+            np.save(
+                os.path.join(set_folder, "real_atom_types"),
+                real_atom_types[set_stt:set_end],
+            )
+        if "atom_pref" in data:
+            np.save(os.path.join(set_folder, "atom_pref"), atom_pref[set_stt:set_end])
     try:
         os.remove(os.path.join(folder, "nopbc"))
     except OSError:
@@ -187,61 +216,43 @@ def dump(folder, data, comp_prec=np.float32, remove_sets=True):
             pass
 
 
-def mix_system(*system, type_map, split_num=200, **kwargs):
-    """Mix the systems into mixed_type ones
+def mix_system(*system, type_map, **kwargs):
+    """Mix the systems into mixed_type ones according to the unified given type_map.
 
     Parameters
     ----------
     *system : System
         The systems to mix
     type_map : list of str
         Maps atom type to name
-    split_num : int
-        Number of frames in each system
 
     Returns
     -------
     mixed_systems: dict
-        dict of mixed system with key '{atom_numbs}/sys.xxx'
+        dict of mixed system with key 'atom_numbs'
     """
     mixed_systems = {}
     temp_systems = {}
-    atom_numbs_sys_index = {}  # index of sys
     atom_numbs_frame_index = {}  # index of frames in cur sys
     for sys in system:
         tmp_sys = sys.copy()
         natom = tmp_sys.get_natoms()
         tmp_sys.convert_to_mixed_type(type_map=type_map)
-        if str(natom) not in atom_numbs_sys_index:
-            atom_numbs_sys_index[str(natom)] = 0
         if str(natom) not in atom_numbs_frame_index:
             atom_numbs_frame_index[str(natom)] = 0
         atom_numbs_frame_index[str(natom)] += tmp_sys.get_nframes()
         if str(natom) not in temp_systems or not temp_systems[str(natom)]:
             temp_systems[str(natom)] = tmp_sys
         else:
             temp_systems[str(natom)].append(tmp_sys)
-        if atom_numbs_frame_index[str(natom)] >= split_num:
-            while True:
-                sys_split, temp_systems[str(natom)], rest_num = split_system(
-                    temp_systems[str(natom)], split_num=split_num
-                )
-                sys_name = (
-                    f"{str(natom)}/sys." + "%.6d" % atom_numbs_sys_index[str(natom)]
-                )
-                mixed_systems[sys_name] = sys_split
-                atom_numbs_sys_index[str(natom)] += 1
-                if rest_num < split_num:
-                    atom_numbs_frame_index[str(natom)] = rest_num
-                    break
     for natom in temp_systems:
         if atom_numbs_frame_index[natom] > 0:
-            sys_name = f"{natom}/sys." + "%.6d" % atom_numbs_sys_index[natom]
+            sys_name = f"{natom}"
             mixed_systems[sys_name] = temp_systems[natom]
     return mixed_systems
 
 
-def split_system(sys, split_num=100):
+def split_system(sys, split_num=10000):
     rest = sys.get_nframes() - split_num
     if rest <= 0:
         return sys, None, 0
diff --git a/dpdata/format.py b/dpdata/format.py
@@ -132,7 +132,7 @@ def to_multi_systems(self, formulas, directory, **kwargs):
             "%s doesn't support MultiSystems.to" % (self.__class__.__name__)
         )
 
-    def mix_system(self, *system, type_map, split_num=200, **kwargs):
+    def mix_system(self, *system, type_map, **kwargs):
         """Mix the systems into mixed_type ones according to the unified given type_map.
 
         Parameters
@@ -141,13 +141,11 @@ def mix_system(self, *system, type_map, split_num=200, **kwargs):
             The systems to mix
         type_map : list of str
             Maps atom type to name
-        split_num : int
-            Number of frames in each system
 
         Returns
         -------
         mixed_systems: dict
-            dict of mixed system with key '{atom_numbs}/sys.xxx'
+            dict of mixed system with key 'atom_numbs'
         """
         raise NotImplementedError(
             "%s doesn't support System.from" % (self.__class__.__name__)
diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py
@@ -117,7 +117,7 @@ def from_labeled_system_mix(self, file_name, type_map=None, **kwargs):
             file_name, type_map=type_map, labels=True
         )
 
-    def mix_system(self, *system, type_map, split_num=200, **kwargs):
+    def mix_system(self, *system, type_map, **kwargs):
         """Mix the systems into mixed_type ones according to the unified given type_map.
 
         Parameters
@@ -126,49 +126,22 @@ def mix_system(self, *system, type_map, split_num=200, **kwargs):
             The systems to mix
         type_map : list of str
             Maps atom type to name
-        split_num : int
-            Number of frames in each system
 
         Returns
         -------
         mixed_systems: dict
-            dict of mixed system with key '{atom_numbs}/sys.xxx'
+            dict of mixed system with key 'atom_numbs'
         """
-        return dpdata.deepmd.mixed.mix_system(
-            *system, type_map=type_map, split_num=split_num, **kwargs
-        )
+        return dpdata.deepmd.mixed.mix_system(*system, type_map=type_map, **kwargs)
 
     def from_multi_systems(self, directory, **kwargs):
-        """MultiSystems.from
-
-        Parameters
-        ----------
-        directory : str
-            directory of system
-
-        Returns
-        -------
-        filenames: list[str]
-            list of filenames
-        """
-        if self.MultiMode == self.MultiModes.Directory:
-            level_1_dir = [
-                os.path.join(directory, name)
-                for name in os.listdir(directory)
-                if os.path.isdir(os.path.join(directory, name))
-                and os.path.isfile(os.path.join(directory, name, "type_map.raw"))
-            ]
-            level_2_dir = [
-                os.path.join(directory, name1, name2)
-                for name1 in os.listdir(directory)
-                for name2 in os.listdir(os.path.join(directory, name1))
-                if os.path.isdir(os.path.join(directory, name1))
-                and os.path.isdir(os.path.join(directory, name1, name2))
-                and os.path.isfile(
-                    os.path.join(directory, name1, name2, "type_map.raw")
-                )
-            ]
-            return level_1_dir + level_2_dir
+        sys_dir = []
+        for root, dirs, files in os.walk(directory):
+            if (
+                "type_map.raw" in files
+            ):  # mixed_type format systems must have type_map.raw
+                sys_dir.append(root)
+        return sys_dir
 
     MultiMode = Format.MultiModes.Directory
 
diff --git a/dpdata/system.py b/dpdata/system.py
@@ -1307,15 +1307,13 @@ def from_fmt_obj(self, fmtobj, directory, labeled=True, **kwargs):
                 if labeled:
                     data_list = fmtobj.from_labeled_system_mix(dd, **kwargs)
                     for data_item in data_list:
-                        system_list.append(LabeledSystem(data=data_item))
+                        system_list.append(LabeledSystem(data=data_item, **kwargs))
                 else:
                     data_list = fmtobj.from_system_mix(dd, **kwargs)
                     for data_item in data_list:
-                        system_list.append(System(data=data_item))
-            return self.__class__(
-                *system_list,
-                type_map=kwargs["type_map"] if "type_map" in kwargs else None,
-            )
+                        system_list.append(System(data=data_item, **kwargs))
+            self.append(*system_list)
+            return self
 
     def to_fmt_obj(self, fmtobj, directory, *args, **kwargs):
         if not isinstance(fmtobj, dpdata.plugins.deepmd.DeePMDMixedFormat):
diff --git a/tests/test_deepmd_mixed.py b/tests/test_deepmd_mixed.py