[feat] added get_model and get_models fct to mmcif (#145)

kierandidi · a-r-j · web-flow · commit 3e26557b46d7 · 2024-07-08T20:49:17.000+01:00
* [feat] added get_model and get_models fct to mmcif

* [docs] add CHANGELOG.md

* [feat] added tests for multiple models

* [feat] added tests for multiple models

* [feat] rename df to biopandas_structure

* bump changelog + changelog workflow

* Delete docs/sources/CHANGELOG.md

---------

Co-authored-by: Arian Jamasb <arjamasb@gmail.com>
diff --git a/.github/workflows/changelog-enforcer.yaml b/.github/workflows/changelog-enforcer.yaml
@@ -13,4 +13,5 @@ jobs:
       - uses: actions/checkout@v3
       - uses: dangoslen/changelog-enforcer@v3
         with:
-          skipLabels: 'skip-changelog'
+          skipLabels: 'skip-changelog'
+          changeLogPath: 'docs/CHANGELOG.md'
diff --git a/biopandas/mmcif/pandas_mmcif.py b/biopandas/mmcif/pandas_mmcif.py
@@ -6,9 +6,10 @@
 # License: BSD 3 clause
 # Project Website: http://rasbt.github.io/biopandas/
 # Code Repository: https://github.com/rasbt/biopandas
-
+from __future__ import annotations
 import gzip
 import sys
+import copy
 import warnings
 from typing import Dict, List, Optional
 from urllib.error import HTTPError, URLError
@@ -69,6 +70,66 @@ def read_mmcif(self, path):
         # self.header, self.code = self._parse_header_code() #TODO: implement
         self.code = self.data["entry"]["id"][0].lower()
         return self
+    
+    def label_models(self):
+        """Copies "pdbx_PDB_model_num" into a "model_id" column of the ATOM
+        and HETATM DataFrames (whichever are present) and returns self."""
+        for record in ("ATOM", "HETATM"):
+            if record in self.df:
+                records = self.df[record]
+                records["model_id"] = records["pdbx_PDB_model_num"]
+        return self
+    
+    def get_model(self, model_index: int) -> PandasMmcif:
+        """Returns a new PandasMmcif object with the dataframes subset to the
+        given model index.
+
+        Parameters
+        ----------
+        model_index : int
+            An integer representing the model index to subset to.
+
+        Returns
+        -------
+        PandasMmcif : A new PandasMmcif object containing the
+            structure subsetted to the given model.
+        """
+        # Filter a deep copy so the DataFrames held by `self` are untouched.
+        biopandas_structure = copy.deepcopy(self)
+        for record in ("ATOM", "HETATM"):
+            if record not in biopandas_structure.df:
+                continue
+            records = biopandas_structure.df[record]
+            mask = records["pdbx_PDB_model_num"] == model_index
+            biopandas_structure.df[record] = records.loc[mask]
+        return biopandas_structure
+
+    def get_models(self, model_indices: List[int]) -> PandasMmcif:
+        """Returns a new PandasMmcif object with the dataframes subset to the
+        given model indices.
+
+        Parameters
+        ----------
+        model_indices : List[int]
+            A list representing the model indexes to subset to.
+
+        Returns
+        -------
+        PandasMmcif : A new PandasMmcif object
+            containing the structure subsetted to the given models.
+        """
+
+        # Filter a deep copy so the DataFrames held by `self` are untouched.
+        biopandas_structure = copy.deepcopy(self)
+        for record in ("ATOM", "HETATM"):
+            if record not in biopandas_structure.df:
+                continue
+            records = biopandas_structure.df[record]
+            # Vectorised membership test; replaces a per-row Python `in` scan.
+            biopandas_structure.df[record] = records.loc[
+                records["pdbx_PDB_model_num"].isin(model_indices)
+            ]
+        return biopandas_structure
 
     def fetch_mmcif(
         self,
@@ -583,4 +644,4 @@ def convert_to_pandas_pdb(
                 pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset
             )
 
-        return pandaspdb
+        return pandaspdb
diff --git a/biopandas/mmcif/tests/data/2jyf.cif.gz b/biopandas/mmcif/tests/data/2jyf.cif.gz
diff --git a/biopandas/mmcif/tests/test_multiple_models.py b/biopandas/mmcif/tests/test_multiple_models.py
@@ -0,0 +1,30 @@
+# BioPandas
+# Author: Sebastian Raschka <mail@sebastianraschka.com>
+# Author: Arian Jamasb <arian@jamasb.io>
+# License: BSD 3 clause
+# Project Website: http://rasbt.github.io/biopandas/
+# Code Repository: https://github.com/rasbt/biopandas
+import os
+
+from biopandas.mmcif import PandasMmcif
+
+TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), "data", "2jyf.cif.gz")  # gzipped mmCIF fixture used by the multiple-model tests
+
+def test_label_models():
+    """label_models must add a "model_id" column to the ATOM DataFrame."""
+    labelled = PandasMmcif().read_mmcif(TESTDATA_FILENAME).label_models()
+    assert "model_id" in labelled.df["ATOM"].columns
+
+def test_get_model():
+    """Every ATOM row returned by get_model must belong to that model."""
+    subset = PandasMmcif().read_mmcif(TESTDATA_FILENAME).get_model(1)
+    model_nums = subset.df["ATOM"]["pdbx_PDB_model_num"]
+    assert not model_nums.empty and (model_nums == 1).all()
+
+
+def test_get_models():
+    """Every ATOM row returned by get_models must belong to a requested model."""
+    MODEL_INDICES = [1, 3, 5]
+    subset = PandasMmcif().read_mmcif(TESTDATA_FILENAME).get_models(MODEL_INDICES)
+    model_nums = subset.df["ATOM"]["pdbx_PDB_model_num"]
+    assert not model_nums.empty and model_nums.isin(MODEL_INDICES).all()
diff --git a/biopandas/mmtf/pandas_mmtf.py b/biopandas/mmtf/pandas_mmtf.py
@@ -438,21 +438,21 @@ def get_model(self, model_index: int) -> PandasMmtf:
             structure subsetted to the given model.
         """
 
-        df = copy.deepcopy(self)
+        biopandas_structure = copy.deepcopy(self)
 
-        if "ATOM" in df.df.keys():
-            df.df["ATOM"] = df.df["ATOM"].loc[
-                df.df["ATOM"]["model_id"] == model_index
+        if "ATOM" in biopandas_structure.df.keys():
+            biopandas_structure.df["ATOM"] = biopandas_structure.df["ATOM"].loc[
+                biopandas_structure.df["ATOM"]["model_id"] == model_index
             ]
-        if "HETATM" in df.df.keys():
-            df.df["HETATM"] = df.df["HETATM"].loc[
-                df.df["HETATM"]["model_id"] == model_index
+        if "HETATM" in biopandas_structure.df.keys():
+            biopandas_structure.df["HETATM"] = biopandas_structure.df["HETATM"].loc[
+                biopandas_structure.df["HETATM"]["model_id"] == model_index
             ]
-        if "ANISOU" in df.df.keys():
-            df.df["ANISOU"] = df.df["ANISOU"].loc[
-                df.df["ANISOU"]["model_id"] == model_index
+        if "ANISOU" in biopandas_structure.df.keys():
+            biopandas_structure.df["ANISOU"] = biopandas_structure.df["ANISOU"].loc[
+                biopandas_structure.df["ANISOU"]["model_id"] == model_index
             ]
-        return df
+        return biopandas_structure
 
     def get_models(self, model_indices: List[int]) -> PandasMmtf:
         """Returns a new PandasMmtf object with the dataframes subset to the
@@ -469,30 +469,30 @@ def get_models(self, model_indices: List[int]) -> PandasMmtf:
             containing the structure subsetted to the given model.
         """
 
-        df = copy.deepcopy(self)
+        biopandas_structure = copy.deepcopy(self)
 
-        if "ATOM" in df.df.keys():
-            df.df["ATOM"] = df.df["ATOM"].loc[
+        if "ATOM" in biopandas_structure.df.keys():
+            biopandas_structure.df["ATOM"] = biopandas_structure.df["ATOM"].loc[
                 [
                     x in model_indices
-                    for x in df.df["ATOM"]["model_id"].tolist()
+                    for x in biopandas_structure.df["ATOM"]["model_id"].tolist()
                 ]
             ]
-        if "HETATM" in df.df.keys():
-            df.df["HETATM"] = df.df["HETATM"].loc[
+        if "HETATM" in biopandas_structure.df.keys():
+            biopandas_structure.df["HETATM"] = biopandas_structure.df["HETATM"].loc[
                 [
                     x in model_indices
-                    for x in df.df["HETATM"]["model_id"].tolist()
+                    for x in biopandas_structure.df["HETATM"]["model_id"].tolist()
                 ]
             ]
-        if "ANISOU" in df.df.keys():
-            df.df["ANISOU"] = df.df["ANISOU"].loc[
+        if "ANISOU" in biopandas_structure.df.keys():
+            biopandas_structure.df["ANISOU"] = biopandas_structure.df["ANISOU"].loc[
                 [
                     x in model_indices
-                    for x in df.df["ANISOU"]["model_id"].tolist()
+                    for x in biopandas_structure.df["ANISOU"]["model_id"].tolist()
                 ]
             ]
-        return df
+        return biopandas_structure
 
 
 def fetch_mmtf(pdb_code: str) -> pd.DataFrame:
diff --git a/biopandas/pdb/pandas_pdb.py b/biopandas/pdb/pandas_pdb.py
@@ -843,20 +843,20 @@ def get_model(self, model_index: int) -> PandasPdb:
           structure subsetted to the given model.
         """
 
-        df = deepcopy(self)
-        df.label_models()
-
-        if "ATOM" in df.df.keys():
-            df.df["ATOM"] = df.df["ATOM"].loc[df.df["ATOM"]["model_id"] == model_index]
-        if "HETATM" in df.df.keys():
-            df.df["HETATM"] = df.df["HETATM"].loc[
-                df.df["HETATM"]["model_id"] == model_index
+        biopandas_structure = deepcopy(self)
+        biopandas_structure.label_models()
+
+        if "ATOM" in biopandas_structure.df.keys():
+            biopandas_structure.df["ATOM"] = biopandas_structure.df["ATOM"].loc[biopandas_structure.df["ATOM"]["model_id"] == model_index]
+        if "HETATM" in biopandas_structure.df.keys():
+            biopandas_structure.df["HETATM"] = biopandas_structure.df["HETATM"].loc[
+                biopandas_structure.df["HETATM"]["model_id"] == model_index
             ]
-        if "ANISOU" in df.df.keys():
-            df.df["ANISOU"] = df.df["ANISOU"].loc[
-                df.df["ANISOU"]["model_id"] == model_index
+        if "ANISOU" in biopandas_structure.df.keys():
+            biopandas_structure.df["ANISOU"] = biopandas_structure.df["ANISOU"].loc[
+                biopandas_structure.df["ANISOU"]["model_id"] == model_index
             ]
-        return df
+        return biopandas_structure
 
     def get_models(self, model_indices: List[int]) -> PandasPdb:
         """Returns a new PandasPDB object with the dataframes subset to the given model index.
@@ -872,22 +872,22 @@ def get_models(self, model_indices: List[int]) -> PandasPdb:
           containing the structure subsetted to the given model.
         """
 
-        df = deepcopy(self)
-        df.label_models()
+        biopandas_structure = deepcopy(self)
+        biopandas_structure.label_models()
 
-        if "ATOM" in df.df.keys():
-            df.df["ATOM"] = df.df["ATOM"].loc[
-                [x in model_indices for x in df.df["ATOM"]["model_id"].tolist()]
+        if "ATOM" in biopandas_structure.df.keys():
+            biopandas_structure.df["ATOM"] = biopandas_structure.df["ATOM"].loc[
+                [x in model_indices for x in biopandas_structure.df["ATOM"]["model_id"].tolist()]
             ]
-        if "HETATM" in df.df.keys():
-            df.df["HETATM"] = df.df["HETATM"].loc[
-                [x in model_indices for x in df.df["HETATM"]["model_id"].tolist()]
+        if "HETATM" in biopandas_structure.df.keys():
+            biopandas_structure.df["HETATM"] = biopandas_structure.df["HETATM"].loc[
+                [x in model_indices for x in biopandas_structure.df["HETATM"]["model_id"].tolist()]
             ]
-        if "ANISOU" in df.df.keys():
-            df.df["ANISOU"] = df.df["ANISOU"].loc[
-                [x in model_indices for x in df.df["ANISOU"]["model_id"].tolist()]
+        if "ANISOU" in biopandas_structure.df.keys():
+            biopandas_structure.df["ANISOU"] = biopandas_structure.df["ANISOU"].loc[
+                [x in model_indices for x in biopandas_structure.df["ANISOU"]["model_id"].tolist()]
             ]
-        return df
+        return biopandas_structure
 
     def to_pdb_stream(self, records: tuple[str] = ("ATOM", "HETATM")) -> StringIO:
         """Writes a PDB dataframe to a stream.
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -6,7 +6,8 @@ The CHANGELOG for the current development version is available at
 
 ### 0.5.1dev1 (UNRELEASED)
 
-- Dev: switched testing framework entirely to pytest. Drops nose dependency due to version conflicts with Python 3.12 (`nose`) and 3.8 (`nose`)
+- Feature: added methods to `PandasMmcif` that allow selecting by model IDs. (PR #[145](https://github.com/BioPandas/biopandas/pull/145))
+- Dev: switched testing framework entirely to pytest. Drops nose dependency due to version conflicts with Python 3.12 (`nose`) and 3.8 (`nose`). (PR #[146](https://github.com/BioPandas/biopandas/pull/146))
 
 
 ### 0.5.0dev1 (31/7/2023)