BioPandas · a-r-j · Dec 8, 2022 · Dec 8, 2022 · Dec 9, 2022 · Dec 9, 2022
diff --git a/.appveyor.yml b/.appveyor.yml
@@ -14,7 +14,7 @@ install:
   - conda config --set always_yes yes --set changeps1 no
   - conda update -q conda
   - conda info -a
-  - conda create -q -n test-environment --channel=conda-forge mmtf-python numpy scipy pandas nose python=%PYTHON_VERSION%
+  - conda create -q -n test-environment --channel=conda-forge mmtf-python numpy scipy requests pandas nose python=%PYTHON_VERSION%
   - activate test-environment
 
 test_script:

diff --git a/biopandas/mmcif/pandas_mmcif.py b/biopandas/mmcif/pandas_mmcif.py
@@ -16,6 +16,7 @@
 
 import numpy as np
 import pandas as pd
+import requests
 
 from ..pdb.engines import amino3to1dict
 from ..pdb.pandas_pdb import PandasPdb
@@ -63,66 +64,89 @@ def read_mmcif(self, path):
         self
 
         """
-        self.mmcif_path, self.pdb_text = self._read_mmcif(path=path)
-        self._df = self._construct_df(text=self.pdb_text)
+        self.mmcif_path, self.mmcif_text = self._read_mmcif(path=path)
+        self._df = self._construct_df(text=self.mmcif_text)
         # self.header, self.code = self._parse_header_code() #TODO: implement
         self.code = self.data["entry"]["id"][0].lower()
         return self
 
-    def fetch_mmcif(self, pdb_code: Optional[str] = None, uniprot_id: Optional[str] = None, source: str = "pdb"):
-        """Fetches mmCIF file contents from the Protein Databank at rcsb.org or AlphaFold database at https://alphafold.ebi.ac.uk/.
-.
+    def fetch_mmcif(
+        self,
+        pdb_code: Optional[str] = None,
+        uniprot_id: Optional[str] = None,
+        sequence: Optional[str] = None,
+        source: str = "pdb",
+    ):
+        """
+                Fetches mmCIF file contents from the Protein Databank at rcsb.org,
+                AlphaFold database at https://alphafold.ebi.ac.uk/ or ESMFold database.
+        .
 
-        Parameters
-        ----------
-        pdb_code : str, optional
-            A 4-letter PDB code, e.g., `"3eiy"` to retrieve structures from the PDB. Defaults to `None`.
+                Parameters
+                ----------
+                pdb_code : str, optional
+                    A 4-letter PDB code, e.g., `"3eiy"` to retrieve structures from the
+                    PDB. Defaults to `None`.
 
-        uniprot_id : str, optional
-            A UniProt Identifier, e.g., `"Q5VSL9"` to retrieve structures from the AF2 database. Defaults to `None`.
+                uniprot_id : str, optional
+                    A UniProt Identifier, e.g., `"Q5VSL9"` to retrieve structures from
+                    the AF2 database. Defaults to `None`.
 
-        source : str
-            The source to retrieve the structure from 
-            (`"pdb"`, `"alphafold2-v1"`, `"alphafold2-v2"` or `"alphafold2-v3"`). Defaults to `"pdb"`.
+                sequence : str, optional
+                    A protein sequence to retrieve a structure from the ESMFold
+                    database. Defaults to `None`.
 
-        Returns
-        ---------
-        self
+                source : str
+                    The source to retrieve the structure from. Can be one of
+                    (`"pdb"`, `"alphafold2-v1"`, `"alphafold2-v2"` or
+                    `"alphafold2-v3"`). Defaults to `"pdb"`.
+
+                Returns
+                ---------
+                self
 
         """
+        if sequence is not None:
+            self.esmfold(sequence)
+            return self
         # Sanitize input
         invalid_input_identifier_1 = pdb_code is None and uniprot_id is None
         invalid_input_identifier_2 = pdb_code is not None and uniprot_id is not None
         invalid_input_combination_1 = uniprot_id is not None and source == "pdb"
         invalid_input_combination_2 = pdb_code is not None and source in {
-            "alphafold2-v1", "alphafold2-v2", "alphafold2-v3"}
+            "alphafold2-v1",
+            "alphafold2-v2",
+            "alphafold2-v3",
+        }
 
         if invalid_input_identifier_1 or invalid_input_identifier_2:
-            raise ValueError(
-                "Please provide either a PDB code or a UniProt ID.")
+            raise ValueError("Please provide either a PDB code or a UniProt ID.")
 
         if invalid_input_combination_1:
             raise ValueError(
-                "Please use a 'pdb_code' instead of 'uniprot_id' for source='pdb'.")
+                "Please use a 'pdb_code' instead of 'uniprot_id' for source='pdb'."
+            )
         elif invalid_input_combination_2:
             raise ValueError(
-                f"Please use a 'uniprot_id' instead of 'pdb_code' for source={source}.")
+                f"Please use a 'uniprot_id' instead of 'pdb_code' for source={source}."
+            )
 
         if source == "pdb":
             self.mmcif_path, self.mmcif_text = self._fetch_mmcif(pdb_code)
         elif source == "alphafold2-v1":
             af2_version = 1
-            self.mmcif_path, self.mmcif_text = self._fetch_af2(
-                uniprot_id, af2_version)
+            self.mmcif_path, self.mmcif_text = self._fetch_af2(uniprot_id, af2_version)
         elif source == "alphafold2-v2":
             af2_version = 2
             self.mmcif_path, self.mmcif_text = self._fetch_af2(uniprot_id, af2_version)
         elif source == "alphafold2-v3":
             af2_version = 3
             self.mmcif_path, self.mmcif_text = self._fetch_af2(uniprot_id, af2_version)
         else:
-            raise ValueError(f"Invalid source: {source}."
-                " Please use one of 'pdb', 'alphafold2-v1', 'alphafold2-v2' or 'alphafold2-v3.")
+            raise ValueError(
+                f"Invalid source: {source}."
+                " Please use one of 'pdb', 'alphafold2-v1', 'alphafold2-v2' or 'alphafold2-v3."
+            )
 
         self._df = self._construct_df(text=self.mmcif_text)
         return self
@@ -132,9 +156,9 @@ def _construct_df(self, text: str):
         data = data[list(data.keys())[0]]
         self.data = data
         df: Dict[str, pd.DataFrame] = {}
-        full_df = pd.DataFrame.from_dict(
-            data["atom_site"], orient="index").transpose()
-        full_df = full_df.astype(mmcif_col_types, errors="ignore")
+        full_df = pd.DataFrame.from_dict(data["atom_site"], orient="index").transpose()
+        types = {k: v for k, v in mmcif_col_types.items() if k in full_df.columns}
+        full_df = full_df.astype(types, errors="ignore")
         df["ATOM"] = pd.DataFrame(full_df[full_df.group_PDB == "ATOM"])
         df["HETATM"] = pd.DataFrame(full_df[full_df.group_PDB == "HETATM"])
         try:
@@ -152,8 +176,7 @@ def _fetch_mmcif(pdb_code):
             response = urlopen(url)
             txt = response.read()
             txt = (
-                txt.decode(
-                    "utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
+                txt.decode("utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
             )
         except HTTPError as e:
             print(f"HTTP Error {e.code}")
@@ -170,14 +193,13 @@ def _fetch_af2(uniprot_id: str, af2_version: int = 3):
         try:
             response = urlopen(url)
             txt = response.read()
-            if sys.version_info[0] >= 3:
-                txt = txt.decode('utf-8')
-            else:
-                txt = txt.encode('ascii')
+            txt = (
+                txt.decode("utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
+            )
         except HTTPError as e:
-            print('HTTP Error %s' % e.code)
+            print(f"HTTP Error {e.code}")
         except URLError as e:
-            print('URL Error %s' % e.args)
+            print(f"URL Error {e.args}")
         return url, txt
 
     @staticmethod
@@ -190,8 +212,7 @@ def _read_mmcif(path):
             r_mode = "rb"
             openf = gzip.open
         else:
-            allowed_formats = ", ".join(
-                (".cif", ".cif.gz", ".mmcif", ".mmcif.gz"))
+            allowed_formats = ", ".join((".cif", ".cif.gz", ".mmcif", ".mmcif.gz"))
             raise ValueError(
                 f"Wrong file format; allowed file formats are {allowed_formats}"
             )
@@ -201,11 +222,48 @@ def _read_mmcif(path):
 
         if path.endswith(".gz"):
             txt = (
-                txt.decode(
-                    "utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
+                txt.decode("utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
             )
         return path, txt
 
+    def esmfold(self, sequence: str, out_path: Optional[str] = None, version: int = 1):
+        """Fold a protein sequence using the ESMFold model from the ESMFold server at
+        https://api.esmatlas.com/foldSequence/v1/pdb/.
+
+
+        Parameters
+        ----------
+        sequence : str
+            A protein sequence in one-letter code.
+        out_path : str, optional
+            Path to save the PDB file to. If `None`, the file is not saved.
+            Defaults to `None`.
+
+        version : int, optional
+            The version of the ESMFold model to use. Defaults to `1`.
+
+
+        Returns
+        --------
+        self
+        """
+        URL = f"https://api.esmatlas.com/foldSequence/v{version}/cif/"
+        headers: Dict[str, str] = {
+            "Content-Type": "application/x-www-form-urlencoded",
+        }
+
+        cif = requests.post(URL, data=sequence, headers=headers).text
+
+        # append header
+        header = "\n".join([f"data_{sequence}", "#", f"_entry.id\t{sequence}", "#\n"])
+        cif = header + cif
+        if out_path is not None:
+            with open(out_path, "w") as f:
+                f.write(cif)
+
+        self._df = self._construct_df(text=cif)
+        return self
+
     def get(self, s, df=None, invert=False, records=("ATOM", "HETATM")):
         """Filter PDB DataFrames by properties
 
@@ -278,8 +336,7 @@ def _get_mainchain(
     def _get_hydrogen(df, invert):
         """Return only hydrogen atom entries from a DataFrame"""
         return (
-            df[(df["type_symbol"] != "H")] if invert else df[(
-                df["type_symbol"] == "H")]
+            df[(df["type_symbol"] != "H")] if invert else df[(df["type_symbol"] == "H")]
         )
 
     @staticmethod
@@ -346,8 +403,7 @@ def amino3to1(
                 indices.append(ind)
             cmp = num
 
-        transl = tmp.iloc[indices][residue_col].map(
-            amino3to1dict).fillna(fillna)
+        transl = tmp.iloc[indices][residue_col].map(amino3to1dict).fillna(fillna)
 
         return pd.concat((tmp.iloc[indices][chain_col], transl), axis=1)
 
@@ -473,7 +529,7 @@ def _init_get_dict():
             "heavy": PandasMmcif._get_heavy,
         }
 
-    def read_mmcif_from_list(self, mmcif_lines):
+    def read_mmcif_from_list(self, mmcif_lines: List[str]):
         """Reads mmCIF file from a list into DataFrames
 
         Attributes
@@ -486,8 +542,8 @@ def read_mmcif_from_list(self, mmcif_lines):
         self
 
         """
-        self.pdb_text = "".join(mmcif_lines)
-        self._df = self._construct_df(mmcif_lines)
+        self.mmcif_text = "\n".join(mmcif_lines)
+        self._df = self._construct_df("\n".join(mmcif_lines))
         # self.header, self.code = self._parse_header_code()
         self.code = self.data["entry"]["id"][0].lower()
         return self
@@ -532,10 +588,13 @@ def convert_to_pandas_pdb(self, offset_chains: bool = True, records: List[str] =
 
         # Update atom numbers
         if offset_chains:
-            offsets = pandaspdb.df["ATOM"]["chain_id"].astype(
-                "category").cat.codes
-            pandaspdb.df["ATOM"]["atom_number"] = pandaspdb.df["ATOM"]["atom_number"] + offsets
+            offsets = pandaspdb.df["ATOM"]["chain_id"].astype("category").cat.codes
+            pandaspdb.df["ATOM"]["atom_number"] = (
+                pandaspdb.df["ATOM"]["atom_number"] + offsets
+            )
             hetatom_offset = offsets.max() + 1
-            pandaspdb.df["HETATM"]["atom_number"] = pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset
+            pandaspdb.df["HETATM"]["atom_number"] = (
+                pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset
+            )
 
         return pandaspdb
diff --git a/biopandas/mmcif/tests/test_read_mmcif.py b/biopandas/mmcif/tests/test_read_mmcif.py
@@ -6,16 +6,18 @@
 
 
 import os
+from typing import Set
 from urllib.error import HTTPError
 from urllib.request import urlopen
 
 import numpy as np
 import pandas as pd
+from nose.tools import raises
+from pandas.testing import assert_frame_equal
+
 from biopandas.mmcif import PandasMmcif
 from biopandas.pdb import PandasPdb
 from biopandas.testutils import assert_raises
-from nose.tools import raises
-from pandas.testing import assert_frame_equal
 
 TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), "data", "3eiy.cif")
 
@@ -91,7 +93,6 @@
     af2_test_struct_v3 = f.read()
 
 
-
 def test__read_pdb():
     """Test private _read_pdb"""
     ppdb = PandasMmcif()
@@ -135,6 +136,19 @@ def test_fetch_pdb():
         assert ppdb.mmcif_path == "https://files.rcsb.org/download/3eiy.cif"
 
 
+def test_read_pdb_esmfold():
+    """Test retrieving a structure from ESMFold."""
+    sequence = "MTYGLY"
+    res_ids: Set[str] = {"A:MET:1", "A:THR:2", "A:TYR:3", "A:GLY:4", "A:LEU:5", "A:TYR:6"}
+    ppdb = PandasMmcif().fetch_mmcif(sequence=sequence)
+
+    df = ppdb.df["ATOM"]
+
+    folded_struct_residue_ids = set(list(df.label_asym_id + ":" + df.label_comp_id + ":" + df.label_seq_id.astype(str)))
+
+    assert folded_struct_residue_ids == res_ids, "Residue IDs do not match"
+
+
 def test_fetch_af2():
     """Test fetch_af2"""
     # Test latest release
@@ -245,7 +259,7 @@ def test_read_pdb():
     """Test public read_pdb"""
     ppdb = PandasMmcif()
     ppdb.read_mmcif(TESTDATA_FILENAME)
-    assert ppdb.pdb_text == three_eiy
+    assert ppdb.mmcif_text == three_eiy
     assert ppdb.code == "3eiy", ppdb.code
     assert ppdb.mmcif_path == TESTDATA_FILENAME
 
@@ -255,8 +269,8 @@ def test_read_pdb_from_list():
 
     for pdb_text, code in zip([three_eiy, four_eiy], ["3eiy", "4eiy"]):
         ppdb = PandasMmcif()
-        ppdb.read_mmcif_from_list(pdb_text)
-        assert ppdb.pdb_text == pdb_text
+        ppdb.read_mmcif_from_list(pdb_text.split("\n"))
+        assert ppdb.mmcif_text == pdb_text
         assert ppdb.code == code
         assert ppdb.mmcif_path == ""