doc: covalently

PaddlePaddle · Aug 23, 2024 · 7653585 · 7653585
1 parent 3bba852
commit 7653585
Show file tree

Hide file tree

Showing 5 changed files with 91 additions and 1 deletion.
diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md
@@ -126,6 +126,49 @@ A example of input data is as follows:
 }
 ```
 
+Another example of **covalently modified** input:
+```json
+{
+    "entities": [
+        {
+            "type": "protein",
+            "sequence": "MDALYKSTVAKFNEVIQLDCSTEFFSIALSSIAGILLLLLLFRSKRHSSLKLPPGKLGIPFIGESFIFLRALRSNSLEQFFDERVKKFGLVFKTSLIGHPTVVLCGPAGNRLILSNEEKLVQMSWPAQFMKLMGENSVATRRGEDHIVMRSALAGFFGPGALQSYIGKMNTEIQSHINEKWKGKDEVNVLPLVRELVFNISAILFFNIYDKQEQDRLHKLLETILVGSFALPIDLPGFGFHRALQGRAKLNKIMLSLIKKRKEDLQSGSATATQDLLSVLLTFRDDKGTPLTNDEILDNFSSLLHASYDTTTSPMALIFKLLSSNPECYQKVVQEQLEILSNKEEGEEITWKDLKAMKYTWQVAQETLRMFPPVFGTFRKAITDIQYDGYTIPKGWKLLWTTYSTHPKDLYFNEPEKFMPSRFDQEGKHVAPYTFLPFGGGQRSCVGWEFSKMEILLFVHHFVKTFSSYTPVDPDEKISGDPLPPLPSKGFSIKLFPRP",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "ccd": "HEM",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "smiles": "CC1=C2CC[C@@]3(CCCC(=C)[C@H]3C[C@@H](C2(C)C)CC1)C",
+            "count": 1
+        },
+        {
+            "type": "bond",
+            "bond": "A,CYS,445,SG,B,HEM,1,FE,covale,2.3",
+            "_comment": "<chain-id>,<residue name>,<residue index>,<atom id>,<chain-id>,<residue name>,<residue index>,<atom id>,<bond type>,<bond length>",
+            "_also_comment": "For ccd input, use CCD key as residue name; for smiles and file input, use `UNK-<index>` where index is the chain order you input"
+        }
+    ]
+}
+```
+
+For seaking all atom ids in CCD database:
+```shell
+helixfold_show_ccd +ccd_id=HEM
+```
+
+This command outputs like:
+```text
+# output:
+[2024-08-23 22:44:36,324][absl][INFO] - Started Loading CCD dataset from /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz
+[2024-08-23 22:44:43,236][absl][INFO] - Finished Loading CCD dataset from /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz in 6.912 seconds
+[2024-08-23 22:44:43,237][absl][INFO] - CCD dataset contains 43488 entries.
+[2024-08-23 22:44:43,237][absl][INFO] - Atoms in HEM: ['CHA', 'CHB', 'CHC', 'CHD', 'C1A', 'C2A', 'C3A', 'C4A', 'CMA', 'CAA', 'CBA', 'CGA', 'O1A', 'O2A', 'C1B', 'C2B', 'C3B', 'C4B', 'CMB', 'CAB', 'CBB', 'C1C', 'C2C', 'C3C', 'C4C', 'CMC', 'CAC', 'CBC', 'C1D', 'C2D', 'C3D', 'C4D', 'CMD', 'CAD', 'CBD', 'CGD', 'O1D', 'O2D', 'NA', 'NB', 'NC', 'ND', 'FE']
+```
+
 #### Running HelixFold for Inference
 To run inference on a sequence or multiple sequences using HelixFold3's pretrained parameters, run e.g.:
 

diff --git a/apps/protein_folding/helixfold3/data/demo_7s69_coval.json b/apps/protein_folding/helixfold3/data/demo_7s69_coval.json
@@ -13,7 +13,8 @@
         {
             "type": "bond",
             "bond": "A,ASN,74,ND2,B,UNK-1,1,C16,covale,2.3",
-            "_comment": "'A,74,ND2:B,1:CW,null' from RF2AA"
+            "_comment": "'A,74,ND2:B,1:CW,null' from RF2AA.",
+            "_also_comment": "For ccd input, use CCD key as residue name; for smiles and file input, use `UNK-<index>` where index is the chain order you input"
         }
     ]
 }
diff --git a/apps/protein_folding/helixfold3/data/demo_p450_heme_coval.json b/apps/protein_folding/helixfold3/data/demo_p450_heme_coval.json
@@ -0,0 +1,25 @@
+{
+    "entities": [
+        {
+            "type": "protein",
+            "sequence": "MDALYKSTVAKFNEVIQLDCSTEFFSIALSSIAGILLLLLLFRSKRHSSLKLPPGKLGIPFIGESFIFLRALRSNSLEQFFDERVKKFGLVFKTSLIGHPTVVLCGPAGNRLILSNEEKLVQMSWPAQFMKLMGENSVATRRGEDHIVMRSALAGFFGPGALQSYIGKMNTEIQSHINEKWKGKDEVNVLPLVRELVFNISAILFFNIYDKQEQDRLHKLLETILVGSFALPIDLPGFGFHRALQGRAKLNKIMLSLIKKRKEDLQSGSATATQDLLSVLLTFRDDKGTPLTNDEILDNFSSLLHASYDTTTSPMALIFKLLSSNPECYQKVVQEQLEILSNKEEGEEITWKDLKAMKYTWQVAQETLRMFPPVFGTFRKAITDIQYDGYTIPKGWKLLWTTYSTHPKDLYFNEPEKFMPSRFDQEGKHVAPYTFLPFGGGQRSCVGWEFSKMEILLFVHHFVKTFSSYTPVDPDEKISGDPLPPLPSKGFSIKLFPRP",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "ccd": "HEM",
+            "count": 1
+        },
+        {
+            "type": "ligand",
+            "smiles": "CC1=C2CC[C@@]3(CCCC(=C)[C@H]3C[C@@H](C2(C)C)CC1)C",
+            "count": 1
+        },
+        {
+            "type": "bond",
+            "bond": "A,CYS,445,SG,B,HEM,1,FE,covale,2.3",
+            "_comment": "<chain-id>,<residue name>,<residue index>,<atom id>,<chain-id>,<residue name>,<residue index>,<atom id>,<bond type>,<bond length>",
+            "_also_comment": "For ccd input, use CCD key as residue name; for smiles and file input, use `UNK-<index>` where index is the chain order you input"
+        }
+    ]
+}
diff --git a/apps/protein_folding/helixfold3/helixfold/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py
@@ -31,6 +31,7 @@
 import hydra
 
 from helixfold.common import all_atom_pdb_save
+from helixfold.data.pipeline_conf_bonds import load_ccd_dict
 from helixfold.model import config, utils
 from helixfold.data import pipeline_parallel as pipeline
 from helixfold.data import pipeline_multimer_parallel as pipeline_multimer
@@ -565,5 +566,24 @@ def main(cfg: DictConfig):
     ranking_all_predictions(all_pred_path)
     print(f'============ Inference finished ! ============')
 
+
+@hydra.main(version_base=None, config_path=os.path.join(script_path,'config',),config_name='helixfold')
+def show_atom_id_ccd(cfg: DictConfig):
+    ccd_preprocessed_path = cfg.db.ccd_preprocessed
+
+
+    ccd_id=cfg.ccd_id
+    if len(ccd_id) <= 3 and ccd_id in (ccd_dict:=load_ccd_dict(ccd_preprocessed_path)):
+        logging.info(f'Atoms in {ccd_id}: {ccd_dict[ccd_id]["atom_ids"]}')
+        return
+
+
+
+
+
+
+
+
+
 if __name__ == '__main__':
     main()
diff --git a/apps/protein_folding/helixfold3/pyproject.toml b/apps/protein_folding/helixfold3/pyproject.toml
@@ -47,3 +47,4 @@ joblib = "1.4.2"
 
 [tool.poetry.scripts]
 helixfold = 'helixfold.inference:main'
+helixfold_show_ccd = 'helixfold.inference:show_atom_id_ccd'
Original file line number	Diff line number	Diff line change
Expand Up		@@ -47,3 +47,4 @@ joblib = "1.4.2"

		[tool.poetry.scripts]
		helixfold = 'helixfold.inference:main'
		helixfold_show_ccd = 'helixfold.inference:show_atom_id_ccd'