fix: covalent bond

PaddlePaddle · Aug 23, 2024 · 3bba852 · 3bba852
1 parent c39a3e4
commit 3bba852
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 11 deletions.
diff --git a/apps/protein_folding/helixfold3/data/demo_7s69_coval.json b/apps/protein_folding/helixfold3/data/demo_7s69_coval.json
@@ -12,7 +12,7 @@
         },
         {
             "type": "bond",
-            "bond": "A,ASN,74,ND2,B,UNK1,1,C,covale,2.3",
+            "bond": "A,ASN,74,ND2,B,UNK-1,1,C16,covale,2.3",
             "_comment": "'A,74,ND2:B,1:CW,null' from RF2AA"
         }
     ]

diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
@@ -115,6 +115,7 @@ def parse_covalent_bond_input(input_string: str) -> List[CovalentBond]:
 
         # Append the CovalentBond instance to the list
         covalent_bonds.append(covalent_bond)
+        logging.info(f"Added {len(covalent_bonds)} bonds: {covalent_bonds}")
 
     return covalent_bonds
 
@@ -306,6 +307,8 @@ def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_pr
   chainId_to_type = {}
   ligand_bond_type = [] # (i, j, bond_type), represent the bond between token i and token j
   bond_index = [] # (i,j) represent the bond between token i and token j
+
+  ccd_id2atom_ids: dict[str, list]={}
   ccd_standard_set = residue_constants.STANDARD_LIST
   for chain_type_id, ccd_seq in all_chain_info.items():
       chain_type, chain_id = chain_type_id.rsplit('_', 1)
@@ -326,6 +329,8 @@ def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_pr
             else:
                 _ccd_feats = ccd_preprocessed_dict[ccd_id]
             atom_ids = _ccd_feats['atom_ids']
+
+            ccd_id2atom_ids[ccd_id] = atom_ids
             assert len(atom_ids) > 0, f'TODO filter - Got CCD <{ccd_id}>: 0 atom nums.'
 
             all_token_nums += len(atom_ids)
@@ -372,16 +377,43 @@ def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_pr
           ptnr2_label_seq_id = ptnr2_auth_seq_id
 
       try:
-        assert ptnr1_label_asym_id in chainId_to_ccd_list and ptnr2_label_asym_id in chainId_to_ccd_list
+        if not (ptnr1_label_asym_id in chainId_to_ccd_list and ptnr2_label_asym_id in chainId_to_ccd_list):
+           raise ValueError(f"Invalid chain id:\n{ptnr1_label_asym_id}/{ptnr2_label_asym_id}\n{chainId_to_ccd_list}")
         ptnr1_ccd_id = chainId_to_ccd_list[ptnr1_label_asym_id][int(ptnr1_label_seq_id) - 1]
         ptnr2_ccd_id = chainId_to_ccd_list[ptnr2_label_asym_id][int(ptnr2_label_seq_id) - 1]
-        assert ptnr1_ccd_id == ptnr1_label_comp_id and ptnr2_ccd_id == ptnr2_label_comp_id
-      except:
+
+
+        # renamed ligand residues
+
+
+        if ptnr1_ccd_id != ptnr1_label_comp_id:
+           logging.warning(f"Find ligand residue: {ptnr1_label_comp_id} -> {ptnr1_ccd_id}")
+           #ptnr1_label_comp_id = ptnr1_ccd_id
+
+        if ptnr2_ccd_id != ptnr2_label_comp_id:
+           logging.warning(f"Find ligand residue: {ptnr2_label_comp_id} -> {ptnr2_ccd_id}")
+           #ptnr2_label_comp_id = ptnr2_ccd_id
+
+      except ValueError as e:
         ## some convalent-bond from mmcif is misslead, pass it.
+        logging.warning(f'Error occurred during covalent bond processing: {e}')
         continue
+
+
 
-      ptnr1_ccd_atoms_list = ccd_preprocessed_dict[ptnr1_ccd_id]['atom_ids']
-      ptnr2_ccd_atoms_list = ccd_preprocessed_dict[ptnr2_ccd_id]['atom_ids']
+      if ptnr1_ccd_id in ccd_preprocessed_dict:
+        ptnr1_ccd_atoms_list = ccd_preprocessed_dict[ptnr1_ccd_id]['atom_ids']
+      else:
+        ptnr1_ccd_atoms_list = ccd_id2atom_ids[ptnr1_ccd_id]
+
+      logging.debug(f'{ptnr1_ccd_id=}: {ptnr1_ccd_atoms_list=}')
+
+      if ptnr2_ccd_id in ccd_preprocessed_dict:
+        ptnr2_ccd_atoms_list = ccd_preprocessed_dict[ptnr2_ccd_id]['atom_ids']
+      else:
+        ptnr2_ccd_atoms_list  = ccd_id2atom_ids[ptnr2_ccd_id]
+
+      logging.debug(f'{ptnr2_ccd_id=}: {ptnr2_ccd_atoms_list=}')
 
       if ptnr1_ccd_id in ccd_standard_set:  
           ## if ccd_id is in the standard residue in HF3 (table 13), we didn't have to map to atom-leval index

diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py
@@ -454,7 +454,7 @@ def process_input_json(all_entitys: List[Entity], ccd_preprocessed_path,
 
 
     # gather all defined covalent bonds
-    all_covalent_bonds=[bond for entity in all_entitys for bond in entity.msa_seqs if bond.dtype == 'bond']
+    all_covalent_bonds=[bond for entity in all_entitys for bond in entity.msa_seqs if entity.dtype == 'bond']
 
     assert num_chains == len(all_chain_features.keys())
     all_feats = add_assembly_features(all_chain_features, ccd_preprocessed_dict, no_msa_templ_feats, all_covalent_bonds)

diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py
@@ -180,16 +180,20 @@ def _perform_conversion(self, input_type: str, input_value: str, generate_3d: bo
         if input_type == 'mol2' and input_value.endswith('.mol2'):
             return self._load_mol(mol2_file=input_value)
 
-        with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file, utils.timing(f'converting {input_type} to mol2: {input_value}'):
+        save_path=os.path.join('ligands',f'{os.path.basename(input_value)[:-(len(input_type)+1)] if input_type != "smiles" else "UNK"}.mol2')
+
+        os.makedirs(os.path.dirname(save_path), exist_ok=True)
+
+        with utils.timing(f'converting {input_type} to mol2: {input_value}'):
             if input_type == 'smiles':
-                obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} {'--gen3d' if generate_3d else ''}"
+                obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{save_path} {'--gen3d' if generate_3d else ''}"
                 if len(input_value)>60:
                     logging.warning(f'This takes a while ...')
             else:
-                obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} {'--gen3d' if generate_3d else ''}"
+                obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{save_path} {'--gen3d' if generate_3d else ''}"
             logging.debug(f'Launching command: `{obabel_cmd}`')
             ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True)
-            return self._load_mol(mol2_file=temp_file.name, ret=ret)
+            return self._load_mol(mol2_file=save_path, ret=ret)
 
     def _convert_to_mol(self, input_type: str, input_value: str, generate_3d: bool=True) -> Chem.Mol:
         if input_type not in self.supported_formats: