diff --git a/apps/protein_folding/helixfold3/data/demo_7s69_coval.json b/apps/protein_folding/helixfold3/data/demo_7s69_coval.json index fc53c9e9..a3890224 100644 --- a/apps/protein_folding/helixfold3/data/demo_7s69_coval.json +++ b/apps/protein_folding/helixfold3/data/demo_7s69_coval.json @@ -12,7 +12,7 @@ }, { "type": "bond", - "bond": "A,ASN,74,ND2,B,UNK1,1,C,covale,2.3", + "bond": "A,ASN,74,ND2,B,UNK-1,1,C16,covale,2.3", "_comment": "'A,74,ND2:B,1:CW,null' from RF2AA" } ] diff --git a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py index 7a9c2960..9e5b7d2e 100644 --- a/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py +++ b/apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py @@ -115,6 +115,7 @@ def parse_covalent_bond_input(input_string: str) -> List[CovalentBond]: # Append the CovalentBond instance to the list covalent_bonds.append(covalent_bond) + logging.info(f"Added {len(covalent_bonds)} bonds: {covalent_bonds}") return covalent_bonds @@ -306,6 +307,8 @@ def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_pr chainId_to_type = {} ligand_bond_type = [] # (i, j, bond_type), represent the bond between token i and token j bond_index = [] # (i,j) represent the bond between token i and token j + + ccd_id2atom_ids: dict[str, list]={} ccd_standard_set = residue_constants.STANDARD_LIST for chain_type_id, ccd_seq in all_chain_info.items(): chain_type, chain_id = chain_type_id.rsplit('_', 1) @@ -326,6 +329,8 @@ def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_pr else: _ccd_feats = ccd_preprocessed_dict[ccd_id] atom_ids = _ccd_feats['atom_ids'] + + ccd_id2atom_ids[ccd_id] = atom_ids assert len(atom_ids) > 0, f'TODO filter - Got CCD <{ccd_id}>: 0 atom nums.' all_token_nums += len(atom_ids) @@ -372,16 +377,43 @@ def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_pr ptnr2_label_seq_id = ptnr2_auth_seq_id try: - assert ptnr1_label_asym_id in chainId_to_ccd_list and ptnr2_label_asym_id in chainId_to_ccd_list + if not (ptnr1_label_asym_id in chainId_to_ccd_list and ptnr2_label_asym_id in chainId_to_ccd_list): + raise ValueError(f"Invalid chain id:\n{ptnr1_label_asym_id}/{ptnr2_label_asym_id}\n{chainId_to_ccd_list}") ptnr1_ccd_id = chainId_to_ccd_list[ptnr1_label_asym_id][int(ptnr1_label_seq_id) - 1] ptnr2_ccd_id = chainId_to_ccd_list[ptnr2_label_asym_id][int(ptnr2_label_seq_id) - 1] - assert ptnr1_ccd_id == ptnr1_label_comp_id and ptnr2_ccd_id == ptnr2_label_comp_id - except: + + + # renamed ligand residues + + + if ptnr1_ccd_id != ptnr1_label_comp_id: + logging.warning(f"Find ligand residue: {ptnr1_label_comp_id} -> {ptnr1_ccd_id}") + #ptnr1_label_comp_id = ptnr1_ccd_id + + if ptnr2_ccd_id != ptnr2_label_comp_id: + logging.warning(f"Find ligand residue: {ptnr2_label_comp_id} -> {ptnr2_ccd_id}") + #ptnr2_label_comp_id = ptnr2_ccd_id + + except ValueError as e: ## some convalent-bond from mmcif is misslead, pass it. + logging.warning(f'Error occurred during covalent bond processing: {e}') continue + + - ptnr1_ccd_atoms_list = ccd_preprocessed_dict[ptnr1_ccd_id]['atom_ids'] - ptnr2_ccd_atoms_list = ccd_preprocessed_dict[ptnr2_ccd_id]['atom_ids'] + if ptnr1_ccd_id in ccd_preprocessed_dict: + ptnr1_ccd_atoms_list = ccd_preprocessed_dict[ptnr1_ccd_id]['atom_ids'] + else: + ptnr1_ccd_atoms_list = ccd_id2atom_ids[ptnr1_ccd_id] + + logging.debug(f'{ptnr1_ccd_id=}: {ptnr1_ccd_atoms_list=}') + + if ptnr2_ccd_id in ccd_preprocessed_dict: + ptnr2_ccd_atoms_list = ccd_preprocessed_dict[ptnr2_ccd_id]['atom_ids'] + else: + ptnr2_ccd_atoms_list = ccd_id2atom_ids[ptnr2_ccd_id] + + logging.debug(f'{ptnr2_ccd_id=}: {ptnr2_ccd_atoms_list=}') if ptnr1_ccd_id in ccd_standard_set: ## if ccd_id is in the standard residue in HF3 (table 13), we didn't have to map to atom-leval index diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py index 37b09a99..31272c32 100644 --- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py +++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/feature_processing_aa.py @@ -454,7 +454,7 @@ def process_input_json(all_entitys: List[Entity], ccd_preprocessed_path, # gather all defined covalent bonds - all_covalent_bonds=[bond for entity in all_entitys for bond in entity.msa_seqs if bond.dtype == 'bond'] + all_covalent_bonds=[bond for entity in all_entitys for bond in entity.msa_seqs if entity.dtype == 'bond'] assert num_chains == len(all_chain_features.keys()) all_feats = add_assembly_features(all_chain_features, ccd_preprocessed_dict, no_msa_templ_feats, all_covalent_bonds) diff --git a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py index 6aa66b55..2fffbb96 100644 --- a/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py +++ b/apps/protein_folding/helixfold3/helixfold/infer_scripts/preprocess.py @@ -180,16 +180,20 @@ def _perform_conversion(self, input_type: str, input_value: str, generate_3d: bo if input_type == 'mol2' and input_value.endswith('.mol2'): return self._load_mol(mol2_file=input_value) - with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file, utils.timing(f'converting {input_type} to mol2: {input_value}'): + save_path=os.path.join('ligands',f'{os.path.basename(input_value)[:-(len(input_type)+1)] if input_type != "smiles" else "UNK"}.mol2') + + os.makedirs(os.path.dirname(save_path), exist_ok=True) + + with utils.timing(f'converting {input_type} to mol2: {input_value}'): if input_type == 'smiles': - obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{temp_file.name} {'--gen3d' if generate_3d else ''}" + obabel_cmd = f"{self.obabel_bin} -:'{input_value}' -omol2 -O{save_path} {'--gen3d' if generate_3d else ''}" if len(input_value)>60: logging.warning(f'This takes a while ...') else: - obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{temp_file.name} {'--gen3d' if generate_3d else ''}" + obabel_cmd = f"{self.obabel_bin} -i {input_type} {input_value} -omol2 -O{save_path} {'--gen3d' if generate_3d else ''}" logging.debug(f'Launching command: `{obabel_cmd}`') ret = subprocess.run(obabel_cmd, shell=True, capture_output=True, text=True) - return self._load_mol(mol2_file=temp_file.name, ret=ret) + return self._load_mol(mol2_file=save_path, ret=ret) def _convert_to_mol(self, input_type: str, input_value: str, generate_3d: bool=True) -> Chem.Mol: if input_type not in self.supported_formats: