Skip to content

Commit

Permalink
feat: ligand: support obabel readable files
Browse files Browse the repository at this point in the history
  • Loading branch information
YaoYinYing committed Aug 19, 2024
1 parent 7180fe7 commit 330e455
Show file tree
Hide file tree
Showing 5 changed files with 124 additions and 29 deletions.
19 changes: 19 additions & 0 deletions apps/protein_folding/helixfold3/data/demo_p450_heme.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"entities": [
{
"type": "protein",
"sequence": "MDALYKSTVAKFNEVIQLDCSTEFFSIALSSIAGILLLLLLFRSKRHSSLKLPPGKLGIPFIGESFIFLRALRSNSLEQFFDERVKKFGLVFKTSLIGHPTVVLCGPAGNRLILSNEEKLVQMSWPAQFMKLMGENSVATRRGEDHIVMRSALAGFFGPGALQSYIGKMNTEIQSHINEKWKGKDEVNVLPLVRELVFNISAILFFNIYDKQEQDRLHKLLETILVGSFALPIDLPGFGFHRALQGRAKLNKIMLSLIKKRKEDLQSGSATATQDLLSVLLTFRDDKGTPLTNDEILDNFSSLLHASYDTTTSPMALIFKLLSSNPECYQKVVQEQLEILSNKEEGEEITWKDLKAMKYTWQVAQETLRMFPPVFGTFRKAITDIQYDGYTIPKGWKLLWTTYSTHPKDLYFNEPEKFMPSRFDQEGKHVAPYTFLPFGGGQRSCVGWEFSKMEILLFVHHFVKTFSSYTPVDPDEKISGDPLPPLPSKGFSIKLFPRP",
"count": 1
},
{
"type": "ligand",
"ccd": "HEM",
"count": 1
},
{
"type": "ligand",
"smiles": "CC1=C2CC[C@@]3(CCCC(=C)[C@H]3C[C@@H](C2(C)C)CC1)C",
"count": 1
}
]
}
19 changes: 19 additions & 0 deletions apps/protein_folding/helixfold3/data/demo_p450_heme_sdf.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"entities": [
{
"type": "protein",
"sequence": "MDALYKSTVAKFNEVIQLDCSTEFFSIALSSIAGILLLLLLFRSKRHSSLKLPPGKLGIPFIGESFIFLRALRSNSLEQFFDERVKKFGLVFKTSLIGHPTVVLCGPAGNRLILSNEEKLVQMSWPAQFMKLMGENSVATRRGEDHIVMRSALAGFFGPGALQSYIGKMNTEIQSHINEKWKGKDEVNVLPLVRELVFNISAILFFNIYDKQEQDRLHKLLETILVGSFALPIDLPGFGFHRALQGRAKLNKIMLSLIKKRKEDLQSGSATATQDLLSVLLTFRDDKGTPLTNDEILDNFSSLLHASYDTTTSPMALIFKLLSSNPECYQKVVQEQLEILSNKEEGEEITWKDLKAMKYTWQVAQETLRMFPPVFGTFRKAITDIQYDGYTIPKGWKLLWTTYSTHPKDLYFNEPEKFMPSRFDQEGKHVAPYTFLPFGGGQRSCVGWEFSKMEILLFVHHFVKTFSSYTPVDPDEKISGDPLPPLPSKGFSIKLFPRP",
"count": 1
},
{
"type": "ligand",
"ccd": "HEM",
"count": 1
},
{
"type": "ligand",
"sdf": "/mnt/data/yinying/tests/helixfold/ligands/60119277-3d.sdf",
"count": 1
}
]
}
19 changes: 19 additions & 0 deletions apps/protein_folding/helixfold3/data/demo_p450_heme_smiles.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"entities": [
{
"type": "protein",
"sequence": "MDALYKSTVAKFNEVIQLDCSTEFFSIALSSIAGILLLLLLFRSKRHSSLKLPPGKLGIPFIGESFIFLRALRSNSLEQFFDERVKKFGLVFKTSLIGHPTVVLCGPAGNRLILSNEEKLVQMSWPAQFMKLMGENSVATRRGEDHIVMRSALAGFFGPGALQSYIGKMNTEIQSHINEKWKGKDEVNVLPLVRELVFNISAILFFNIYDKQEQDRLHKLLETILVGSFALPIDLPGFGFHRALQGRAKLNKIMLSLIKKRKEDLQSGSATATQDLLSVLLTFRDDKGTPLTNDEILDNFSSLLHASYDTTTSPMALIFKLLSSNPECYQKVVQEQLEILSNKEEGEEITWKDLKAMKYTWQVAQETLRMFPPVFGTFRKAITDIQYDGYTIPKGWKLLWTTYSTHPKDLYFNEPEKFMPSRFDQEGKHVAPYTFLPFGGGQRSCVGWEFSKMEILLFVHHFVKTFSSYTPVDPDEKISGDPLPPLPSKGFSIKLFPRP",
"count": 1
},
{
"type": "ligand",
"smiles": "CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)O)CCC(=O)O.[Fe+2]",
"count": 1
},
{
"type": "ligand",
"smiles": "CC1=C2CC[C@@]3(CCCC(=C)[C@H]3C[C@@H](C2(C)C)CC1)C",
"count": 1
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import subprocess
import tempfile
import itertools
from absl import logging
from typing import Tuple, Union, Mapping, Literal, Callable, Any
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
Expand Down Expand Up @@ -138,26 +140,54 @@ def smiles_to_ETKDGMol(smiles):
return optimal_mol_wo_H


class Mol2MolObabel:
    """Turn a small-molecule description into an RDKit mol via Open Babel.

    The ``obabel`` executable is located through the ``OBABEL_BIN``
    environment variable. An instance is callable as
    ``converter(input_type, input_value)`` where ``input_type`` is either
    ``'smiles'`` (``input_value`` is the SMILES string itself) or any format
    id that obabel can read (``input_value`` is then a path to a file of that
    format).

    Attributes:
        obabel_bin: path of the obabel executable taken from ``OBABEL_BIN``.
        supported_formats: tuple of accepted ``input_type`` values.

    Raises:
        FileNotFoundError: on construction, if ``OBABEL_BIN`` is unset or
            does not point to an existing file.
    """

    # Readable formats per obabel binary path. `obabel -L formats` is only
    # executed once per binary instead of once per instance.
    _formats_cache = {}

    def __init__(self):
        self.obabel_bin = os.getenv('OBABEL_BIN')
        if not (self.obabel_bin and os.path.isfile(self.obabel_bin)):
            raise FileNotFoundError(f'Cannot find obabel binary at {self.obabel_bin}.')

        # Get the supported formats
        self.supported_formats: Tuple[str, ...] = self._get_supported_formats()

    def _get_supported_formats(self) -> Tuple[str, ...]:
        """
        Retrieves the list of supported formats from obabel and filters out write-only formats.

        Returns:
            tuple: A tuple of supported input formats; always contains 'smiles'.
        """
        cached = self._formats_cache.get(self.obabel_bin)
        if cached is not None:
            return cached

        # Argument list (no shell): the binary path may contain spaces.
        ret = subprocess.run(
            [self.obabel_bin, '-L', 'formats'], capture_output=True, text=True)

        formats = []
        for line in ret.stdout.splitlines():
            fields = line.split()
            # Blank lines have no format id; write-only formats cannot be
            # used as an input.
            if not fields or '[Write-only]' in line:
                continue
            formats.append(fields[0])
        if 'smiles' not in formats:
            formats.append('smiles')

        supported = tuple(formats)
        # Do not remember the answer of a failed invocation.
        if ret.returncode == 0:
            self._formats_cache[self.obabel_bin] = supported
        return supported

    def _perform_conversion(self, input_type: str, input_value: str) -> Chem.Mol:
        """Run obabel to produce a 3D mol2 file and load it with RDKit.

        Args:
            input_type: 'smiles' or an obabel input format id.
            input_value: the SMILES string, or the path of the input file.

        Returns:
            The converted molecule with all hydrogens removed (unsanitized).

        Raises:
            ValueError: if RDKit cannot read the mol2 file written by obabel.
        """
        with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file:
            print(f"[OBABEL] Temporary file created: {temp_file.name}")
            if input_type == 'smiles':
                source_args = [f"-:{input_value}"]
            else:
                source_args = ["-i", input_type, input_value]
            # Passing a list keeps quotes, spaces and shell metacharacters in
            # the SMILES string or the file path from being interpreted.
            obabel_cmd = [self.obabel_bin, *source_args,
                          "-omol2", f"-O{temp_file.name}", "--gen3d"]
            ret = subprocess.run(obabel_cmd, capture_output=True, text=True)
            # Must be read before the temporary file is removed on exit.
            mol = Chem.MolFromMol2File(temp_file.name, sanitize=False)

        if mol is None:
            raise ValueError(
                f'Failed to convert `{input_type}` input {input_value} with obabel: '
                f'{ret.stderr.strip()}')

        # obabel could not embed the molecule; fall back to the RDKit embedder.
        if '3D coordinate generation failed' in ret.stderr:
            mol = generate_ETKDGv3_conformer(mol)
        optimal_mol_wo_H = Chem.RemoveAllHs(mol, sanitize=False)
        logging.debug(f'Converted `{input_type}`: {input_value}')
        return optimal_mol_wo_H

    def _convert_to_mol(self, input_type: str, input_value: str) -> Chem.Mol:
        """Validate the request, then convert it.

        Raises:
            ValueError: if ``input_type`` is not in ``supported_formats``.
            FileNotFoundError: if a file-based input does not exist.
        """
        if input_type not in self.supported_formats:
            raise ValueError(f'Unsupported small molecule input: {input_type}. \nSupported formats: \n{self.supported_formats}\n')

        if input_type != 'smiles' and not os.path.isfile(input_value):
            raise FileNotFoundError(f'Cannot find the {input_type.upper()} file at {input_value}.')

        return self._perform_conversion(input_type, input_value)

    def __call__(self, input_type: str, input_value: str) -> Chem.Mol:
        """Convert ``input_value`` of kind ``input_type``; see ``_convert_to_mol``."""
        return self._convert_to_mol(input_type, input_value)


def smiles_toMol_obabel(smiles):
    """
    generate mol from smiles using obabel;
    kept as a thin wrapper around Mol2MolObabel for existing callers.
    """
    return Mol2MolObabel()('smiles', smiles)
def polymer_convert(items):
"""
"type": "protein",
Expand Down Expand Up @@ -192,30 +222,39 @@ def polymer_convert(items):
}


def ligand_convert(items):
def ligand_convert(items: Mapping[str, Union[int, str]]):
"""
"type": "ligand",
"ccd": "ATP", or "smiles": "CCccc(O)ccc",
"count": 1
"""
dtype = items['type']
count = items['count']
converter=Mol2MolObabel()

msa_seqs = ""
_ccd_seqs = []
ccd_to_extra_mol_infos = {}
if 'ccd' in items:
_ccd_seqs.append(f"({items['ccd']})")
elif 'smiles' in items:
_ccd_seqs.append(f"(UNK-)")


elif any(f in items for f in converter.supported_formats):
for k in converter.supported_formats:
if k in items:
break

ligand_name="UNK-"
_ccd_seqs.append(f"({ligand_name})")
# mol_wo_h = smiles_to_ETKDGMol(items['smiles'])
mol_wo_h = smiles_toMol_obabel(items['smiles'])

mol_wo_h = converter(k, items[k])
_extra_mol_infos = make_basic_info_fromMol(mol_wo_h)
ccd_to_extra_mol_infos = {
"UNK-": _extra_mol_infos
ligand_name: _extra_mol_infos
}
else:
raise ValueError(f'not support for the {dtype} in ligand_convert')
raise ValueError(f'not support for the {dtype} in ligand_convert, please check the input. \nSupported input: {converter.supported_formats}')
ccd_seqs = ''.join(_ccd_seqs) ## (GLY)(ALA).....

# repeat_ccds, repeat_fasta = [ccd_seqs], [msa_seqs]
Expand Down
11 changes: 5 additions & 6 deletions apps/protein_folding/helixfold3/helixfold/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,16 +102,15 @@ def preprocess_json_entity(json_path, out_dir):
def convert_to_json_compatible(obj):
    """Recursively replace numpy values with plain Python equivalents.

    Args:
        obj: any value; dicts, lists and tuples are walked recursively.

    Returns:
        ``obj`` with numpy arrays turned into nested lists, numpy integer /
        floating / boolean scalars turned into ``int`` / ``float`` / ``bool``,
        and tuples turned into lists. Dict keys are left untouched. Anything
        else is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.bool_):
        # np.bool_ is not an np.integer and is rejected by the json encoder.
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, dict):
        return {k: convert_to_json_compatible(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        # Tuples are serialized as JSON arrays as well, so recurse into them
        # to reach numpy values they may contain.
        return [convert_to_json_compatible(i) for i in obj]
    return obj

def resolve_bin_path(cfg_path: str, default_binary_name: str)-> str:
"""Helper function to resolve the binary path."""
Expand Down

0 comments on commit 330e455

Please sign in to comment.