Skip to content

Commit

Permalink
dev:major: covalent bond
Browse files Browse the repository at this point in the history
  • Loading branch information
YaoYinYing committed Aug 23, 2024
1 parent 96a3de8 commit c39a3e4
Show file tree
Hide file tree
Showing 5 changed files with 362 additions and 75 deletions.
155 changes: 155 additions & 0 deletions apps/protein_folding/helixfold3/data/7s69_glycan.sdf
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@

OpenBabel03042416223D

72 77 0 0 1 0 0 0 0 0999 V2000
29.7340 3.2540 76.7430 C 0 0 0 0 0 2 0 0 0 0 0 0
29.8160 4.4760 77.6460 C 0 0 1 0 0 3 0 0 0 0 0 0
28.5260 5.2840 77.5530 C 0 0 2 0 0 3 0 0 0 0 0 0
28.1780 5.5830 76.1020 C 0 0 1 0 0 3 0 0 0 0 0 0
28.2350 4.3240 75.2420 C 0 0 1 0 0 3 0 0 0 0 0 0
28.1040 4.6170 73.7650 C 0 0 0 0 0 2 0 0 0 0 0 0
31.3020 3.8250 79.4830 C 0 0 0 0 0 0 0 0 0 0 0 0
31.3910 3.4410 80.9280 C 0 0 0 0 0 1 0 0 0 0 0 0
30.0760 4.0880 79.0210 N 0 0 0 0 0 2 0 0 0 0 0 0
28.6870 6.5050 78.2670 O 0 0 0 0 0 1 0 0 0 0 0 0
26.8490 6.0910 76.0350 O 0 0 0 0 0 0 0 0 0 0 0 0
29.4950 3.6650 75.4130 O 0 0 0 0 0 0 0 0 0 0 0 0
29.3670 4.5550 73.1150 O 0 0 0 0 0 1 0 0 0 0 0 0
32.2950 3.8940 78.7640 O 0 0 0 0 0 0 0 0 0 0 0 0
26.7420 7.4140 75.6950 C 0 0 1 0 0 3 0 0 0 0 0 0
25.2700 7.7830 75.6110 C 0 0 1 0 0 3 0 0 0 0 0 0
25.1290 9.2300 75.1610 C 0 0 2 0 0 3 0 0 0 0 0 0
25.9180 10.1440 76.0880 C 0 0 1 0 0 3 0 0 0 0 0 0
27.3630 9.6720 76.2210 C 0 0 1 0 0 3 0 0 0 0 0 0
28.1310 10.4360 77.2730 C 0 0 0 0 0 2 0 0 0 0 0 0
23.8820 5.8170 75.1400 C 0 0 0 0 0 0 0 0 0 0 0 0
23.1980 5.0100 74.0810 C 0 0 0 0 0 1 0 0 0 0 0 0
24.5530 6.8930 74.7160 N 0 0 0 0 0 2 0 0 0 0 0 0
23.7530 9.5950 75.1670 O 0 0 0 0 0 1 0 0 0 0 0 0
25.9170 11.4700 75.5730 O 0 0 0 0 0 0 0 0 0 0 0 0
27.4050 8.2900 76.6040 O 0 0 0 0 0 0 0 0 0 0 0 0
29.5300 10.4030 77.0280 O 0 0 0 0 0 1 0 0 0 0 0 0
23.8300 5.5110 76.3290 O 0 0 0 0 0 0 0 0 0 0 0 0
25.3940 12.4250 76.4090 C 0 0 1 0 0 3 0 0 0 0 0 0
25.9490 13.7680 75.9090 C 0 0 2 0 0 3 0 0 0 0 0 0
25.1320 14.9560 76.4900 C 0 0 2 0 0 3 0 0 0 0 0 0
23.6130 14.6900 76.6390 C 0 0 1 0 0 3 0 0 0 0 0 0
23.3700 13.3000 77.2280 C 0 0 1 0 0 3 0 0 0 0 0 0
21.9020 12.9360 77.3500 C 0 0 0 0 0 2 0 0 0 0 0 0
25.9010 13.8490 74.4810 O 0 0 0 0 0 1 0 0 0 0 0 0
25.3420 16.1410 75.7110 O 0 0 0 0 0 0 0 0 0 0 0 0
23.0420 15.6520 77.5170 O 0 0 0 0 0 1 0 0 0 0 0 0
23.9910 12.3690 76.3570 O 0 0 0 0 0 0 0 0 0 0 0 0
21.3660 12.8480 76.0500 O 0 0 0 0 0 0 0 0 0 0 0 0
20.8090 11.6500 75.6780 C 0 0 2 0 0 3 0 0 0 0 0 0
20.6800 11.6410 74.1740 C 0 0 2 0 0 3 0 0 0 0 0 0
19.5510 12.5850 73.8180 C 0 0 2 0 0 3 0 0 0 0 0 0
18.2370 12.0940 74.4540 C 0 0 1 0 0 3 0 0 0 0 0 0
18.4030 11.9240 75.9810 C 0 0 1 0 0 3 0 0 0 0 0 0
17.2710 11.1260 76.6120 C 0 0 0 0 0 2 0 0 0 0 0 0
20.2900 10.3510 73.7080 O 0 0 0 0 0 1 0 0 0 0 0 0
19.4280 12.7380 72.4110 O 0 0 0 0 0 0 0 0 0 0 0 0
17.2120 13.0460 74.2030 O 0 0 0 0 0 1 0 0 0 0 0 0
19.6260 11.2000 76.3010 O 0 0 0 0 0 0 0 0 0 0 0 0
16.0670 11.4490 75.9360 O 0 0 0 0 0 1 0 0 0 0 0 0
20.2190 13.6280 71.7260 C 0 0 2 0 0 3 0 0 0 0 0 0
19.6090 14.0000 70.3810 C 0 0 2 0 0 3 0 0 0 0 0 0
19.6360 12.7820 69.4880 C 0 0 2 0 0 3 0 0 0 0 0 0
21.0860 12.3100 69.3240 C 0 0 1 0 0 3 0 0 0 0 0 0
21.7030 12.0240 70.7120 C 0 0 1 0 0 3 0 0 0 0 0 0
23.1940 11.7460 70.6620 C 0 0 0 0 0 2 0 0 0 0 0 0
20.4080 14.9810 69.7000 O 0 0 0 0 0 1 0 0 0 0 0 0
19.0310 13.0500 68.2340 O 0 0 0 0 0 1 0 0 0 0 0 0
21.1060 11.1280 68.5380 O 0 0 0 0 0 1 0 0 0 0 0 0
21.5380 13.1700 71.5840 O 0 0 0 0 0 0 0 0 0 0 0 0
23.8240 12.5210 71.6820 O 0 0 0 0 0 1 0 0 0 0 0 0
26.0070 17.3020 76.0200 C 0 0 2 0 0 3 0 0 0 0 0 0
27.0750 17.5250 74.9350 C 0 0 2 0 0 3 0 0 0 0 0 0
28.3660 16.8320 75.3290 C 0 0 2 0 0 3 0 0 0 0 0 0
28.7820 17.2470 76.7510 C 0 0 1 0 0 3 0 0 0 0 0 0
27.6930 16.8120 77.7320 C 0 0 1 0 0 3 0 0 0 0 0 0
27.9770 17.2020 79.1710 C 0 0 0 0 0 2 0 0 0 0 0 0
27.3990 18.9140 74.8010 O 0 0 0 0 0 1 0 0 0 0 0 0
29.4060 17.0990 74.3950 O 0 0 0 0 0 1 0 0 0 0 0 0
30.0160 16.6410 77.0930 O 0 0 0 0 0 1 0 0 0 0 0 0
26.4610 17.4820 77.3520 O 0 0 0 0 0 0 0 0 0 0 0 0
27.3660 18.4620 79.4040 O 0 0 0 0 0 1 0 0 0 0 0 0
1 2 1 0 0 0 0
1 12 1 0 0 0 0
2 3 1 0 0 0 0
2 9 1 1 0 0 0
3 10 1 1 0 0 0
3 4 1 0 0 0 0
4 5 1 0 0 0 0
4 11 1 1 0 0 0
5 6 1 6 0 0 0
5 12 1 0 0 0 0
6 13 1 0 0 0 0
7 14 2 0 0 0 0
7 8 1 0 0 0 0
7 9 1 0 0 0 0
15 16 1 0 0 0 0
15 11 1 1 0 0 0
15 26 1 0 0 0 0
16 23 1 6 0 0 0
16 17 1 0 0 0 0
17 18 1 0 0 0 0
17 24 1 1 0 0 0
18 25 1 6 0 0 0
18 19 1 0 0 0 0
19 20 1 1 0 0 0
19 26 1 0 0 0 0
20 27 1 0 0 0 0
21 22 1 0 0 0 0
21 23 1 0 0 0 0
21 28 2 0 0 0 0
29 38 1 0 0 0 0
29 25 1 6 0 0 0
29 30 1 0 0 0 0
30 35 1 6 0 0 0
30 31 1 0 0 0 0
31 32 1 0 0 0 0
31 36 1 6 0 0 0
32 33 1 0 0 0 0
32 37 1 1 0 0 0
33 38 1 0 0 0 0
33 34 1 6 0 0 0
34 39 1 0 0 0 0
40 49 1 0 0 0 0
40 41 1 0 0 0 0
40 39 1 1 0 0 0
41 46 1 1 0 0 0
41 42 1 0 0 0 0
42 43 1 0 0 0 0
42 47 1 6 0 0 0
43 48 1 1 0 0 0
43 44 1 0 0 0 0
44 49 1 0 0 0 0
44 45 1 6 0 0 0
45 50 1 0 0 0 0
51 47 1 6 0 0 0
51 60 1 0 0 0 0
51 52 1 0 0 0 0
52 53 1 0 0 0 0
52 57 1 6 0 0 0
53 54 1 0 0 0 0
53 58 1 6 0 0 0
54 59 1 6 0 0 0
54 55 1 0 0 0 0
55 56 1 6 0 0 0
55 60 1 0 0 0 0
56 61 1 0 0 0 0
62 71 1 0 0 0 0
62 36 1 1 0 0 0
62 63 1 0 0 0 0
63 68 1 1 0 0 0
63 64 1 0 0 0 0
64 69 1 6 0 0 0
64 65 1 0 0 0 0
65 70 1 1 0 0 0
65 66 1 0 0 0 0
66 67 1 1 0 0 0
66 71 1 0 0 0 0
67 72 1 0 0 0 0
M END
$$$$
19 changes: 19 additions & 0 deletions apps/protein_folding/helixfold3/data/demo_7s69_coval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"entities": [
{
"type": "protein",
"sequence": "DRHHHHHHKLGKMKIVEEPNSFGLNNPFLSQTNKLQPRVQPSPVSGPSHLFRLAGKCFNLVESTYKYELCPFHNVTQHEQTFRWNAYSGILGIWQEWDIENNTFSGMWMREGDSCGNKNRQTKVLLVCGKANKLSSVSEPSTCLYSLTFETPLVCHPHSLLVYPTLSEGLQEKWNEAEQALYDELITEQGHGKILKEIFREAGYLKTTKPDGEGKETQDKPKEFDSLEKCNKGYTELTSEIQRLKKMLNEHGISYVTNGTSRSEGQPAEVNTTFARGEDKVHLRGDTGIRDGQ",
"count": 1
},
{
"type": "ligand",
"sdf": "/repo/PaddleHelix/apps/protein_folding/helixfold3/data/7s69_glycan.sdf",
"count": 1
},
{
"type": "bond",
"bond": "A,ASN,74,ND2,B,UNK1,1,C,covale,2.3",
"_comment": "'A,74,ND2:B,1:CW,null' from RF2AA"
}
]
}
121 changes: 111 additions & 10 deletions apps/protein_folding/helixfold3/helixfold/data/pipeline_conf_bonds.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
"""Functions for building the input features (reference ccd features) for the HelixFold model."""

import collections
from dataclasses import dataclass
import gzip
import os
import pickle
from typing import Any, Optional
import re
from typing import Any, List, Literal, Optional, Tuple

from absl import logging
from immutabledict import immutabledict
from helixfold.common import residue_constants
import numpy as np
from openbabel import openbabel

from helixfold.common import residue_constants
from helixfold.data.tools import utils



ALLOWED_LIGAND_BONDS_TYPE = {
"SING": 1,
"DOUB": 2,
Expand All @@ -22,6 +26,98 @@
"AROM": 12,
}

# Closed set of bond-type labels used to annotate CovalentBond.bond_type.
# This is a static typing aid only: Literal is not checked at runtime, so a
# string outside this set is not rejected by the annotation itself.
BondType = Literal["covale", "metal", "hydrogen", "ionic", "disulfide", "aromatic"]


@dataclass
class AtomPartner:
    """One end of a bond: a single atom addressed by chain, residue and name.

    All four fields are kept as plain strings. Positional construction
    follows the declaration order below:
    ``AtomPartner(label_asym_id, label_comp_id, seq_id, label_atom_id)``.
    """

    # Chain identifier of the atom (mmCIF ``label_asym_id``).
    label_asym_id: str
    # Residue / component name of the atom (mmCIF ``label_comp_id``).
    label_comp_id: str
    # Residue position; a single value standing in for both the
    # ``label_seq_id`` and the ``auth_seq_id`` of the partner.
    seq_id: str
    # Atom name within the residue (mmCIF ``label_atom_id``).
    label_atom_id: str


@dataclass
class CovalentBond:
    """A bond record linking two partner atoms.

    Positional construction follows the declaration order below:
    ``CovalentBond(atom_1, atom_2, bond_type, pdbx_dist_value)``.
    """

    # First partner atom of the bond.
    atom_1: AtomPartner
    # Second partner atom of the bond.
    atom_2: AtomPartner
    # Bond-type label, e.g. "covale"; annotated with the BondType literal set.
    bond_type: BondType
    # Bond distance as given by the PDBx/mmCIF ``pdbx_dist_value`` item.
    # NOTE(review): presumably in angstroms - confirm against the data source.
    pdbx_dist_value: float

def parse_covalent_bond_input(input_string: str) -> List[CovalentBond]:
    """Parse a human-readable bond specification into CovalentBond objects.

    Each bond is written as ten comma-separated fields, and several bonds
    are separated by semicolons. The field order is::

        chain_1,residue_1,seq_id_1,atom_1,chain_2,residue_2,seq_id_2,atom_2,bond_type,distance

    Example::

        "A,GLY,25,CA,A,GLY,25,N,covale,1.32; B,HIS,58,ND1,B,HIS,58,CE1,covale,1.39"

    Whitespace around every field is ignored. Blank bond entries (an empty
    or whitespace-only input string, or the empty piece left behind by a
    leading, trailing or doubled semicolon) are skipped rather than being
    reported as malformed bonds.

    Args:
        input_string: The bond specification text described above.

    Returns:
        One CovalentBond per non-blank bond entry, in input order. The list
        is empty when the input contains no bond entries.

    Raises:
        ValueError: If a non-blank bond entry does not have exactly ten
            fields, or if its distance field is not a valid float.
    """
    covalent_bonds = []

    for bond_str in input_string.split(';'):
        # "".split(';') yields [''] and "a;b;" yields a trailing '', so blank
        # pieces are expected here and must not be treated as broken bonds.
        if not bond_str.strip():
            continue

        bond_parts = [part.strip() for part in bond_str.split(',')]
        if len(bond_parts) != 10:
            raise ValueError(f"Invalid bond format: {bond_str}. Expected 10 fields per bond.")

        # Convert the distance up front so that a bad number is reported
        # together with the bond entry it came from.
        try:
            dist_value = float(bond_parts[9])
        except ValueError as err:
            raise ValueError(
                f"Invalid bond format: {bond_str}. "
                f"Distance {bond_parts[9]!r} is not a number."
            ) from err

        # Fields 0-3 describe the first partner atom, fields 4-7 the second.
        atom_1 = AtomPartner(
            label_asym_id=bond_parts[0],
            label_comp_id=bond_parts[1],
            seq_id=bond_parts[2],
            label_atom_id=bond_parts[3],
        )
        atom_2 = AtomPartner(
            label_asym_id=bond_parts[4],
            label_comp_id=bond_parts[5],
            seq_id=bond_parts[6],
            label_atom_id=bond_parts[7],
        )

        covalent_bonds.append(
            CovalentBond(
                atom_1=atom_1,
                atom_2=atom_2,
                bond_type=bond_parts[8],
                pdbx_dist_value=dist_value,
            )
        )

    return covalent_bonds

def load_ccd_dict(ccd_preprocessed_path: str) -> immutabledict[str, Any]:
if not os.path.exists(ccd_preprocessed_path):
raise FileNotFoundError(f'[CCD] ccd_preprocessed_path: {ccd_preprocessed_path} not exist.')
Expand Down Expand Up @@ -151,7 +247,7 @@ def make_ccd_conf_features(all_chain_info, ccd_preprocessed_dict,
return features


def make_bond_features(covalent_bond, all_chain_info, ccd_preprocessed_dict,
def make_bond_features(covalent_bond: List[CovalentBond], all_chain_info, ccd_preprocessed_dict,
extra_feats: Optional[dict]=None):
"""
all_chain_info: dict, (chain_type_chain_id): ccd_seq (list of ccd), such as: protein_A: ['ALA', 'MET', 'GLY']
Expand All @@ -172,24 +268,29 @@ def make_bond_features(covalent_bond, all_chain_info, ccd_preprocessed_dict,
_set_chain_id_list = set(chain_id_list)
parsed_covalent_bond = []
for _bond in covalent_bond:
left_bond_atomid, right_bond_atomid = _bond['ptnr1_label_atom_id'], _bond['ptnr2_label_atom_id']
left_bond_name, right_bond_name = _bond['ptnr1_label_comp_id'], _bond['ptnr2_label_comp_id']
left_bond, right_bond = _bond['ptnr1_label_asym_id'], _bond['ptnr2_label_asym_id']
# Accessing the AtomPartner attributes for both atoms in the covalent bond
left_bond_atomid, right_bond_atomid = _bond.atom_1.label_atom_id, _bond.atom_2.label_atom_id
left_bond_name, right_bond_name = _bond.atom_1.label_comp_id, _bond.atom_2.label_comp_id
left_bond, right_bond = _bond.atom_1.label_asym_id, _bond.atom_2.label_asym_id

left_bond_idx, right_bond_idx = _bond['ptnr1_label_seq_id'], _bond['ptnr2_label_seq_id']
auth_left_idx, auth_right_idx = _bond['ptnr1_auth_seq_id'], _bond['ptnr2_auth_seq_id']
left_bond_idx, right_bond_idx = _bond.atom_1.seq_id, _bond.atom_2.seq_id
auth_left_idx, auth_right_idx = _bond.atom_1.seq_id, _bond.atom_2.seq_id

left_bond_idx = 1 if left_bond_idx == '.' else left_bond_idx
right_bond_idx = 1 if right_bond_idx == '.' else right_bond_idx

if _bond['bond_type'] != "covale":
if _bond.bond_type != "covale":
logging.warning(f'Ignore non-covale bond type: {_bond.bond_type}')
continue

if _bond['pdbx_dist_value'] > 2.4:
if _bond.pdbx_dist_value > 2.4:
# the covalent_bond is cut off by distance=2.4
logging.warning(f'Ignore bonding with distance > 2.4: {_bond.pdbx_dist_value}')
continue

## When some chainID is filtered, bond need to be filtered too.
if (left_bond not in _set_chain_id_list) or (right_bond not in _set_chain_id_list):
logging.warning(f'Ignore bonding with left and right out of chain list: ')
continue

parsed_covalent_bond.append([left_bond, left_bond_name, left_bond_idx, left_bond_atomid, auth_left_idx,
Expand Down
Loading

0 comments on commit c39a3e4

Please sign in to comment.