Skip to content

Commit

Permalink
Merge pull request #69 from UCLOrengoGroup/make-chopping-work-with-pd…
Browse files Browse the repository at this point in the history
…breslabel

Make choppings work with (non-numeric, non-sequential) pdb residue labels
  • Loading branch information
sillitoe authored Feb 13, 2024
2 parents 0a4a2ae + c9a0ddb commit 37ef75f
Show file tree
Hide file tree
Showing 10 changed files with 282 additions and 114 deletions.
11 changes: 6 additions & 5 deletions cath_alphaflow/chopping.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from Bio.PDB.mmcifio import MMCIFIO
from Bio.PDB import PDBIO

from .models.domains import Chopping
from .models.domains import ChoppingSeqres, ChoppingPdbResLabel
from .errors import ChoppingError, MultipleModelsError, MultipleChainsError, ParseError

LOG = logging.getLogger(__name__)
Expand All @@ -32,7 +32,7 @@ def chop_cif(
domain_id: str,
chain_cif_path: Path,
domain_cif_path: Path,
chopping: Chopping,
chopping: Union[ChoppingPdbResLabel, ChoppingSeqres],
map_to_pdb_resid: Callable = default_map_to_pdb_resid,
):
"""
Expand Down Expand Up @@ -61,7 +61,7 @@ def chop_structure(
domain_id: str,
chain_path: Path,
domain_path: Path,
chopping: Chopping,
chopping: Union[ChoppingPdbResLabel, ChoppingSeqres],
map_to_pdb_resid: Callable = default_map_to_pdb_resid,
file_type: StructureFileType = StructureFileType.PDB,
):
Expand Down Expand Up @@ -181,10 +181,11 @@ class ChoppingProcessor:

def __init__(
self,
chopping: Union[str, Chopping],
chopping: Union[str, ChoppingPdbResLabel, ChoppingSeqres],
on_segment_residue: Callable,
*,
map_to_pdb_resid: Callable = default_map_to_pdb_resid,
chopping_class=ChoppingPdbResLabel,
):
"""
Args:
Expand All @@ -194,7 +195,7 @@ def __init__(
"""

if isinstance(chopping, str):
chopping = Chopping.from_str(chopping)
chopping = chopping_class.from_str(chopping)
self.chopping = chopping
self.map_to_pdb_resid = map_to_pdb_resid
self.on_segment_residue = on_segment_residue
Expand Down
4 changes: 2 additions & 2 deletions cath_alphaflow/commands/create_dataset_cath_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from cath_alphaflow.io_utils import get_csv_dictwriter
from cath_alphaflow.io_utils import chunked_iterable
from cath_alphaflow.db_utils import OraDB
from cath_alphaflow.models.domains import Chopping, AFChainID, AFDomainID
from cath_alphaflow.models.domains import ChoppingSeqres, AFChainID, AFDomainID
from cath_alphaflow.settings import DEFAULT_AF_VERSION, DEFAULT_AF_FRAGMENT
from cath_alphaflow.predicted_domain_provider import (
Gene3DCrhPredictedCathDomainProvider,
Expand Down Expand Up @@ -236,7 +236,7 @@ def process_uniprot_ids(self, uniprot_ids):
uniprot_acc=entry.uniprot_acc,
fragment_number=self.fragment_number,
version=self.af_version,
chopping=Chopping.from_str(chopping_str=chopping),
chopping=ChoppingSeqres.from_str(chopping_str=chopping),
).to_str()

af_chain_id = AFChainID(
Expand Down
20 changes: 10 additions & 10 deletions cath_alphaflow/commands/measure_globularity.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
from cath_alphaflow.io_utils import get_csv_dictwriter
from cath_alphaflow.io_utils import get_pdb_structure
from cath_alphaflow.models.domains import GeneralDomainID
from cath_alphaflow.models.domains import Chopping
from cath_alphaflow.models.domains import Segment
from cath_alphaflow.models.domains import ChoppingPdbResLabel
from cath_alphaflow.models.domains import SegmentStr

from cath_alphaflow.constants import DEFAULT_GLOB_DISTANCE, DEFAULT_GLOB_VOLUME

Expand Down Expand Up @@ -120,7 +120,7 @@ def measure_globularity(
)

for domain_id in general_domain_provider:
LOG.debug(f"Working on: {domain_id} ...")
LOG.info(f"Working on: {domain_id} ...")

model_structure = get_pdb_structure(
domain_id, pdb_dir, chains_are_gzipped, pdb_suffix=pdb_suffix
Expand All @@ -133,7 +133,7 @@ def measure_globularity(
domain_normed_radius_gyration = calculate_normed_radius_of_gyration(
domain_id, model_structure, volume_resolution
)
LOG.debug(
LOG.info(
f"Processed entry: {domain_id}\tpacking_density:{domain_packing_density}\tgyration:{domain_normed_radius_gyration}"
)
globularity_writer.writerow(
Expand Down Expand Up @@ -283,7 +283,7 @@ def calculate_normed_radius_of_gyration(

def guess_chopping_from_pdb_file(
pdbfile, target_chain_id=None, assume_all_atom_breaks_are_segments=True
) -> Chopping:
) -> ChoppingPdbResLabel:
"""
Reverse engineer a `Chopping` object from a PDB file
Expand Down Expand Up @@ -330,17 +330,17 @@ def guess_chopping_from_pdb_file(

if assume_all_atom_breaks_are_segments:
if res_label != last_res_label and atom_num != last_atom_num + 1:
segments.append(Segment(start=start_res, end=last_res_label))
segments.append(SegmentStr(start=start_res, end=last_res_label))
start_res = None
end_res = None

last_res_label = res_label
last_atom_num = atom_num

if start_res and end_res:
segments.append(Segment(start=start_res, end=end_res))
segments.append(SegmentStr(start=start_res, end=end_res))

return Chopping(segments=segments)
return ChoppingPdbResLabel(segments=segments)


def yield_domain_from_pdbdir(pdbdir, pdb_suffix=".pdb") -> GeneralDomainID:
Expand Down Expand Up @@ -375,7 +375,7 @@ def yield_domain_from_consensus_domain_list(consensus_domain_list) -> GeneralDom
for domain_row in consensus_domain_list_reader:
domain = GeneralDomainID(
raw_id=domain_row["domain_id"],
chopping=Chopping.from_str(domain_row["chopping"]),
chopping=ChoppingPdbResLabel.from_str(domain_row["chopping"]),
)
yield domain

Expand Down Expand Up @@ -425,6 +425,6 @@ def yield_domain_from_chainsaw_domain_list_csv(
domain = GeneralDomainID(
# raw_id=domain_id,
raw_id=chain_id,
chopping=Chopping.from_str(domain_row["chopping"]),
chopping=ChoppingPdbResLabel.from_str(domain_row["chopping"]),
)
yield domain
96 changes: 61 additions & 35 deletions cath_alphaflow/commands/optimise_domain_boundaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@

from cath_alphaflow.io_utils import get_af_domain_id_reader
from cath_alphaflow.io_utils import get_csv_dictwriter
from cath_alphaflow.models.domains import AFDomainID, Segment, Chopping
from cath_alphaflow.models.domains import AFDomainID, SegmentStr, ChoppingPdbResLabel
from cath_alphaflow.errors import NoMatchingResiduesError
from cath_alphaflow.io_utils import get_status_log_dictwriter
from cath_alphaflow.constants import STATUS_LOG_SUCCESS,STATUS_LOG_FAIL
from cath_alphaflow.constants import STATUS_LOG_SUCCESS, STATUS_LOG_FAIL
from cath_alphaflow.seq_utils import get_local_plddt_for_res

LOG = logging.getLogger()
Expand Down Expand Up @@ -48,7 +48,7 @@
"--gzipped_af_chains",
type=bool,
default=False,
help="Set True if AF-chain files are stored in .gz files"
help="Set True if AF-chain files are stored in .gz files",
)
@click.option(
"--af_domain_mapping_post_tailchop",
Expand All @@ -68,7 +68,7 @@
"status_log_file",
type=click.File("wt"),
required=True,
help="Log file recording if domains have been optimised or reason for skipping"
help="Log file recording if domains have been optimised or reason for skipping",
)
def optimise_domain_boundaries(
af_domain_list,
Expand Down Expand Up @@ -108,18 +108,21 @@ def optimise_domain_boundaries(
af_domain_id_post_tailchop = calculate_domain_id_post_tailchop(
af_domain_id, af_chain_mmcif_dir, cutoff_plddt_score, gzipped_af_chains
)

if af_domain_id == af_domain_id_post_tailchop:
description = f"boundaries unchanged"
else:
description = f"adjusted boundaries from {af_domain_id} to {af_domain_id_post_tailchop}"
write_status_log(status_log, af_domain_id, STATUS_LOG_SUCCESS, None, description)

write_status_log(
status_log, af_domain_id, STATUS_LOG_SUCCESS, None, description
)

except NoMatchingResiduesError:
description = 'boundaries not adjusted due to low pLDDT'
description = "boundaries not adjusted due to low pLDDT"
af_domain_id_post_tailchop = af_domain_id
write_status_log(status_log,af_domain_id,STATUS_LOG_SUCCESS,None,description)

write_status_log(
status_log, af_domain_id, STATUS_LOG_SUCCESS, None, description
)

af_domain_list_post_tailchop_writer.writerow(
{"af_domain_id": af_domain_id_post_tailchop}
Expand All @@ -134,24 +137,47 @@ def optimise_domain_boundaries(

click.echo("DONE")


def get_residues_from_range(residues, start_res: str, end_res: str):
chopping = ChoppingPdbResLabel(
segments=[SegmentStr(start=str(start_res), end=str(end_res))]
)
selected_residues = chopping.filter_bio_residues(residues)
return selected_residues


def biopdb_residue_to_str(res):
res_num = res.id[1]
ins_code = res.id[2]
if ins_code == " ":
ins_code = ""
return f"{res_num}{ins_code}"


def cut_segment(
structure, segment_to_cut: Segment, cutoff_plddt_score, cut_start, cut_end
) -> Segment:
structure, segment_to_cut: SegmentStr, cutoff_plddt_score, cut_start, cut_end
) -> SegmentStr:
# Cuts one boundary based on the cutoff_plddt_score and returns the reduced boundary

new_boundary_lower_value = boundary_lower_value = segment_to_cut.start
new_boundary_higher_value = boundary_higher_value = segment_to_cut.end

exception_info = (
f"(structure: {structure}, start:{boundary_lower_value}"
f", end:{boundary_higher_value}, cutoff:{cutoff_plddt_score})"
)
all_residues = structure.get_residues()

bounded_residues = get_residues_from_range(
all_residues, boundary_lower_value, boundary_higher_value
)

if cut_start:
for res in range(boundary_lower_value, boundary_higher_value):
for res in bounded_residues:
local_plddt = get_local_plddt_for_res(structure, res)
if local_plddt > cutoff_plddt_score:
break
new_boundary_lower_value = res
new_boundary_lower_value = biopdb_residue_to_str(res)
else:
# this only runs if we haven't already broken out of the loop
msg = (
Expand All @@ -160,12 +186,12 @@ def cut_segment(
raise NoMatchingResiduesError(msg)

if cut_end:
for res in range(boundary_higher_value, boundary_lower_value, -1):
for res in bounded_residues[::-1]: # use slice step to reverse through residues
local_plddt = get_local_plddt_for_res(structure, res)
if local_plddt > cutoff_plddt_score:
break

new_boundary_higher_value = res
new_boundary_higher_value = biopdb_residue_to_str(res)
else:
# this only runs if we haven't already broken out of the loop
msg = f"failed to find residues over plddt cutoff from end {exception_info}"
Expand All @@ -175,16 +201,16 @@ def cut_segment(
msg = f"matching new boundary start/end ({new_boundary_higher_value}) {exception_info}"
raise NoMatchingResiduesError(msg)

new_segment = Segment(
start=new_boundary_lower_value, end=new_boundary_higher_value
new_segment = SegmentStr(
start=str(new_boundary_lower_value), end=str(new_boundary_higher_value)
)

return new_segment


def cut_chopping_start(
structure, chopping: Chopping, cutoff_plddt_score: float
) -> Chopping:
structure, chopping: ChoppingPdbResLabel, cutoff_plddt_score: float
) -> ChoppingPdbResLabel:
# Cut from the start of the first boundary until a residue has a pLDDT score > cutoff_plddt_score

# take a copy so we don't change existing data
Expand Down Expand Up @@ -212,12 +238,12 @@ def cut_chopping_start(
msg = f"failed to get new start segment with chopping {chopping}"
raise NoMatchingResiduesError(msg)

return Chopping(segments=new_segments)
return chopping.__class__(segments=new_segments)


def cut_chopping_end(
structure, chopping: Chopping, cutoff_plddt_score: float
) -> Chopping:
structure, chopping: ChoppingPdbResLabel, cutoff_plddt_score: float
) -> ChoppingPdbResLabel:
# Cut from the end of the last boundary until a residue has a pLDDT score > cutoff_plddt_score

# take a copy so we don't change existing data
Expand Down Expand Up @@ -245,7 +271,7 @@ def cut_chopping_end(
msg = f"failed to get new end segment with chopping {chopping}"
raise NoMatchingResiduesError(msg)

return Chopping(segments=new_segments)
return chopping.__class__(segments=new_segments)


def calculate_domain_id_post_tailchop(
Expand Down Expand Up @@ -292,7 +318,7 @@ def calculate_domain_id_post_tailchop(
# We do this by first cutting from the start and then from the end.

# make a copy so we don't touch the old chopping
_chopping = old_chopping.deep_copy()
_chopping = old_chopping.deep_copy().as_pdbreslabel()

# adjust the start of the new chopping
_chopping = cut_chopping_start(structure, _chopping, cutoff_plddt_score)
Expand All @@ -304,17 +330,17 @@ def calculate_domain_id_post_tailchop(

# create a new AF domain id with the new chopping
af_domain_id_post_tailchop = af_domain_id.deep_copy()
af_domain_id_post_tailchop.chopping = Chopping(segments=new_segments)
af_domain_id_post_tailchop.chopping = ChoppingPdbResLabel(segments=new_segments)

return af_domain_id_post_tailchop


def write_status_log(status_log, entry_id, status, error, description):
status_log.writerow(
{
"entry_id": entry_id,
"status": status,
"error": error,
"description": description,
}
)

{
"entry_id": entry_id,
"status": status,
"error": error,
"description": description,
}
)
Loading

0 comments on commit 37ef75f

Please sign in to comment.