From 3c05f46b295e80fb2afd67dc3f73a03fe99e5d3e Mon Sep 17 00:00:00 2001 From: kelleherkj Date: Fri, 24 Apr 2026 12:24:35 -0400 Subject: [PATCH] Add BioPlex and Reactome PPI ingests for Pharos --- AGENTS.md | 1 + designs/ppi/bioplex_ppi_ingest_design.md | 99 ++++++++++++++ designs/ppi/reactome_ppi_ingest_design.md | 127 ++++++++++++++++++ designs/ppi/string_ppi_ingest_design.md | 13 +- src/constants.py | 1 + src/input_adapters/bioplex/bioplex_ppi.py | 108 +++++++++++++++ src/input_adapters/pharos_arango/tcrd/ppi.py | 9 +- src/input_adapters/reactome/reactome_ppi.py | 103 ++++++++++++++ src/input_adapters/string/string_ppi.py | 1 - src/interfaces/input_adapter.py | 6 +- src/models/ppi.py | 9 +- src/output_adapters/sql_converters/tcrd.py | 11 +- src/use_cases/pharos/pharos.yaml | 18 +++ src/use_cases/pharos/target_graph.yaml | 18 +++ src/use_cases/working.yaml | 21 +++ tests/test_bioplex_ppi.py | 111 +++++++++++++++ ...test_input_adapter_canonical_edge_remap.py | 28 ++++ tests/test_reactome_ppi.py | 74 ++++++++++ tests/test_string_ppi.py | 2 +- tests/test_tcrd_output_converter.py | 22 +++ workflows/pharos.Snakefile | 30 ++++- 21 files changed, 787 insertions(+), 25 deletions(-) create mode 100644 designs/ppi/bioplex_ppi_ingest_design.md create mode 100644 designs/ppi/reactome_ppi_ingest_design.md create mode 100644 src/input_adapters/bioplex/bioplex_ppi.py create mode 100644 src/input_adapters/reactome/reactome_ppi.py create mode 100644 tests/test_bioplex_ppi.py create mode 100644 tests/test_reactome_ppi.py diff --git a/AGENTS.md b/AGENTS.md index 1656799..0fc2884 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -30,6 +30,7 @@ ## Lessons Learned - Keep adapters focused on source parsing and structural graph emission; move cross-ontology ID normalization to resolvers. +- Raw source adapters should not populate `sources` or `provenance`; let the framework stamp canonical datasource/version metadata during ETL. - For ontology xrefs, maintain an explicit allowlist and perform case-insensitive prefix checks. - When adding new datasource version handling, use named parameters for `DatasourceVersionInfo` to avoid argument-order regressions. - When an edge can be emitted by multiple sources and later merged, keep source-specific payload in a `details` list instead of top-level edge fields. diff --git a/designs/ppi/bioplex_ppi_ingest_design.md b/designs/ppi/bioplex_ppi_ingest_design.md new file mode 100644 index 0000000..65b7817 --- /dev/null +++ b/designs/ppi/bioplex_ppi_ingest_design.md @@ -0,0 +1,99 @@ +# BioPlex PPI Ingest Design + +## Status + +Implemented, validated in the working graph and working MySQL paths, and promoted to `pharos.yaml` / `target_graph.yaml`. + +## Goal + +Add a first-pass BioPlex protein-protein interaction ingest for Pharos. + +## Source Choice + +Use the official undirected BioPlex 3.0 interaction releases from the BioPlex download page: + +- `BioPlex_293T_Network_10K_Dec_2019.tsv` +- `BioPlex_HCT116_Network_5.5K_Dec_2019.tsv` + +Rationale: + +- These are the current official-release network files exposed on the BioPlex site. +- They match the current graph `PPIEdge` model better than the directed bait-prey files. +- They avoid the noisier unfiltered candidate-interaction lists. + +## Source URLs + +- Landing page: `https://bioplex.hms.harvard.edu/interactions.php` +- Data index: `https://bioplex.hms.harvard.edu/data/` +- 293T release: `https://bioplex.hms.harvard.edu/data/BioPlex_293T_Network_10K_Dec_2019.tsv` +- HCT116 release: `https://bioplex.hms.harvard.edu/data/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv` + +## Version Strategy + +- Use BioPlex release label `3.0` as the dataset version. +- Capture per-file `Last-Modified` dates into `input_files/auto/bioplex/bioplex_version.tsv`. +- Let adapter-side `download_date` come from file mtime unless we later decide to persist it explicitly. + +Observed current `/data` filenames include a December 2019 stamp even though the site still presents them as the current BioPlex 3.0 official releases. + +## Observed File Shape + +Current BioPlex 3.0 files have the same shape as the legacy TCRD BioPlex loader expected: + +- `GeneA` +- `GeneB` +- `UniprotA` +- `UniprotB` +- `SymbolA` +- `SymbolB` +- `pW` +- `pNI` +- `pInt` + +Observed counts: + +- `BioPlex_293T_Network_10K_Dec_2019.tsv`: `118,162` rows +- `BioPlex_HCT116_Network_5.5K_Dec_2019.tsv`: `70,966` rows + +Observed payload details: + +- no self-pairs in either file +- gene IDs are numeric Entrez Gene identifiers +- isoform-suffixed UniProt accessions are common +- `UniprotA` can be the literal string `UNKNOWN` + - `293T`: `4,950` rows + - `HCT116`: `1,688` rows +- `UniprotB` did not contain `UNKNOWN` in the profiled files +- `pInt` values range from about `0.75` to `1.0` + +## Implemented Mapping + +Current graph mapping: + +- emit `PPIEdge` +- use UniProt accessions as the primary emitted identifier family for endpoint proteins +- fall back to `NCBIGene` when BioPlex reports `UniprotA='UNKNOWN'` +- preserve BioPlex confidence-style fields into: + - `p_wrong` + - `p_ni` + - `p_int` + +Implementation choice: + +- configure one adapter instance per file so provenance distinguishes `293T` versus `HCT116` via the version string +- do not populate adapter-level `sources`; the ETL framework stamps canonical datasource/version metadata + +## Validation Summary + +Validated outcomes: + +- merged graph edges can carry multiple `p_int` values when the same canonical pair is supported by both BioPlex cell lines +- `UNKNOWN` UniProt rows resolve through `NCBIGene:*` fallback when the reviewed target graph contains the mapped protein +- downstream `ncats_ppi` exports BioPlex rows with scalar `p_int`, `p_ni`, and `p_wrong` using `max(...)` collapse for merged graph lists +- promoted into: + - `src/use_cases/pharos/pharos.yaml` + - `src/use_cases/pharos/target_graph.yaml` + +Open follow-up questions: + +- whether cell-line provenance should eventually be carried in a dedicated edge field instead of only in provenance / sources diff --git a/designs/ppi/reactome_ppi_ingest_design.md b/designs/ppi/reactome_ppi_ingest_design.md new file mode 100644 index 0000000..5653863 --- /dev/null +++ b/designs/ppi/reactome_ppi_ingest_design.md @@ -0,0 +1,127 @@ +# Reactome PPI Ingest Design + +## Status + +Implemented, validated in the working graph and working MySQL paths, and promoted to `pharos.yaml` / `target_graph.yaml`. + +## Goal + +Add a first-pass Reactome-derived protein-protein interaction ingest for Pharos. + +## Source Choice + +Use the official human Reactome tab-delimited interaction file: + +- `reactome.homo_sapiens.interactions.tab-delimited.txt` + +Rationale: + +- This is the current official human interaction export on the Reactome download site. +- It matches the old TCRD loader input format. +- It includes interaction type and context/PMID fields that can map naturally into the current PPI model and TCRD export. + +## Source URLs + +- Download docs: `https://reactome.org/download-data?id=62&ml=1` +- Directory index: `https://reactome.org/download/current/interactors/` +- Human tab-delimited file: `https://reactome.org/download/current/interactors/reactome.homo_sapiens.interactions.tab-delimited.txt` + +## Version Strategy + +- Use the Reactome database version recorded in `input_files/auto/reactome/reactome_version.tsv` +- Use the PPI file `Last-Modified` header as `version_date` +- Let adapter-side `download_date` come from file mtime unless we later decide to persist it explicitly + +## Documented File Shape + +Reactome documents the tab-delimited human interaction file as: + +1. interactor 1 protein ID +2. interactor 1 Ensembl gene ID(s) +3. interactor 1 Entrez Gene ID(s) +4. interactor 2 protein ID +5. interactor 2 Ensembl gene ID(s) +6. interactor 2 Entrez Gene ID(s) +7. interaction type +8. interaction context +9. PubMed IDs + +## Legacy Comparison + +The old UNM TCRD loader used the same human tab-delimited Reactome interaction file and: + +- required both interactors to have UniProt IDs +- populated `interaction_type` +- skipped duplicate interaction rows +- skipped self-pairs + +This should be treated as a comparison point only; current behavior should still be validated against the real file after download. + +## Observed File Profile + +Observed counts from the downloaded file: + +- total rows: `123,895` +- rows where both interactors are UniProt proteins: `83,545` +- filtered non-protein rows: `40,350` +- protein self-pairs: `7,677` +- duplicate unordered protein-pair-plus-type rows: `57,386` +- distinct unordered protein-pair-plus-type combinations: `26,159` + +Observed payload behavior: + +- non-protein rows include identifiers such as `ChEBI:*` +- almost every protein-protein row has PubMed references +- every row has a Reactome context string like `reactome:R-HSA-...` +- current interaction types include values such as: + - `physical association` + - `enzymatic reaction` + - `cleavage reaction` + - `dephosphorylation reaction` + +## Implemented Mapping + +Current first-pass graph mapping: + +- emit `PPIEdge` +- keep only rows where both interactors are UniProt IDs +- skip self-pairs +- canonicalize unordered protein pairs +- dedupe repeated source rows by unordered pair plus interaction type +- preserve: + - `interaction_type` as a graph list field + - `contexts` as a graph list field + - `pmids` as a graph list field +- do not populate adapter-level `sources`; the ETL framework stamps canonical datasource/version metadata + +## Legacy Downstream Comparison + +Direct inspection of `pharos319.ncats_ppi` showed: + +- `StringDB` populated only `score` +- `BioPlex` populated only `p_int`, `p_ni`, and `p_wrong` +- `Reactome` rows left `evidence`, `interaction_type`, `score`, `p_int`, `p_ni`, and `p_wrong` empty + +## Current Downstream Mapping + +Current IFX_ODIN downstream decision: + +- keep Reactome `pmids`, `contexts`, and `interaction_type` in the graph +- map `pmids` to `ncats_ppi.evidence` as pipe-delimited PMIDs +- map `interaction_type` to `ncats_ppi.interaction_type` +- keep `contexts` graph-only for now + +## Validation Summary + +Validated outcomes: + +- Reactome-backed graph edges landed with non-empty `pmids`, `contexts`, and `interaction_type` +- Reactome merged cleanly with both BioPlex and STRING on shared canonical pairs +- downstream `ncats_ppi` rows now carry Reactome PMIDs in `evidence` and Reactome interaction types in `interaction_type` +- promoted into: + - `src/use_cases/pharos/pharos.yaml` + - `src/use_cases/pharos/target_graph.yaml` + +## Open Follow-Ups + +- decide whether Reactome context should eventually have its own dedicated downstream column or lookup table diff --git a/designs/ppi/string_ppi_ingest_design.md b/designs/ppi/string_ppi_ingest_design.md index 097639e..ea89f3f 100644 --- a/designs/ppi/string_ppi_ingest_design.md +++ b/designs/ppi/string_ppi_ingest_design.md @@ -4,8 +4,7 @@ Implemented and validated in the working graph and working MySQL paths. -This first pass covers **STRING human protein-protein interactions** only. -BioPlex and Reactome PPI remain follow-up sources. +STRING, BioPlex, and Reactome PPI are now all implemented for Pharos. ## Scope @@ -15,9 +14,7 @@ Implemented source: Explicitly deferred: -- BioPlex PPI -- Reactome PPI -- source-specific `interaction_type` / `evidence` population +- richer STRING channel-specific fields from `.protein.links.full...` ## Files Added / Changed @@ -102,7 +99,6 @@ Confirmed old IFX_ODIN / Pharos readback behavior: - `PPIEdge` - `start_node`: `Protein(id="ENSEMBL:ENSP...")` - `end_node`: `Protein(id="ENSEMBL:ENSP...")` - - `sources`: STRING provenance list - `score`: list-valued, emitted as `[combined_score]` Implementation choices: @@ -110,6 +106,7 @@ Implementation choices: - `score_cutoff` is an adapter parameter with default `400` - rows below the cutoff are discarded before they enter the graph - self-pairs are discarded before they enter the graph +- adapter does not populate `sources`; the ETL framework stamps canonical datasource/version metadata - `max_rows` is supported for bounded validation runs and counts **kept emitted edges**, not scanned raw lines @@ -237,10 +234,6 @@ pairs before export. - Profile whether STRING `.protein.links.full...` is worth revisiting for richer channel-specific fields -- Add Reactome PPI ingest - - populate `interaction_type` - - decide whether the Reactome evidence/context column should map to `evidence` -- Add BioPlex PPI ingest - Decide whether downstream `ncats_ppi` export should collapse duplicate canonical pairs before reciprocal row generation, or continue to preserve one SQL row pair per graph edge diff --git a/src/constants.py b/src/constants.py index 746005f..63f9177 100755 --- a/src/constants.py +++ b/src/constants.py @@ -18,6 +18,7 @@ class DataSourceName(SimpleEnum): Cellosaurus = "Cellosaurus" Reactome = "Reactome" STRING = "STRING" + BioPlex = "BioPlex" WikiPathways = "WikiPathways" PathwayCommons = "PathwayCommons" CLO = "Cell Line Ontology (CLO)" diff --git a/src/input_adapters/bioplex/bioplex_ppi.py b/src/input_adapters/bioplex/bioplex_ppi.py new file mode 100644 index 0000000..58002b6 --- /dev/null +++ b/src/input_adapters/bioplex/bioplex_ppi.py @@ -0,0 +1,108 @@ +import csv +from datetime import date +from pathlib import Path +from typing import Generator, List, Optional + +from src.constants import DataSourceName, Prefix +from src.input_adapters.flat_file_adapter import FlatFileAdapter +from src.models.datasource_version_info import DatasourceVersionInfo +from src.models.node import EquivalentId +from src.models.ppi import PPIEdge +from src.models.protein import Protein + + +class BioPlexPPIAdapter(FlatFileAdapter): + version_info: DatasourceVersionInfo + + def __init__( + self, + file_path: str, + version_file_path: Optional[str] = None, + max_rows: Optional[int] = None, + ): + FlatFileAdapter.__init__(self, file_path=file_path) + self.max_rows = max_rows + self.version_info = self._load_version_info(version_file_path) + + def _load_version_info(self, version_file_path: Optional[str]) -> DatasourceVersionInfo: + version = None + version_date = None + download_date = self.download_date + if version_file_path: + with open(version_file_path, "r", encoding="utf-8", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + matching_row = None + for row in reader: + if row.get("file") == Path(self.file_path).name: + matching_row = row + break + if matching_row: + version_label = matching_row.get("version") or None + dataset_label = matching_row.get("dataset") or None + if version_label and dataset_label: + cell_line = dataset_label.replace("BioPlex", "", 1).strip().replace(version_label, "", 1).strip() + version = f"{version_label} ({cell_line})" if cell_line else version_label + else: + version = version_label or dataset_label or None + version_date_str = matching_row.get("version_date") or None + download_date_str = matching_row.get("download_date") or None + version_date = date.fromisoformat(version_date_str) if version_date_str else None + download_date = date.fromisoformat(download_date_str) if download_date_str else download_date + return DatasourceVersionInfo( + version=version, + version_date=version_date, + download_date=download_date, + ) + + def get_datasource_name(self) -> DataSourceName: + return DataSourceName.BioPlex + + def get_version(self) -> DatasourceVersionInfo: + return self.version_info + + @staticmethod + def _normalize_value(value: str) -> str: + return value.strip().strip('"') + + @classmethod + def _protein_id(cls, uniprot_id: str, gene_id: str) -> str: + if uniprot_id and uniprot_id != "UNKNOWN": + return EquivalentId(id=uniprot_id, type=Prefix.UniProtKB).id_str() + return EquivalentId(id=gene_id, type=Prefix.NCBIGene).id_str() + + def get_all(self) -> Generator[List[PPIEdge], None, None]: + batch: List[PPIEdge] = [] + kept_rows = 0 + with open(self.file_path, "r", encoding="utf-8", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + for row in reader: + if self.max_rows is not None and kept_rows >= self.max_rows: + break + + gene_a = self._normalize_value(row["GeneA"]) + gene_b = self._normalize_value(row["GeneB"]) + uniprot_a = self._normalize_value(row["UniprotA"]) + uniprot_b = self._normalize_value(row["UniprotB"]) + + if gene_a == gene_b and uniprot_a == uniprot_b: + continue + + protein_a = self._protein_id(uniprot_a, gene_a) + protein_b = self._protein_id(uniprot_b, gene_b) + if protein_a == protein_b: + continue + protein_a, protein_b = sorted((protein_a, protein_b)) + + edge = PPIEdge( + start_node=Protein(id=protein_a), + end_node=Protein(id=protein_b), + p_wrong=[float(self._normalize_value(row["pW"]))], + p_ni=[float(self._normalize_value(row["pNI"]))], + p_int=[float(self._normalize_value(row["pInt"]))], + ) + batch.append(edge) + kept_rows += 1 + if len(batch) >= self.batch_size: + yield batch + batch = [] + yield batch diff --git a/src/input_adapters/pharos_arango/tcrd/ppi.py b/src/input_adapters/pharos_arango/tcrd/ppi.py index 5a86526..5d28597 100644 --- a/src/input_adapters/pharos_arango/tcrd/ppi.py +++ b/src/input_adapters/pharos_arango/tcrd/ppi.py @@ -37,9 +37,12 @@ def get_all(self) -> Generator[List[PPIEdge], None, None]: end_node=Protein(id=row["end_id"]), provenance=row.get("provenance"), sources=row.get("sources") or [], - p_int=row.get("p_int"), - p_ni=row.get("p_ni"), - p_wrong=row.get("p_wrong"), + p_int=row.get("p_int") or [], + p_ni=row.get("p_ni") or [], + p_wrong=row.get("p_wrong") or [], + pmids=row.get("pmids") or [], + contexts=row.get("contexts") or [], + interaction_type=row.get("interaction_type") or [], score=row.get("score") or [], ) for row in rows diff --git a/src/input_adapters/reactome/reactome_ppi.py b/src/input_adapters/reactome/reactome_ppi.py new file mode 100644 index 0000000..a1a3c2f --- /dev/null +++ b/src/input_adapters/reactome/reactome_ppi.py @@ -0,0 +1,103 @@ +import csv +from datetime import date +from collections import OrderedDict +from typing import Generator, List, Optional + +from src.constants import DataSourceName, Prefix +from src.input_adapters.flat_file_adapter import FlatFileAdapter +from src.models.datasource_version_info import DatasourceVersionInfo +from src.models.node import EquivalentId +from src.models.ppi import PPIEdge +from src.models.protein import Protein + + +class ReactomePPIAdapter(FlatFileAdapter): + version_info: DatasourceVersionInfo + + def __init__(self, file_path: str, version_file_path: Optional[str] = None, max_rows: Optional[int] = None): + FlatFileAdapter.__init__(self, file_path=file_path) + self.max_rows = max_rows + self.version_info = self._load_version_info(version_file_path) + + def _load_version_info(self, version_file_path: Optional[str]) -> DatasourceVersionInfo: + version = None + version_date = None + download_date = self.download_date + if version_file_path: + with open(version_file_path, "r", encoding="utf-8", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + first_row = next(reader, None) + if first_row: + version = first_row.get("version") or None + version_date_str = first_row.get("version_date") or None + download_date_str = first_row.get("download_date") or None + version_date = date.fromisoformat(version_date_str) if version_date_str else None + download_date = date.fromisoformat(download_date_str) if download_date_str else download_date + return DatasourceVersionInfo(version=version, version_date=version_date, download_date=download_date) + + def get_datasource_name(self) -> DataSourceName: + return DataSourceName.Reactome + + def get_version(self) -> DatasourceVersionInfo: + return self.version_info + + @staticmethod + def _protein_id(raw_value: str) -> Optional[str]: + value = raw_value.strip() + if not value.startswith("uniprotkb:"): + return None + return EquivalentId(id=value.split(":", 1)[1], type=Prefix.UniProtKB).id_str() + + @staticmethod + def _parse_pmids(raw_value: str) -> List[int]: + if not raw_value: + return [] + pmids = [] + for token in raw_value.replace(",", "|").replace(";", "|").split("|"): + token = token.strip() + if token.isdigit(): + pmids.append(int(token)) + return list(dict.fromkeys(pmids)) + + def get_all(self) -> Generator[List[PPIEdge], None, None]: + edges_by_key = OrderedDict() + kept_rows = 0 + with open(self.file_path, "r", encoding="utf-8", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + for row in reader: + protein1 = self._protein_id(row["# Interactor 1 uniprot id"]) + protein2 = self._protein_id(row["Interactor 2 uniprot id"]) + if protein1 is None or protein2 is None: + continue + if protein1 == protein2: + continue + protein1, protein2 = sorted((protein1, protein2)) + + interaction_type = row["Interaction type"].strip() + key = (protein1, protein2, interaction_type) + if key not in edges_by_key: + if self.max_rows is not None and kept_rows >= self.max_rows: + continue + edges_by_key[key] = PPIEdge( + start_node=Protein(id=protein1), + end_node=Protein(id=protein2), + interaction_type=[interaction_type] if interaction_type else [], + contexts=[], + pmids=[], + ) + kept_rows += 1 + edge = edges_by_key[key] + context = row["Interaction context"].strip() + if context and context not in edge.contexts: + edge.contexts.append(context) + for pmid in self._parse_pmids(row["Pubmed references"].strip()): + if pmid not in edge.pmids: + edge.pmids.append(pmid) + + batch: List[PPIEdge] = [] + for edge in edges_by_key.values(): + batch.append(edge) + if len(batch) >= self.batch_size: + yield batch + batch = [] + yield batch diff --git a/src/input_adapters/string/string_ppi.py b/src/input_adapters/string/string_ppi.py index 9256f4c..bd2eb5a 100644 --- a/src/input_adapters/string/string_ppi.py +++ b/src/input_adapters/string/string_ppi.py @@ -89,7 +89,6 @@ def get_all(self) -> Generator[List[PPIEdge], None, None]: edge = PPIEdge( start_node=Protein(id=f"{Prefix.ENSEMBL}:{protein1}"), end_node=Protein(id=f"{Prefix.ENSEMBL}:{protein2}"), - sources=[self.get_datasource_name().value], score=[score], ) batch.append(edge) diff --git a/src/interfaces/input_adapter.py b/src/interfaces/input_adapter.py index 3af0858..a2af52d 100644 --- a/src/interfaces/input_adapter.py +++ b/src/interfaces/input_adapter.py @@ -16,7 +16,6 @@ class InputAdapter(ABC): def _canonicalize_relationship_class(rel: Relationship, start_node: Node, end_node: Node) -> Relationship: from src.models.disease import GeneDiseaseEdge, ProteinDiseaseEdge from src.models.expression import GeneTissueExpressionEdge, ProteinTissueExpressionEdge - from src.models.gene import Gene from src.models.pathway import GenePathwayEdge, ProteinPathwayEdge from src.models.protein import Protein from src.models.tissue import Tissue @@ -85,8 +84,9 @@ def get_and_delete_old_id(node): version_info = self.get_version() version_data = [self.get_datasource_name(), version_info.version, version_info.version_date, version_info.download_date] version_string = '\t'.join([str(e) for e in version_data]) - entry.provenance = version_string - if self.get_datasource_name() != DataSourceName.PostProcessing: + if not getattr(entry, 'provenance', None): + entry.provenance = version_string + if self.get_datasource_name() != DataSourceName.PostProcessing and not getattr(entry, 'sources', None): entry.sources = [version_string] nodes = [e for e in entries if isinstance(e, Node)] diff --git a/src/models/ppi.py b/src/models/ppi.py index 4ec3d32..e1f8683 100644 --- a/src/models/ppi.py +++ b/src/models/ppi.py @@ -10,7 +10,10 @@ class PPIEdge(Relationship): start_node: Protein end_node: Protein sources: List[str] = field(default_factory=list) - p_int: float = None - p_ni: float = None - p_wrong: float = None + p_int: List[float] = field(default_factory=list) + p_ni: List[float] = field(default_factory=list) + p_wrong: List[float] = field(default_factory=list) + pmids: List[int] = field(default_factory=list) + contexts: List[str] = field(default_factory=list) + interaction_type: List[str] = field(default_factory=list) score: List[float] = field(default_factory=list) diff --git a/src/output_adapters/sql_converters/tcrd.py b/src/output_adapters/sql_converters/tcrd.py index cc1d692..efda18d 100755 --- a/src/output_adapters/sql_converters/tcrd.py +++ b/src/output_adapters/sql_converters/tcrd.py @@ -637,6 +637,13 @@ def _max_or_value(value): return max(value) if value else None return value + @staticmethod + def _join_list(value): + if isinstance(value, list): + normalized = [str(v) for v in value if v is not None and str(v) != ""] + return "|".join(dict.fromkeys(normalized)) if normalized else None + return value + def ppi_converter(self, obj: dict) -> List[mysqlPPI]: protein_id = self.resolve_id('protein', obj['start_id']) other_id = self.resolve_id('protein', obj['end_id']) @@ -652,8 +659,8 @@ def ppi_converter(self, obj: dict) -> List[mysqlPPI]: p_int=self._max_or_value(obj.get('p_int')), p_ni=self._max_or_value(obj.get('p_ni')), p_wrong=self._max_or_value(obj.get('p_wrong')), - evidence=obj.get('evidence'), - interaction_type=obj.get('interaction_type'), + evidence=self._join_list(obj.get('pmids')) or obj.get('evidence'), + interaction_type=self._join_list(obj.get('interaction_type')), score=self._max_or_value(obj.get('score')), ) diff --git a/src/use_cases/pharos/pharos.yaml b/src/use_cases/pharos/pharos.yaml index 1b1f768..ac017f4 100644 --- a/src/use_cases/pharos/pharos.yaml +++ b/src/use_cases/pharos/pharos.yaml @@ -145,6 +145,24 @@ input_adapters: version_file_path: ./input_files/auto/string/string_version.tsv score_cutoff: 400 + - import: ./src/input_adapters/bioplex/bioplex_ppi.py + class: BioPlexPPIAdapter + kwargs: + file_path: ./input_files/auto/bioplex/BioPlex_293T_Network_10K_Dec_2019.tsv + version_file_path: ./input_files/auto/bioplex/bioplex_version.tsv + + - import: ./src/input_adapters/bioplex/bioplex_ppi.py + class: BioPlexPPIAdapter + kwargs: + file_path: ./input_files/auto/bioplex/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv + version_file_path: ./input_files/auto/bioplex/bioplex_version.tsv + + - import: ./src/input_adapters/reactome/reactome_ppi.py + class: ReactomePPIAdapter + kwargs: + file_path: ./input_files/auto/reactome/reactome.homo_sapiens.interactions.tab-delimited.txt + version_file_path: ./input_files/auto/reactome/reactome_version.tsv + - import: ./src/input_adapters/target_graph/generif_node.py class: GeneRifNodeAdapter kwargs: diff --git a/src/use_cases/pharos/target_graph.yaml b/src/use_cases/pharos/target_graph.yaml index 872caff..8983319 100644 --- a/src/use_cases/pharos/target_graph.yaml +++ b/src/use_cases/pharos/target_graph.yaml @@ -157,6 +157,24 @@ input_adapters: version_file_path: ./input_files/auto/string/string_version.tsv score_cutoff: 400 + - import: ./src/input_adapters/bioplex/bioplex_ppi.py + class: BioPlexPPIAdapter + kwargs: + file_path: ./input_files/auto/bioplex/BioPlex_293T_Network_10K_Dec_2019.tsv + version_file_path: ./input_files/auto/bioplex/bioplex_version.tsv + + - import: ./src/input_adapters/bioplex/bioplex_ppi.py + class: BioPlexPPIAdapter + kwargs: + file_path: ./input_files/auto/bioplex/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv + version_file_path: ./input_files/auto/bioplex/bioplex_version.tsv + + - import: ./src/input_adapters/reactome/reactome_ppi.py + class: ReactomePPIAdapter + kwargs: + file_path: ./input_files/auto/reactome/reactome.homo_sapiens.interactions.tab-delimited.txt + version_file_path: ./input_files/auto/reactome/reactome_version.tsv + - import: ./src/input_adapters/target_graph/protein_nodes_and_edges.py class: IsoformProteinEdgeAdapter kwargs: diff --git a/src/use_cases/working.yaml b/src/use_cases/working.yaml index 1d38496..8043215 100644 --- a/src/use_cases/working.yaml +++ b/src/use_cases/working.yaml @@ -42,6 +42,27 @@ input_adapters: score_cutoff: 400 max_rows: 10000 + - import: ./src/input_adapters/bioplex/bioplex_ppi.py + class: BioPlexPPIAdapter + kwargs: + file_path: ./input_files/auto/bioplex/BioPlex_293T_Network_10K_Dec_2019.tsv + version_file_path: ./input_files/auto/bioplex/bioplex_version.tsv + max_rows: 10000 + + - import: ./src/input_adapters/bioplex/bioplex_ppi.py + class: BioPlexPPIAdapter + kwargs: + file_path: ./input_files/auto/bioplex/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv + version_file_path: ./input_files/auto/bioplex/bioplex_version.tsv + max_rows: 10000 + + - import: ./src/input_adapters/reactome/reactome_ppi.py + class: ReactomePPIAdapter + kwargs: + file_path: ./input_files/auto/reactome/reactome.homo_sapiens.interactions.tab-delimited.txt + version_file_path: ./input_files/auto/reactome/reactome_version.tsv + max_rows: 10000 + output_adapters: - import: ./src/output_adapters/arango_output_adapter.py class: ArangoOutputAdapter diff --git a/tests/test_bioplex_ppi.py b/tests/test_bioplex_ppi.py new file mode 100644 index 0000000..6602ad3 --- /dev/null +++ b/tests/test_bioplex_ppi.py @@ -0,0 +1,111 @@ +from src.input_adapters.bioplex.bioplex_ppi import BioPlexPPIAdapter + + +def test_bioplex_adapter_preserves_probability_fields_and_canonicalizes_direction(tmp_path): + data_path = tmp_path / "BioPlex_293T_Network_10K_Dec_2019.tsv" + version_path = tmp_path / "bioplex_version.tsv" + + data_path.write_text( + "\n".join( + [ + '"GeneA"\t"GeneB"\t"UniprotA"\t"UniprotB"\t"SymbolA"\t"SymbolB"\t"pW"\t"pNI"\t"pInt"', + '"2"\t"1"\t"Q8N7W2-2"\t"P00813"\t"BEND7"\t"ADA"\t"0.01"\t"0.02"\t"0.97"', + ] + ), + encoding="utf-8", + ) + version_path.write_text( + "\n".join( + [ + "dataset\tfile\tversion\tversion_date", + "BioPlex 3.0 293T\tBioPlex_293T_Network_10K_Dec_2019.tsv\t3.0\t2024-01-19", + ] + ), + encoding="utf-8", + ) + + adapter = BioPlexPPIAdapter( + file_path=str(data_path), + version_file_path=str(version_path), + ) + + batches = list(adapter.get_all()) + edges = [edge for batch in batches for edge in batch] + + assert len(edges) == 1 + assert edges[0].start_node.id == "UniProtKB:P00813" + assert edges[0].end_node.id == "UniProtKB:Q8N7W2-2" + assert edges[0].p_wrong == [0.01] + assert edges[0].p_ni == [0.02] + assert edges[0].p_int == [0.97] + assert edges[0].sources == [] + + version = adapter.get_version() + assert version.version == "3.0 (293T)" + assert version.version_date.isoformat() == "2024-01-19" + + +def test_bioplex_adapter_falls_back_to_ncbi_gene_for_unknown_uniprot(tmp_path): + data_path = tmp_path / "BioPlex_HCT116_Network_5.5K_Dec_2019.tsv" + version_path = tmp_path / "bioplex_version.tsv" + + data_path.write_text( + "\n".join( + [ + '"GeneA"\t"GeneB"\t"UniprotA"\t"UniprotB"\t"SymbolA"\t"SymbolB"\t"pW"\t"pNI"\t"pInt"', + '"3012"\t"4673"\t"UNKNOWN"\t"P55209"\t"HIST1H2AE"\t"NAP1L1"\t"0.001"\t"0.002"\t"0.997"', + ] + ), + encoding="utf-8", + ) + version_path.write_text( + "\n".join( + [ + "dataset\tfile\tversion\tversion_date", + "BioPlex 3.0 HCT116\tBioPlex_HCT116_Network_5.5K_Dec_2019.tsv\t3.0\t2024-01-19", + ] + ), + encoding="utf-8", + ) + + adapter = BioPlexPPIAdapter( + file_path=str(data_path), + version_file_path=str(version_path), + ) + + batches = list(adapter.get_all()) + edges = [edge for batch in batches for edge in batch] + + assert len(edges) == 1 + assert {edges[0].start_node.id, edges[0].end_node.id} == {"NCBIGene:3012", "UniProtKB:P55209"} + + +def test_bioplex_adapter_honors_max_rows_on_kept_edges(tmp_path): + data_path = tmp_path / "BioPlex_293T_Network_10K_Dec_2019.tsv" + + data_path.write_text( + "\n".join( + [ + '"GeneA"\t"GeneB"\t"UniprotA"\t"UniprotB"\t"SymbolA"\t"SymbolB"\t"pW"\t"pNI"\t"pInt"', + '"1"\t"1"\t"P11142"\t"P11142"\t"HSPA8"\t"HSPA8"\t"0.1"\t"0.2"\t"0.7"', + '"1"\t"2"\t"P11142"\t"P00813"\t"HSPA8"\t"ADA"\t"0.01"\t"0.02"\t"0.97"', + '"3"\t"4"\t"Q9Y3U8"\t"P36578"\t"RPL36"\t"RPL4"\t"0.03"\t"0.04"\t"0.93"', + '"5"\t"6"\t"P26373"\t"Q09028-3"\t"RPL13"\t"RBBP4"\t"0.05"\t"0.06"\t"0.89"', + ] + ), + encoding="utf-8", + ) + + adapter = BioPlexPPIAdapter( + file_path=str(data_path), + max_rows=2, + ) + + batches = list(adapter.get_all()) + edges = [edge for batch in batches for edge in batch] + + assert len(edges) == 2 + assert [(edge.start_node.id, edge.end_node.id) for edge in edges] == [ + ("UniProtKB:P00813", "UniProtKB:P11142"), + ("UniProtKB:P36578", "UniProtKB:Q9Y3U8"), + ] diff --git a/tests/test_input_adapter_canonical_edge_remap.py b/tests/test_input_adapter_canonical_edge_remap.py index d7bf1c1..10ff25a 100644 --- a/tests/test_input_adapter_canonical_edge_remap.py +++ b/tests/test_input_adapter_canonical_edge_remap.py @@ -145,3 +145,31 @@ def test_input_adapter_keeps_measured_protein_edge_start_node_typed_as_measured_ assert rel.start_node.id == "UniProtKB:P12345" assert isinstance(rel.end_node, Protein) assert rel.end_node.id == "UniProtKB:P12345" + + +def test_input_adapter_preserves_existing_edge_sources_and_provenance(): + edge = ProteinPathwayEdge( + start_node=Protein(id="UniProtKB:P1"), + end_node=Pathway(id="Reactome:R-HSA-1"), + source="Reactome", + ) + edge.sources = ["BioPlex\t3.0 (293T)\t2024-01-19\t2026-04-24", "Reactome\t96\t2026-03-24\t2026-04-24"] + edge.provenance = "BioPlex\t3.0 (293T)\t2024-01-19\t2026-04-24" + adapter = _SingleBatchAdapter([edge]) + + batches = list(adapter.get_resolved_and_provenanced_list({ + "Protein": _IdentityResolver( + types=["Protein"], + no_match_behavior=NoMatchBehavior.Skip, + multi_match_behavior=MultiMatchBehavior.All, + ), + "Pathway": _IdentityResolver( + types=["Pathway"], + no_match_behavior=NoMatchBehavior.Skip, + multi_match_behavior=MultiMatchBehavior.All, + ), + })) + rel = batches[-1][0] + + assert rel.sources == ["BioPlex\t3.0 (293T)\t2024-01-19\t2026-04-24", "Reactome\t96\t2026-03-24\t2026-04-24"] + assert rel.provenance == "BioPlex\t3.0 (293T)\t2024-01-19\t2026-04-24" diff --git a/tests/test_reactome_ppi.py b/tests/test_reactome_ppi.py new file mode 100644 index 0000000..ed855ab --- /dev/null +++ b/tests/test_reactome_ppi.py @@ -0,0 +1,74 @@ +from src.input_adapters.reactome.reactome_ppi import ReactomePPIAdapter + + +def test_reactome_ppi_adapter_filters_non_protein_rows_and_keeps_context_and_pmids(tmp_path): + data_path = tmp_path / "reactome.homo_sapiens.interactions.tab-delimited.txt" + version_path = tmp_path / "reactome_version.tsv" + + data_path.write_text( + "\n".join( + [ + "# Interactor 1 uniprot id\tInteractor 1 Ensembl gene id\tInteractor 1 Entrez Gene id\tInteractor 2 uniprot id\tInteractor 2 Ensembl gene id\tInteractor 2 Entrez Gene id\tInteraction type\tInteraction context\tPubmed references", + "uniprotkb:P08123\t-\t-\tuniprotkb:P02452\t-\t-\tphysical association\treactome:R-HSA-2428940\t24243840", + "ChEBI:29035\t-\t-\tuniprotkb:P02452\t-\t-\tphysical association\treactome:R-HSA-123\t111", + ] + ), + encoding="utf-8", + ) + version_path.write_text("version\tversion_date\n96\t2026-03-24\n", encoding="utf-8") + + adapter = ReactomePPIAdapter(file_path=str(data_path), version_file_path=str(version_path)) + edges = [edge for batch in adapter.get_all() for edge in batch] + + assert len(edges) == 1 + assert edges[0].start_node.id == "UniProtKB:P02452" + assert edges[0].end_node.id == "UniProtKB:P08123" + assert edges[0].interaction_type == ["physical association"] + assert edges[0].contexts == ["reactome:R-HSA-2428940"] + assert edges[0].pmids == [24243840] + assert edges[0].sources == [] + + +def test_reactome_ppi_adapter_skips_self_pairs_and_honors_max_rows(tmp_path): + data_path = tmp_path / "reactome.homo_sapiens.interactions.tab-delimited.txt" + + data_path.write_text( + "\n".join( + [ + "# Interactor 1 uniprot id\tInteractor 1 Ensembl gene id\tInteractor 1 Entrez Gene id\tInteractor 2 uniprot id\tInteractor 2 Ensembl gene id\tInteractor 2 Entrez Gene id\tInteraction type\tInteraction context\tPubmed references", + "uniprotkb:P08123\t-\t-\tuniprotkb:P08123\t-\t-\tphysical association\treactome:R-HSA-1\t1", + "uniprotkb:P08123\t-\t-\tuniprotkb:P02452\t-\t-\tphysical association\treactome:R-HSA-2\t2|3", + "uniprotkb:P01160\t-\t-\tuniprotkb:P06727\t-\t-\tenzymatic reaction\treactome:R-HSA-3\t4", + ] + ), + encoding="utf-8", + ) + + adapter = ReactomePPIAdapter(file_path=str(data_path), max_rows=1) + edges = [edge for batch in adapter.get_all() for edge in batch] + + assert len(edges) == 1 + assert edges[0].pmids == [2, 3] + assert edges[0].contexts == ["reactome:R-HSA-2"] + + +def test_reactome_ppi_adapter_dedupes_pair_plus_type_and_accumulates_contexts_and_pmids(tmp_path): + data_path = tmp_path / "reactome.homo_sapiens.interactions.tab-delimited.txt" + + data_path.write_text( + "\n".join( + [ + "# Interactor 1 uniprot id\tInteractor 1 Ensembl gene id\tInteractor 1 Entrez Gene id\tInteractor 2 uniprot id\tInteractor 2 Ensembl gene id\tInteractor 2 Entrez Gene id\tInteraction type\tInteraction context\tPubmed references", + "uniprotkb:P08123\t-\t-\tuniprotkb:P02452\t-\t-\tphysical association\treactome:R-HSA-1\t1|2", + "uniprotkb:P02452\t-\t-\tuniprotkb:P08123\t-\t-\tphysical association\treactome:R-HSA-2\t2|3", + ] + ), + encoding="utf-8", + ) + + adapter = ReactomePPIAdapter(file_path=str(data_path)) + edges = [edge for batch in adapter.get_all() for edge in batch] + + assert len(edges) == 1 + assert edges[0].contexts == ["reactome:R-HSA-1", "reactome:R-HSA-2"] + assert edges[0].pmids == [1, 2, 3] diff --git a/tests/test_string_ppi.py b/tests/test_string_ppi.py index 86c227a..5d71c89 100644 --- a/tests/test_string_ppi.py +++ b/tests/test_string_ppi.py @@ -35,7 +35,7 @@ def test_string_ppi_adapter_applies_default_cutoff_and_skips_self_pairs(tmp_path ("ENSEMBL:ENSP0002", "ENSEMBL:ENSP0003", [400]), ("ENSEMBL:ENSP0003", "ENSEMBL:ENSP0004", [700]), ] - assert all(edge.sources == ["STRING"] for edge in edges) + assert all(edge.sources == [] for edge in edges) version = adapter.get_version() assert version.version == "12.0" diff --git a/tests/test_tcrd_output_converter.py b/tests/test_tcrd_output_converter.py index d25eeac..c9e75fb 100644 --- a/tests/test_tcrd_output_converter.py +++ b/tests/test_tcrd_output_converter.py @@ -179,6 +179,28 @@ def test_ppi_converter_joins_multiple_source_labels(): assert all(row.score == 800 for row in rows) +def test_ppi_converter_maps_reactome_pmids_and_interaction_type_downstream(): + converter = TCRDOutputConverter() + converter.id_mapping["protein"] = { + "IFX123": 123, + "IFX456": 456, + } + + rows = converter.ppi_converter({ + "start_id": "IFX123", + "end_id": "IFX456", + "sources": ["Reactome\t96\t2026-03-24\t2026-04-24"], + "pmids": [24243840, 11163199], + "contexts": ["reactome:R-HSA-2428940"], + "interaction_type": ["physical association"], + "provenance": "Reactome\t96\t2026-03-24\t2026-04-24", + }) + + assert len(rows) == 2 + assert all(row.evidence == "24243840|11163199" for row in rows) + assert all(row.interaction_type == "physical association" for row in rows) + + def test_gtex_converter_branches_gtex_details_from_shared_expression_edge(): converter = TCRDOutputConverter() converter.id_mapping["protein"] = {"IFX123": 123} diff --git a/workflows/pharos.Snakefile b/workflows/pharos.Snakefile index 386f304..4e31fd5 100644 --- a/workflows/pharos.Snakefile +++ b/workflows/pharos.Snakefile @@ -24,7 +24,11 @@ rule all: "../input_files/auto/reactome/ReactomePathways.gmt.zip", "../input_files/auto/reactome/ReactomePathwaysRelation.txt", "../input_files/auto/reactome/UniProt2Reactome_All_Levels.txt", + "../input_files/auto/reactome/reactome.homo_sapiens.interactions.tab-delimited.txt", "../input_files/auto/reactome/reactome_version.tsv", + "../input_files/auto/bioplex/BioPlex_293T_Network_10K_Dec_2019.tsv", + "../input_files/auto/bioplex/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv", + "../input_files/auto/bioplex/bioplex_version.tsv", "../input_files/auto/string/9606.protein.links.v12.0.txt.gz", "../input_files/auto/string/string_version.tsv", "../input_files/auto/gtex/GTEx_Analysis_2025_08_22_v11_RNASeQCv2.4.3_gene_tpm.gct.gz", @@ -182,15 +186,37 @@ rule download_reactome: "../input_files/auto/reactome/ReactomePathways.gmt.zip", "../input_files/auto/reactome/ReactomePathwaysRelation.txt", "../input_files/auto/reactome/UniProt2Reactome_All_Levels.txt", + "../input_files/auto/reactome/reactome.homo_sapiens.interactions.tab-delimited.txt", "../input_files/auto/reactome/reactome_version.tsv" shell: """ curl -o {output[0]} https://reactome.org/download/current/ReactomePathways.gmt.zip curl -o {output[1]} https://reactome.org/download/current/ReactomePathwaysRelation.txt curl -o {output[2]} https://reactome.org/download/current/UniProt2Reactome_All_Levels.txt - last_modified=$(curl -fsI https://reactome.org/download/current/ReactomePathways.gmt.zip | awk -F': ' 'tolower($1)=="last-modified"{{print $2}}') + curl -o {output[3]} https://reactome.org/download/current/interactors/reactome.homo_sapiens.interactions.tab-delimited.txt + last_modified=$(curl -fsI https://reactome.org/download/current/interactors/reactome.homo_sapiens.interactions.tab-delimited.txt | awk -F': ' 'tolower($1)=="last-modified"{{print $2}}') version=$(curl -fs https://reactome.org/ContentService/data/database/version) - python3 -c "import email.utils,sys; lm=sys.argv[1]; v=sys.argv[2].strip(); out=sys.argv[3]; dt=email.utils.parsedate_to_datetime(lm).date().isoformat(); open(out,'w').write('version\\tversion_date\\n'+v+'\\t'+dt+'\\n')" "$last_modified" "$version" {output[3]} + python3 -c "import email.utils,sys; lm=sys.argv[1]; v=sys.argv[2].strip(); out=sys.argv[3]; dt=email.utils.parsedate_to_datetime(lm).date().isoformat(); open(out,'w').write('version\\tversion_date\\n'+v+'\\t'+dt+'\\n')" "$last_modified" "$version" {output[4]} + """ + +rule download_bioplex: + output: + "../input_files/auto/bioplex/BioPlex_293T_Network_10K_Dec_2019.tsv", + "../input_files/auto/bioplex/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv", + "../input_files/auto/bioplex/bioplex_version.tsv" + shell: + """ + mkdir -p ../input_files/auto/bioplex + p293t_url='https://bioplex.hms.harvard.edu/data/BioPlex_293T_Network_10K_Dec_2019.tsv' + hct116_url='https://bioplex.hms.harvard.edu/data/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv' + + curl -fL -o {output[0]} "$p293t_url" + curl -fL -o {output[1]} "$hct116_url" + + p293t_lm=$(curl -fsI "$p293t_url" | awk -F': ' 'tolower($1)=="last-modified"{{print $2}}') + hct116_lm=$(curl -fsI "$hct116_url" | awk -F': ' 'tolower($1)=="last-modified"{{print $2}}') + + python3 -c "import email.utils,sys; p293t_lm,hct116_lm,out=sys.argv[1:4]; p293t_dt=email.utils.parsedate_to_datetime(p293t_lm).date().isoformat(); hct116_dt=email.utils.parsedate_to_datetime(hct116_lm).date().isoformat(); open(out,'w').write('dataset\\tfile\\tversion\\tversion_date\\nBioPlex 3.0 293T\\tBioPlex_293T_Network_10K_Dec_2019.tsv\\t3.0\\t'+p293t_dt+'\\nBioPlex 3.0 HCT116\\tBioPlex_HCT116_Network_5.5K_Dec_2019.tsv\\t3.0\\t'+hct116_dt+'\\n')" "$p293t_lm" "$hct116_lm" {output[2]} """ rule download_string: