From 3c05f46b295e80fb2afd67dc3f73a03fe99e5d3e Mon Sep 17 00:00:00 2001
From: kelleherkj <keith.kelleher@ncats.nih.gov>
Date: Fri, 24 Apr 2026 12:24:35 -0400
Subject: [PATCH] Add BioPlex and Reactome PPI ingests for Pharos

---
 AGENTS.md                                     |   1 +
 designs/ppi/bioplex_ppi_ingest_design.md      |  99 ++++++++++++++
 designs/ppi/reactome_ppi_ingest_design.md     | 127 ++++++++++++++++++
 designs/ppi/string_ppi_ingest_design.md       |  13 +-
 src/constants.py                              |   1 +
 src/input_adapters/bioplex/bioplex_ppi.py     | 108 +++++++++++++++
 src/input_adapters/pharos_arango/tcrd/ppi.py  |   9 +-
 src/input_adapters/reactome/reactome_ppi.py   | 103 ++++++++++++++
 src/input_adapters/string/string_ppi.py       |   1 -
 src/interfaces/input_adapter.py               |   6 +-
 src/models/ppi.py                             |   9 +-
 src/output_adapters/sql_converters/tcrd.py    |  11 +-
 src/use_cases/pharos/pharos.yaml              |  18 +++
 src/use_cases/pharos/target_graph.yaml        |  18 +++
 src/use_cases/working.yaml                    |  21 +++
 tests/test_bioplex_ppi.py                     | 111 +++++++++++++++
 ...test_input_adapter_canonical_edge_remap.py |  28 ++++
 tests/test_reactome_ppi.py                    |  74 ++++++++++
 tests/test_string_ppi.py                      |   2 +-
 tests/test_tcrd_output_converter.py           |  22 +++
 workflows/pharos.Snakefile                    |  30 ++++-
 21 files changed, 787 insertions(+), 25 deletions(-)
 create mode 100644 designs/ppi/bioplex_ppi_ingest_design.md
 create mode 100644 designs/ppi/reactome_ppi_ingest_design.md
 create mode 100644 src/input_adapters/bioplex/bioplex_ppi.py
 create mode 100644 src/input_adapters/reactome/reactome_ppi.py
 create mode 100644 tests/test_bioplex_ppi.py
 create mode 100644 tests/test_reactome_ppi.py

diff --git a/AGENTS.md b/AGENTS.md
index 1656799..0fc2884 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -30,6 +30,7 @@
 ## Lessons Learned
 
 - Keep adapters focused on source parsing and structural graph emission; move cross-ontology ID normalization to resolvers.
+- Raw source adapters should not populate `sources` or `provenance`; let the framework stamp canonical datasource/version metadata during ETL.
 - For ontology xrefs, maintain an explicit allowlist and perform case-insensitive prefix checks.
 - When adding new datasource version handling, use named parameters for `DatasourceVersionInfo` to avoid argument-order regressions.
 - When an edge can be emitted by multiple sources and later merged, keep source-specific payload in a `details` list instead of top-level edge fields.
diff --git a/designs/ppi/bioplex_ppi_ingest_design.md b/designs/ppi/bioplex_ppi_ingest_design.md
new file mode 100644
index 0000000..65b7817
--- /dev/null
+++ b/designs/ppi/bioplex_ppi_ingest_design.md
@@ -0,0 +1,99 @@
+# BioPlex PPI Ingest Design
+
+## Status
+
+Implemented, validated in the working graph and working MySQL paths, and promoted to `pharos.yaml` / `target_graph.yaml`.
+
+## Goal
+
+Add a first-pass BioPlex protein-protein interaction ingest for Pharos.
+
+## Source Choice
+
+Use the official undirected BioPlex 3.0 interaction releases from the BioPlex download page:
+
+- `BioPlex_293T_Network_10K_Dec_2019.tsv`
+- `BioPlex_HCT116_Network_5.5K_Dec_2019.tsv`
+
+Rationale:
+
+- These are the current official-release network files exposed on the BioPlex site.
+- They match the current graph `PPIEdge` model better than the directed bait-prey files.
+- They avoid the noisier unfiltered candidate-interaction lists.
+
+## Source URLs
+
+- Landing page: `https://bioplex.hms.harvard.edu/interactions.php`
+- Data index: `https://bioplex.hms.harvard.edu/data/`
+- 293T release: `https://bioplex.hms.harvard.edu/data/BioPlex_293T_Network_10K_Dec_2019.tsv`
+- HCT116 release: `https://bioplex.hms.harvard.edu/data/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv`
+
+## Version Strategy
+
+- Use BioPlex release label `3.0` as the dataset version.
+- Capture per-file `Last-Modified` dates into `input_files/auto/bioplex/bioplex_version.tsv`.
+- Let adapter-side `download_date` come from file mtime unless we later decide to persist it explicitly.
+
+Observed current `/data` filenames include a December 2019 stamp even though the site still presents them as the current BioPlex 3.0 official releases.
+
+## Observed File Shape
+
+Current BioPlex 3.0 files have the same shape as the legacy TCRD BioPlex loader expected:
+
+- `GeneA`
+- `GeneB`
+- `UniprotA`
+- `UniprotB`
+- `SymbolA`
+- `SymbolB`
+- `pW`
+- `pNI`
+- `pInt`
+
+Observed counts:
+
+- `BioPlex_293T_Network_10K_Dec_2019.tsv`: `118,162` rows
+- `BioPlex_HCT116_Network_5.5K_Dec_2019.tsv`: `70,966` rows
+
+Observed payload details:
+
+- no self-pairs in either file
+- gene IDs are numeric Entrez Gene identifiers
+- isoform-suffixed UniProt accessions are common
+- `UniprotA` can be the literal string `UNKNOWN`
+  - `293T`: `4,950` rows
+  - `HCT116`: `1,688` rows
+- `UniprotB` did not contain `UNKNOWN` in the profiled files
+- `pInt` values range from about `0.75` to `1.0`
+
+## Implemented Mapping
+
+Current graph mapping:
+
+- emit `PPIEdge`
+- use UniProt accessions as the primary emitted identifier family for endpoint proteins
+- fall back to the `NCBIGene` ID when either UniProt column is empty or the literal `UNKNOWN` (observed only in `UniprotA`)
+- preserve BioPlex confidence-style fields into:
+  - `p_wrong`
+  - `p_ni`
+  - `p_int`
+
+Implementation choice:
+
+- configure one adapter instance per file so provenance distinguishes `293T` versus `HCT116` via the version string
+- do not populate adapter-level `sources`; the ETL framework stamps canonical datasource/version metadata
+
+## Validation Summary
+
+Validated outcomes:
+
+- merged graph edges can carry multiple `p_int` values when the same canonical pair is supported by both BioPlex cell lines
+- `UNKNOWN` UniProt rows resolve through `NCBIGene:*` fallback when the reviewed target graph contains the mapped protein
+- downstream `ncats_ppi` exports BioPlex rows with scalar `p_int`, `p_ni`, and `p_wrong` using `max(...)` collapse for merged graph lists
+- promoted into:
+  - `src/use_cases/pharos/pharos.yaml`
+  - `src/use_cases/pharos/target_graph.yaml`
+
+Open follow-up questions:
+
+- whether cell-line provenance should eventually be carried in a dedicated edge field instead of only in provenance / sources
diff --git a/designs/ppi/reactome_ppi_ingest_design.md b/designs/ppi/reactome_ppi_ingest_design.md
new file mode 100644
index 0000000..5653863
--- /dev/null
+++ b/designs/ppi/reactome_ppi_ingest_design.md
@@ -0,0 +1,127 @@
+# Reactome PPI Ingest Design
+
+## Status
+
+Implemented, validated in the working graph and working MySQL paths, and promoted to `pharos.yaml` / `target_graph.yaml`.
+
+## Goal
+
+Add a first-pass Reactome-derived protein-protein interaction ingest for Pharos.
+
+## Source Choice
+
+Use the official human Reactome tab-delimited interaction file:
+
+- `reactome.homo_sapiens.interactions.tab-delimited.txt`
+
+Rationale:
+
+- This is the current official human interaction export on the Reactome download site.
+- It matches the old TCRD loader input format.
+- It includes interaction type and context/PMID fields that can map naturally into the current PPI model and TCRD export.
+
+## Source URLs
+
+- Download docs: `https://reactome.org/download-data?id=62&ml=1`
+- Directory index: `https://reactome.org/download/current/interactors/`
+- Human tab-delimited file: `https://reactome.org/download/current/interactors/reactome.homo_sapiens.interactions.tab-delimited.txt`
+
+## Version Strategy
+
+- Use the Reactome database version recorded in `input_files/auto/reactome/reactome_version.tsv`
+- Use the PPI file `Last-Modified` header as `version_date`
+- Let adapter-side `download_date` come from file mtime unless we later decide to persist it explicitly
+
+## Documented File Shape
+
+Reactome documents the tab-delimited human interaction file as:
+
+1. interactor 1 protein ID
+2. interactor 1 Ensembl gene ID(s)
+3. interactor 1 Entrez Gene ID(s)
+4. interactor 2 protein ID
+5. interactor 2 Ensembl gene ID(s)
+6. interactor 2 Entrez Gene ID(s)
+7. interaction type
+8. interaction context
+9. PubMed IDs
+
+## Legacy Comparison
+
+The old UNM TCRD loader used the same human tab-delimited Reactome interaction file and:
+
+- required both interactors to have UniProt IDs
+- populated `interaction_type`
+- skipped duplicate interaction rows
+- skipped self-pairs
+
+This should be treated as a comparison point only; current behavior should still be validated against the real file after download.
+
+## Observed File Profile
+
+Observed counts from the downloaded file:
+
+- total rows: `123,895`
+- rows where both interactors are UniProt proteins: `83,545`
+- filtered non-protein rows: `40,350`
+- protein self-pairs: `7,677`
+- duplicate unordered protein-pair-plus-type rows: `57,386`
+- distinct unordered protein-pair-plus-type combinations: `26,159`
+
+Observed payload behavior:
+
+- non-protein rows include identifiers such as `ChEBI:*`
+- almost every protein-protein row has PubMed references
+- every row has a Reactome context string like `reactome:R-HSA-...`
+- current interaction types include values such as:
+  - `physical association`
+  - `enzymatic reaction`
+  - `cleavage reaction`
+  - `dephosphorylation reaction`
+
+## Implemented Mapping
+
+Current first-pass graph mapping:
+
+- emit `PPIEdge`
+- keep only rows where both interactors are UniProt IDs
+- skip self-pairs
+- canonicalize unordered protein pairs
+- dedupe repeated source rows by unordered pair plus interaction type
+- preserve:
+  - `interaction_type` as a graph list field
+  - `contexts` as a graph list field
+  - `pmids` as a graph list field
+- do not populate adapter-level `sources`; the ETL framework stamps canonical datasource/version metadata
+
+## Legacy Downstream Comparison
+
+Direct inspection of `pharos319.ncats_ppi` showed:
+
+- `StringDB` populated only `score`
+- `BioPlex` populated only `p_int`, `p_ni`, and `p_wrong`
+- `Reactome` rows left `evidence`, `interaction_type`, `score`, `p_int`, `p_ni`, and `p_wrong` empty
+
+## Current Downstream Mapping
+
+Current IFX_ODIN downstream decision:
+
+- keep Reactome `pmids`, `contexts`, and `interaction_type` in the graph
+- map `pmids` to `ncats_ppi.evidence` as pipe-delimited PMIDs
+- map `interaction_type` to `ncats_ppi.interaction_type`
+- keep `contexts` graph-only for now
+
+## Validation Summary
+
+Validated outcomes:
+
+- Reactome-backed graph edges landed with non-empty `pmids`, `contexts`, and `interaction_type`
+- Reactome merged cleanly with both BioPlex and STRING on shared canonical pairs
+- downstream `ncats_ppi` rows now carry Reactome PMIDs in `evidence` and Reactome interaction types in `interaction_type`
+- promoted into:
+  - `src/use_cases/pharos/pharos.yaml`
+  - `src/use_cases/pharos/target_graph.yaml`
+
+## Open Follow-Ups
+
+- decide whether Reactome context should eventually have its own dedicated downstream column or lookup table
diff --git a/designs/ppi/string_ppi_ingest_design.md b/designs/ppi/string_ppi_ingest_design.md
index 097639e..ea89f3f 100644
--- a/designs/ppi/string_ppi_ingest_design.md
+++ b/designs/ppi/string_ppi_ingest_design.md
@@ -4,8 +4,7 @@
 
 Implemented and validated in the working graph and working MySQL paths.
 
-This first pass covers **STRING human protein-protein interactions** only.
-BioPlex and Reactome PPI remain follow-up sources.
+STRING, BioPlex, and Reactome PPI are now all implemented for Pharos.
 
 ## Scope
 
@@ -15,9 +14,7 @@ Implemented source:
 
 Explicitly deferred:
 
-- BioPlex PPI
-- Reactome PPI
-- source-specific `interaction_type` / `evidence` population
+- richer STRING channel-specific fields from `.protein.links.full...`
 
 ## Files Added / Changed
 
@@ -102,7 +99,6 @@ Confirmed old IFX_ODIN / Pharos readback behavior:
 - `PPIEdge`
   - `start_node`: `Protein(id="ENSEMBL:ENSP...")`
   - `end_node`: `Protein(id="ENSEMBL:ENSP...")`
-  - `sources`: STRING provenance list
   - `score`: list-valued, emitted as `[combined_score]`
 
 Implementation choices:
@@ -110,6 +106,7 @@ Implementation choices:
 - `score_cutoff` is an adapter parameter with default `400`
 - rows below the cutoff are discarded before they enter the graph
 - self-pairs are discarded before they enter the graph
+- adapter does not populate `sources`; the ETL framework stamps canonical datasource/version metadata
 - `max_rows` is supported for bounded validation runs and counts **kept emitted
   edges**, not scanned raw lines
 
@@ -237,10 +234,6 @@ pairs before export.
 
 - Profile whether STRING `.protein.links.full...` is worth revisiting for richer
   channel-specific fields
-- Add Reactome PPI ingest
-  - populate `interaction_type`
-  - decide whether the Reactome evidence/context column should map to `evidence`
-- Add BioPlex PPI ingest
 - Decide whether downstream `ncats_ppi` export should collapse duplicate canonical
   pairs before reciprocal row generation, or continue to preserve one SQL row pair
   per graph edge
diff --git a/src/constants.py b/src/constants.py
index 746005f..63f9177 100755
--- a/src/constants.py
+++ b/src/constants.py
@@ -18,6 +18,7 @@ class DataSourceName(SimpleEnum):
     Cellosaurus = "Cellosaurus"
     Reactome = "Reactome"
     STRING = "STRING"
+    BioPlex = "BioPlex"
     WikiPathways = "WikiPathways"
     PathwayCommons = "PathwayCommons"
     CLO = "Cell Line Ontology (CLO)"
diff --git a/src/input_adapters/bioplex/bioplex_ppi.py b/src/input_adapters/bioplex/bioplex_ppi.py
new file mode 100644
index 0000000..58002b6
--- /dev/null
+++ b/src/input_adapters/bioplex/bioplex_ppi.py
@@ -0,0 +1,108 @@
+import csv
+from datetime import date
+from pathlib import Path
+from typing import Generator, List, Optional
+
+from src.constants import DataSourceName, Prefix
+from src.input_adapters.flat_file_adapter import FlatFileAdapter
+from src.models.datasource_version_info import DatasourceVersionInfo
+from src.models.node import EquivalentId
+from src.models.ppi import PPIEdge
+from src.models.protein import Protein
+
+
+class BioPlexPPIAdapter(FlatFileAdapter):
+    """Emit batches of ``PPIEdge`` objects parsed from one BioPlex network TSV file."""
+    version_info: DatasourceVersionInfo
+
+    def __init__(
+        self,
+        file_path: str,
+        version_file_path: Optional[str] = None,
+        max_rows: Optional[int] = None,
+    ):
+        FlatFileAdapter.__init__(self, file_path=file_path)
+        self.max_rows = max_rows  # cap on emitted edges; None means no cap
+        self.version_info = self._load_version_info(version_file_path)
+
+    def _load_version_info(self, version_file_path: Optional[str]) -> DatasourceVersionInfo:
+        """Build version metadata from the version-TSV row whose ``file`` names this input."""
+        version = None
+        version_date = None
+        download_date = self.download_date
+        if version_file_path:
+            file_name = Path(self.file_path).name  # loop-invariant, so computed once
+            with open(version_file_path, "r", encoding="utf-8", newline="") as handle:
+                reader = csv.DictReader(handle, delimiter="\t")
+                matching_row = next((row for row in reader if row.get("file") == file_name), None)
+                if matching_row:
+                    version_label = matching_row.get("version") or None
+                    dataset_label = matching_row.get("dataset") or None
+                    if version_label and dataset_label:
+                        cell_line = dataset_label.replace("BioPlex", "", 1).strip().replace(version_label, "", 1).strip()
+                        version = f"{version_label} ({cell_line})" if cell_line else version_label  # e.g. "3.0 (293T)"
+                    else:
+                        version = version_label or dataset_label or None
+                    version_date_str = matching_row.get("version_date") or None
+                    download_date_str = matching_row.get("download_date") or None
+                    version_date = date.fromisoformat(version_date_str) if version_date_str else None
+                    download_date = date.fromisoformat(download_date_str) if download_date_str else download_date
+        return DatasourceVersionInfo(
+            version=version,
+            version_date=version_date,
+            download_date=download_date,
+        )
+
+    def get_datasource_name(self) -> DataSourceName:
+        return DataSourceName.BioPlex
+
+    def get_version(self) -> DatasourceVersionInfo:
+        return self.version_info
+
+    @staticmethod
+    def _normalize_value(value: str) -> str:
+        """Strip surrounding whitespace and double quotes from a raw cell."""
+        return value.strip().strip('"')
+
+    @classmethod
+    def _protein_id(cls, uniprot_id: str, gene_id: str) -> str:
+        """Return a UniProtKB ID string, or an NCBIGene one when the accession is empty or ``UNKNOWN``."""
+        if uniprot_id and uniprot_id != "UNKNOWN":
+            return EquivalentId(id=uniprot_id, type=Prefix.UniProtKB).id_str()
+        return EquivalentId(id=gene_id, type=Prefix.NCBIGene).id_str()
+
+    def get_all(self) -> Generator[List[PPIEdge], None, None]:
+        """Yield edge batches; endpoints are sorted and self-pairs dropped. The last batch may be empty."""
+        batch: List[PPIEdge] = []
+        kept_rows = 0
+        with open(self.file_path, "r", encoding="utf-8", newline="") as handle:
+            reader = csv.DictReader(handle, delimiter="\t")
+            for row in reader:
+                if self.max_rows is not None and kept_rows >= self.max_rows:
+                    break
+
+                gene_a = self._normalize_value(row["GeneA"])
+                gene_b = self._normalize_value(row["GeneB"])
+                uniprot_a = self._normalize_value(row["UniprotA"])
+                uniprot_b = self._normalize_value(row["UniprotB"])
+
+                protein_a = self._protein_id(uniprot_a, gene_a)
+                protein_b = self._protein_id(uniprot_b, gene_b)
+                # Comparing resolved IDs also covers rows whose raw gene and UniProt columns both match.
+                if protein_a == protein_b:
+                    continue
+                protein_a, protein_b = sorted((protein_a, protein_b))
+
+                edge = PPIEdge(
+                    start_node=Protein(id=protein_a),
+                    end_node=Protein(id=protein_b),
+                    p_wrong=[float(self._normalize_value(row["pW"]))],
+                    p_ni=[float(self._normalize_value(row["pNI"]))],
+                    p_int=[float(self._normalize_value(row["pInt"]))],
+                )
+                batch.append(edge)
+                kept_rows += 1
+                if len(batch) >= self.batch_size:
+                    yield batch
+                    batch = []
+        yield batch
diff --git a/src/input_adapters/pharos_arango/tcrd/ppi.py b/src/input_adapters/pharos_arango/tcrd/ppi.py
index 5a86526..5d28597 100644
--- a/src/input_adapters/pharos_arango/tcrd/ppi.py
+++ b/src/input_adapters/pharos_arango/tcrd/ppi.py
@@ -37,9 +37,12 @@ def get_all(self) -> Generator[List[PPIEdge], None, None]:
                     end_node=Protein(id=row["end_id"]),
                     provenance=row.get("provenance"),
                     sources=row.get("sources") or [],
-                    p_int=row.get("p_int"),
-                    p_ni=row.get("p_ni"),
-                    p_wrong=row.get("p_wrong"),
+                    p_int=row.get("p_int") or [],
+                    p_ni=row.get("p_ni") or [],
+                    p_wrong=row.get("p_wrong") or [],
+                    pmids=row.get("pmids") or [],
+                    contexts=row.get("contexts") or [],
+                    interaction_type=row.get("interaction_type") or [],
                     score=row.get("score") or [],
                 )
                 for row in rows
diff --git a/src/input_adapters/reactome/reactome_ppi.py b/src/input_adapters/reactome/reactome_ppi.py
new file mode 100644
index 0000000..a1a3c2f
--- /dev/null
+++ b/src/input_adapters/reactome/reactome_ppi.py
@@ -0,0 +1,103 @@
+import csv
+from datetime import date
+from collections import OrderedDict
+from typing import Generator, List, Optional
+
+from src.constants import DataSourceName, Prefix
+from src.input_adapters.flat_file_adapter import FlatFileAdapter
+from src.models.datasource_version_info import DatasourceVersionInfo
+from src.models.node import EquivalentId
+from src.models.ppi import PPIEdge
+from src.models.protein import Protein
+
+
+class ReactomePPIAdapter(FlatFileAdapter):
+    """Emit de-duplicated ``PPIEdge`` batches from the Reactome tab-delimited interaction file."""
+    version_info: DatasourceVersionInfo
+
+    def __init__(self, file_path: str, version_file_path: Optional[str] = None, max_rows: Optional[int] = None):
+        FlatFileAdapter.__init__(self, file_path=file_path)
+        self.max_rows = max_rows  # cap on distinct emitted edges; None means no cap
+        self.version_info = self._load_version_info(version_file_path)
+
+    def _load_version_info(self, version_file_path: Optional[str]) -> DatasourceVersionInfo:
+        """Read version metadata from the first data row of the optional version TSV."""
+        version = None
+        version_date = None
+        download_date = self.download_date
+        if version_file_path:
+            with open(version_file_path, "r", encoding="utf-8", newline="") as handle:
+                reader = csv.DictReader(handle, delimiter="\t")
+                first_row = next(reader, None)
+                if first_row:
+                    version = first_row.get("version") or None
+                    version_date_str = first_row.get("version_date") or None
+                    download_date_str = first_row.get("download_date") or None
+                    version_date = date.fromisoformat(version_date_str) if version_date_str else None
+                    download_date = date.fromisoformat(download_date_str) if download_date_str else download_date
+        return DatasourceVersionInfo(version=version, version_date=version_date, download_date=download_date)
+
+    def get_datasource_name(self) -> DataSourceName:
+        return DataSourceName.Reactome
+
+    def get_version(self) -> DatasourceVersionInfo:
+        return self.version_info
+
+    @staticmethod
+    def _protein_id(raw_value: str) -> Optional[str]:
+        """Return the UniProtKB ID string for a ``uniprotkb:``-prefixed value, else ``None``."""
+        value = raw_value.strip()
+        if not value.startswith("uniprotkb:"):
+            return None
+        return EquivalentId(id=value.split(":", 1)[1], type=Prefix.UniProtKB).id_str()
+
+    @staticmethod
+    def _parse_pmids(raw_value: str) -> List[int]:
+        """Parse the all-digit tokens of a ``|``/``,``/``;``-delimited string, de-duplicated in order."""
+        if not raw_value:
+            return []
+        tokens = (token.strip() for token in raw_value.replace(",", "|").replace(";", "|").split("|"))
+        return list(dict.fromkeys(int(token) for token in tokens if token.isdigit()))
+
+    def get_all(self) -> Generator[List[PPIEdge], None, None]:
+        """Yield one edge per (sorted protein pair, interaction type), merging contexts and PMIDs."""
+        edges_by_key = OrderedDict()
+        kept_rows = 0
+        with open(self.file_path, "r", encoding="utf-8", newline="") as handle:
+            reader = csv.DictReader(handle, delimiter="\t")
+            for row in reader:
+                protein1 = self._protein_id(row["# Interactor 1 uniprot id"])
+                protein2 = self._protein_id(row["Interactor 2 uniprot id"])
+                if protein1 is None or protein2 is None or protein1 == protein2:
+                    continue
+                protein1, protein2 = sorted((protein1, protein2))
+
+                # DictReader fills the missing trailing columns of a short row with None.
+                interaction_type = (row["Interaction type"] or "").strip()
+                key = (protein1, protein2, interaction_type)
+                if key not in edges_by_key:
+                    if self.max_rows is not None and kept_rows >= self.max_rows:
+                        continue  # cap reached: new keys are ignored, known keys still accumulate
+                    edges_by_key[key] = PPIEdge(
+                        start_node=Protein(id=protein1),
+                        end_node=Protein(id=protein2),
+                        interaction_type=[interaction_type] if interaction_type else [],
+                        contexts=[],
+                        pmids=[],
+                    )
+                    kept_rows += 1
+                edge = edges_by_key[key]
+                context = (row["Interaction context"] or "").strip()
+                if context and context not in edge.contexts:
+                    edge.contexts.append(context)
+                for pmid in self._parse_pmids((row["Pubmed references"] or "").strip()):
+                    if pmid not in edge.pmids:
+                        edge.pmids.append(pmid)
+
+        batch: List[PPIEdge] = []
+        for edge in edges_by_key.values():
+            batch.append(edge)
+            if len(batch) >= self.batch_size:
+                yield batch
+                batch = []
+        yield batch
diff --git a/src/input_adapters/string/string_ppi.py b/src/input_adapters/string/string_ppi.py
index 9256f4c..bd2eb5a 100644
--- a/src/input_adapters/string/string_ppi.py
+++ b/src/input_adapters/string/string_ppi.py
@@ -89,7 +89,6 @@ def get_all(self) -> Generator[List[PPIEdge], None, None]:
                 edge = PPIEdge(
                     start_node=Protein(id=f"{Prefix.ENSEMBL}:{protein1}"),
                     end_node=Protein(id=f"{Prefix.ENSEMBL}:{protein2}"),
-                    sources=[self.get_datasource_name().value],
                     score=[score],
                 )
                 batch.append(edge)
diff --git a/src/interfaces/input_adapter.py b/src/interfaces/input_adapter.py
index 3af0858..a2af52d 100644
--- a/src/interfaces/input_adapter.py
+++ b/src/interfaces/input_adapter.py
@@ -16,7 +16,6 @@ class InputAdapter(ABC):
     def _canonicalize_relationship_class(rel: Relationship, start_node: Node, end_node: Node) -> Relationship:
         from src.models.disease import GeneDiseaseEdge, ProteinDiseaseEdge
         from src.models.expression import GeneTissueExpressionEdge, ProteinTissueExpressionEdge
-        from src.models.gene import Gene
         from src.models.pathway import GenePathwayEdge, ProteinPathwayEdge
         from src.models.protein import Protein
         from src.models.tissue import Tissue
@@ -85,8 +84,9 @@ def get_and_delete_old_id(node):
                 version_info = self.get_version()
                 version_data = [self.get_datasource_name(), version_info.version, version_info.version_date, version_info.download_date]
                 version_string = '\t'.join([str(e) for e in version_data])
-                entry.provenance = version_string
-                if self.get_datasource_name() != DataSourceName.PostProcessing:
+                if not getattr(entry, 'provenance', None):
+                    entry.provenance = version_string
+                if self.get_datasource_name() != DataSourceName.PostProcessing and not getattr(entry, 'sources', None):
                     entry.sources = [version_string]
 
             nodes = [e for e in entries if isinstance(e, Node)]
diff --git a/src/models/ppi.py b/src/models/ppi.py
index 4ec3d32..e1f8683 100644
--- a/src/models/ppi.py
+++ b/src/models/ppi.py
@@ -10,7 +10,10 @@ class PPIEdge(Relationship):
     start_node: Protein
     end_node: Protein
     sources: List[str] = field(default_factory=list)
-    p_int: float = None
-    p_ni: float = None
-    p_wrong: float = None
+    p_int: List[float] = field(default_factory=list)
+    p_ni: List[float] = field(default_factory=list)
+    p_wrong: List[float] = field(default_factory=list)
+    pmids: List[int] = field(default_factory=list)
+    contexts: List[str] = field(default_factory=list)
+    interaction_type: List[str] = field(default_factory=list)
     score: List[float] = field(default_factory=list)
diff --git a/src/output_adapters/sql_converters/tcrd.py b/src/output_adapters/sql_converters/tcrd.py
index cc1d692..efda18d 100755
--- a/src/output_adapters/sql_converters/tcrd.py
+++ b/src/output_adapters/sql_converters/tcrd.py
@@ -637,6 +637,13 @@ def _max_or_value(value):
             return max(value) if value else None
         return value
 
+    @staticmethod
+    def _join_list(value):
+        """Pipe-join the distinct non-empty string forms of a list; return non-lists unchanged."""
+        if not isinstance(value, list):
+            return value
+        return "|".join(dict.fromkeys(s for s in map(str, (v for v in value if v is not None)) if s)) or None
+
     def ppi_converter(self, obj: dict) -> List[mysqlPPI]:
         protein_id = self.resolve_id('protein', obj['start_id'])
         other_id = self.resolve_id('protein', obj['end_id'])
@@ -652,8 +659,8 @@ def ppi_converter(self, obj: dict) -> List[mysqlPPI]:
             p_int=self._max_or_value(obj.get('p_int')),
             p_ni=self._max_or_value(obj.get('p_ni')),
             p_wrong=self._max_or_value(obj.get('p_wrong')),
-            evidence=obj.get('evidence'),
-            interaction_type=obj.get('interaction_type'),
+            evidence=self._join_list(obj.get('pmids')) or obj.get('evidence'),
+            interaction_type=self._join_list(obj.get('interaction_type')),
             score=self._max_or_value(obj.get('score')),
         )
 
diff --git a/src/use_cases/pharos/pharos.yaml b/src/use_cases/pharos/pharos.yaml
index 1b1f768..ac017f4 100644
--- a/src/use_cases/pharos/pharos.yaml
+++ b/src/use_cases/pharos/pharos.yaml
@@ -145,6 +145,24 @@ input_adapters:
       version_file_path: ./input_files/auto/string/string_version.tsv
       score_cutoff: 400
 
+  - import: ./src/input_adapters/bioplex/bioplex_ppi.py
+    class: BioPlexPPIAdapter
+    kwargs:
+      file_path: ./input_files/auto/bioplex/BioPlex_293T_Network_10K_Dec_2019.tsv
+      version_file_path: ./input_files/auto/bioplex/bioplex_version.tsv
+
+  - import: ./src/input_adapters/bioplex/bioplex_ppi.py
+    class: BioPlexPPIAdapter
+    kwargs:
+      file_path: ./input_files/auto/bioplex/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv
+      version_file_path: ./input_files/auto/bioplex/bioplex_version.tsv
+
+  - import: ./src/input_adapters/reactome/reactome_ppi.py
+    class: ReactomePPIAdapter
+    kwargs:
+      file_path: ./input_files/auto/reactome/reactome.homo_sapiens.interactions.tab-delimited.txt
+      version_file_path: ./input_files/auto/reactome/reactome_version.tsv
+
   - import: ./src/input_adapters/target_graph/generif_node.py
     class: GeneRifNodeAdapter
     kwargs:
diff --git a/src/use_cases/pharos/target_graph.yaml b/src/use_cases/pharos/target_graph.yaml
index 872caff..8983319 100644
--- a/src/use_cases/pharos/target_graph.yaml
+++ b/src/use_cases/pharos/target_graph.yaml
@@ -157,6 +157,24 @@ input_adapters:
       version_file_path: ./input_files/auto/string/string_version.tsv
       score_cutoff: 400
 
+  - import: ./src/input_adapters/bioplex/bioplex_ppi.py
+    class: BioPlexPPIAdapter
+    kwargs:
+      file_path: ./input_files/auto/bioplex/BioPlex_293T_Network_10K_Dec_2019.tsv
+      version_file_path: ./input_files/auto/bioplex/bioplex_version.tsv
+
+  - import: ./src/input_adapters/bioplex/bioplex_ppi.py
+    class: BioPlexPPIAdapter
+    kwargs:
+      file_path: ./input_files/auto/bioplex/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv
+      version_file_path: ./input_files/auto/bioplex/bioplex_version.tsv
+
+  - import: ./src/input_adapters/reactome/reactome_ppi.py
+    class: ReactomePPIAdapter
+    kwargs:
+      file_path: ./input_files/auto/reactome/reactome.homo_sapiens.interactions.tab-delimited.txt
+      version_file_path: ./input_files/auto/reactome/reactome_version.tsv
+
   - import: ./src/input_adapters/target_graph/protein_nodes_and_edges.py
     class: IsoformProteinEdgeAdapter
     kwargs:
diff --git a/src/use_cases/working.yaml b/src/use_cases/working.yaml
index 1d38496..8043215 100644
--- a/src/use_cases/working.yaml
+++ b/src/use_cases/working.yaml
@@ -42,6 +42,27 @@ input_adapters:
       score_cutoff: 400
       max_rows: 10000
 
+  - import: ./src/input_adapters/bioplex/bioplex_ppi.py
+    class: BioPlexPPIAdapter
+    kwargs:
+      file_path: ./input_files/auto/bioplex/BioPlex_293T_Network_10K_Dec_2019.tsv
+      version_file_path: ./input_files/auto/bioplex/bioplex_version.tsv
+      max_rows: 10000
+
+  - import: ./src/input_adapters/bioplex/bioplex_ppi.py
+    class: BioPlexPPIAdapter
+    kwargs:
+      file_path: ./input_files/auto/bioplex/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv
+      version_file_path: ./input_files/auto/bioplex/bioplex_version.tsv
+      max_rows: 10000
+
+  - import: ./src/input_adapters/reactome/reactome_ppi.py
+    class: ReactomePPIAdapter
+    kwargs:
+      file_path: ./input_files/auto/reactome/reactome.homo_sapiens.interactions.tab-delimited.txt
+      version_file_path: ./input_files/auto/reactome/reactome_version.tsv
+      max_rows: 10000
+
 output_adapters:
   - import: ./src/output_adapters/arango_output_adapter.py
     class: ArangoOutputAdapter
diff --git a/tests/test_bioplex_ppi.py b/tests/test_bioplex_ppi.py
new file mode 100644
index 0000000..6602ad3
--- /dev/null
+++ b/tests/test_bioplex_ppi.py
@@ -0,0 +1,111 @@
+from src.input_adapters.bioplex.bioplex_ppi import BioPlexPPIAdapter
+
+
+def test_bioplex_adapter_preserves_probability_fields_and_canonicalizes_direction(tmp_path):  # one quoted data row in, one edge expected out
+    data_path = tmp_path / "BioPlex_293T_Network_10K_Dec_2019.tsv"
+    version_path = tmp_path / "bioplex_version.tsv"
+
+    data_path.write_text(
+        "\n".join(
+            [
+                '"GeneA"\t"GeneB"\t"UniprotA"\t"UniprotB"\t"SymbolA"\t"SymbolB"\t"pW"\t"pNI"\t"pInt"',
+                '"2"\t"1"\t"Q8N7W2-2"\t"P00813"\t"BEND7"\t"ADA"\t"0.01"\t"0.02"\t"0.97"',  # UniprotA=Q8N7W2-2, UniprotB=P00813
+            ]
+        ),
+        encoding="utf-8",
+    )
+    version_path.write_text(
+        "\n".join(
+            [
+                "dataset\tfile\tversion\tversion_date",
+                "BioPlex 3.0 293T\tBioPlex_293T_Network_10K_Dec_2019.tsv\t3.0\t2024-01-19",
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+    adapter = BioPlexPPIAdapter(
+        file_path=str(data_path),
+        version_file_path=str(version_path),
+    )
+
+    batches = list(adapter.get_all())
+    edges = [edge for batch in batches for edge in batch]  # flatten the yielded batches
+
+    assert len(edges) == 1
+    assert edges[0].start_node.id == "UniProtKB:P00813"  # the row lists Q8N7W2-2 first; the expected edge starts at P00813
+    assert edges[0].end_node.id == "UniProtKB:Q8N7W2-2"
+    assert edges[0].p_wrong == [0.01]  # pW column, expected as a one-element float list
+    assert edges[0].p_ni == [0.02]  # pNI column
+    assert edges[0].p_int == [0.97]  # pInt column
+    assert edges[0].sources == []  # no source labels expected on edges straight from get_all()
+
+    version = adapter.get_version()
+    assert version.version == "3.0 (293T)"  # version "3.0" from the version file, with the 293T label in parentheses
+    assert version.version_date.isoformat() == "2024-01-19"
+
+
+def test_bioplex_adapter_falls_back_to_ncbi_gene_for_unknown_uniprot(tmp_path):  # fixture row has the literal "UNKNOWN" in UniprotA
+    data_path = tmp_path / "BioPlex_HCT116_Network_5.5K_Dec_2019.tsv"
+    version_path = tmp_path / "bioplex_version.tsv"
+
+    data_path.write_text(
+        "\n".join(
+            [
+                '"GeneA"\t"GeneB"\t"UniprotA"\t"UniprotB"\t"SymbolA"\t"SymbolB"\t"pW"\t"pNI"\t"pInt"',
+                '"3012"\t"4673"\t"UNKNOWN"\t"P55209"\t"HIST1H2AE"\t"NAP1L1"\t"0.001"\t"0.002"\t"0.997"',  # GeneA=3012 paired with UniprotA="UNKNOWN"
+            ]
+        ),
+        encoding="utf-8",
+    )
+    version_path.write_text(
+        "\n".join(
+            [
+                "dataset\tfile\tversion\tversion_date",
+                "BioPlex 3.0 HCT116\tBioPlex_HCT116_Network_5.5K_Dec_2019.tsv\t3.0\t2024-01-19",
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+    adapter = BioPlexPPIAdapter(
+        file_path=str(data_path),
+        version_file_path=str(version_path),
+    )
+
+    batches = list(adapter.get_all())
+    edges = [edge for batch in batches for edge in batch]  # flatten the yielded batches
+
+    assert len(edges) == 1
+    assert {edges[0].start_node.id, edges[0].end_node.id} == {"NCBIGene:3012", "UniProtKB:P55209"}  # set comparison: endpoint order is not asserted
+
+
+def test_bioplex_adapter_honors_max_rows_on_kept_edges(tmp_path):  # four data rows, max_rows=2
+    data_path = tmp_path / "BioPlex_293T_Network_10K_Dec_2019.tsv"
+
+    data_path.write_text(
+        "\n".join(
+            [
+                '"GeneA"\t"GeneB"\t"UniprotA"\t"UniprotB"\t"SymbolA"\t"SymbolB"\t"pW"\t"pNI"\t"pInt"',
+                '"1"\t"1"\t"P11142"\t"P11142"\t"HSPA8"\t"HSPA8"\t"0.1"\t"0.2"\t"0.7"',  # self-pair: absent from the expected edges below
+                '"1"\t"2"\t"P11142"\t"P00813"\t"HSPA8"\t"ADA"\t"0.01"\t"0.02"\t"0.97"',
+                '"3"\t"4"\t"Q9Y3U8"\t"P36578"\t"RPL36"\t"RPL4"\t"0.03"\t"0.04"\t"0.93"',
+                '"5"\t"6"\t"P26373"\t"Q09028-3"\t"RPL13"\t"RBBP4"\t"0.05"\t"0.06"\t"0.89"',  # fourth row: not expected once two edges are kept
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+    adapter = BioPlexPPIAdapter(
+        file_path=str(data_path),  # no version_file_path is passed in this test
+        max_rows=2,
+    )
+
+    batches = list(adapter.get_all())
+    edges = [edge for batch in batches for edge in batch]  # flatten the yielded batches
+
+    assert len(edges) == 2  # the self-pair row is expected not to count toward max_rows
+    assert [(edge.start_node.id, edge.end_node.id) for edge in edges] == [
+        ("UniProtKB:P00813", "UniProtKB:P11142"),
+        ("UniProtKB:P36578", "UniProtKB:Q9Y3U8"),
+    ]
diff --git a/tests/test_input_adapter_canonical_edge_remap.py b/tests/test_input_adapter_canonical_edge_remap.py
index d7bf1c1..10ff25a 100644
--- a/tests/test_input_adapter_canonical_edge_remap.py
+++ b/tests/test_input_adapter_canonical_edge_remap.py
@@ -145,3 +145,31 @@ def test_input_adapter_keeps_measured_protein_edge_start_node_typed_as_measured_
     assert rel.start_node.id == "UniProtKB:P12345"
     assert isinstance(rel.end_node, Protein)
     assert rel.end_node.id == "UniProtKB:P12345"
+
+
+def test_input_adapter_preserves_existing_edge_sources_and_provenance():  # values set before resolution must come out unchanged
+    edge = ProteinPathwayEdge(
+        start_node=Protein(id="UniProtKB:P1"),
+        end_node=Pathway(id="Reactome:R-HSA-1"),
+        source="Reactome",
+    )
+    edge.sources = ["BioPlex\t3.0 (293T)\t2024-01-19\t2026-04-24", "Reactome\t96\t2026-03-24\t2026-04-24"]  # two pre-populated entries
+    edge.provenance = "BioPlex\t3.0 (293T)\t2024-01-19\t2026-04-24"
+    adapter = _SingleBatchAdapter([edge])
+
+    batches = list(adapter.get_resolved_and_provenanced_list({
+        "Protein": _IdentityResolver(
+            types=["Protein"],
+            no_match_behavior=NoMatchBehavior.Skip,
+            multi_match_behavior=MultiMatchBehavior.All,
+        ),
+        "Pathway": _IdentityResolver(
+            types=["Pathway"],
+            no_match_behavior=NoMatchBehavior.Skip,
+            multi_match_behavior=MultiMatchBehavior.All,
+        ),
+    }))
+    rel = batches[-1][0]  # first item of the last yielded batch
+
+    assert rel.sources == ["BioPlex\t3.0 (293T)\t2024-01-19\t2026-04-24", "Reactome\t96\t2026-03-24\t2026-04-24"]  # same two entries, same order
+    assert rel.provenance == "BioPlex\t3.0 (293T)\t2024-01-19\t2026-04-24"
diff --git a/tests/test_reactome_ppi.py b/tests/test_reactome_ppi.py
new file mode 100644
index 0000000..ed855ab
--- /dev/null
+++ b/tests/test_reactome_ppi.py
@@ -0,0 +1,74 @@
+from src.input_adapters.reactome.reactome_ppi import ReactomePPIAdapter
+
+
+def test_reactome_ppi_adapter_filters_non_protein_rows_and_keeps_context_and_pmids(tmp_path):  # two data rows, one expected edge
+    data_path = tmp_path / "reactome.homo_sapiens.interactions.tab-delimited.txt"
+    version_path = tmp_path / "reactome_version.tsv"
+
+    data_path.write_text(
+        "\n".join(
+            [
+                "# Interactor 1 uniprot id\tInteractor 1 Ensembl gene id\tInteractor 1 Entrez Gene id\tInteractor 2 uniprot id\tInteractor 2 Ensembl gene id\tInteractor 2 Entrez Gene id\tInteraction type\tInteraction context\tPubmed references",
+                "uniprotkb:P08123\t-\t-\tuniprotkb:P02452\t-\t-\tphysical association\treactome:R-HSA-2428940\t24243840",
+                "ChEBI:29035\t-\t-\tuniprotkb:P02452\t-\t-\tphysical association\treactome:R-HSA-123\t111",  # interactor 1 is a ChEBI id; this row is expected to be dropped
+            ]
+        ),
+        encoding="utf-8",
+    )
+    version_path.write_text("version\tversion_date\n96\t2026-03-24\n", encoding="utf-8")  # supplied, though get_version() is not asserted here
+
+    adapter = ReactomePPIAdapter(file_path=str(data_path), version_file_path=str(version_path))
+    edges = [edge for batch in adapter.get_all() for edge in batch]  # flatten the yielded batches
+
+    assert len(edges) == 1
+    assert edges[0].start_node.id == "UniProtKB:P02452"  # the row lists P08123 first; the expected edge starts at P02452
+    assert edges[0].end_node.id == "UniProtKB:P08123"
+    assert edges[0].interaction_type == ["physical association"]
+    assert edges[0].contexts == ["reactome:R-HSA-2428940"]
+    assert edges[0].pmids == [24243840]  # expected as int, not str
+    assert edges[0].sources == []  # no source labels expected on edges straight from get_all()
+
+
+def test_reactome_ppi_adapter_skips_self_pairs_and_honors_max_rows(tmp_path):  # three data rows, max_rows=1
+    data_path = tmp_path / "reactome.homo_sapiens.interactions.tab-delimited.txt"
+
+    data_path.write_text(
+        "\n".join(
+            [
+                "# Interactor 1 uniprot id\tInteractor 1 Ensembl gene id\tInteractor 1 Entrez Gene id\tInteractor 2 uniprot id\tInteractor 2 Ensembl gene id\tInteractor 2 Entrez Gene id\tInteraction type\tInteraction context\tPubmed references",
+                "uniprotkb:P08123\t-\t-\tuniprotkb:P08123\t-\t-\tphysical association\treactome:R-HSA-1\t1",  # self-pair (P08123 with itself)
+                "uniprotkb:P08123\t-\t-\tuniprotkb:P02452\t-\t-\tphysical association\treactome:R-HSA-2\t2|3",
+                "uniprotkb:P01160\t-\t-\tuniprotkb:P06727\t-\t-\tenzymatic reaction\treactome:R-HSA-3\t4",  # third row: not expected with max_rows=1
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+    adapter = ReactomePPIAdapter(file_path=str(data_path), max_rows=1)
+    edges = [edge for batch in adapter.get_all() for edge in batch]  # flatten the yielded batches
+
+    assert len(edges) == 1  # the self-pair row is expected not to consume the max_rows budget
+    assert edges[0].pmids == [2, 3]  # "2|3" from the second row, expected as two ints
+    assert edges[0].contexts == ["reactome:R-HSA-2"]
+
+
+def test_reactome_ppi_adapter_dedupes_pair_plus_type_and_accumulates_contexts_and_pmids(tmp_path):  # two rows, one merged edge expected
+    data_path = tmp_path / "reactome.homo_sapiens.interactions.tab-delimited.txt"
+
+    data_path.write_text(
+        "\n".join(
+            [
+                "# Interactor 1 uniprot id\tInteractor 1 Ensembl gene id\tInteractor 1 Entrez Gene id\tInteractor 2 uniprot id\tInteractor 2 Ensembl gene id\tInteractor 2 Entrez Gene id\tInteraction type\tInteraction context\tPubmed references",
+                "uniprotkb:P08123\t-\t-\tuniprotkb:P02452\t-\t-\tphysical association\treactome:R-HSA-1\t1|2",
+                "uniprotkb:P02452\t-\t-\tuniprotkb:P08123\t-\t-\tphysical association\treactome:R-HSA-2\t2|3",  # same pair and type as the row above, interactors reversed
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+    adapter = ReactomePPIAdapter(file_path=str(data_path))
+    edges = [edge for batch in adapter.get_all() for edge in batch]  # flatten the yielded batches
+
+    assert len(edges) == 1
+    assert edges[0].contexts == ["reactome:R-HSA-1", "reactome:R-HSA-2"]  # contexts from both rows
+    assert edges[0].pmids == [1, 2, 3]  # "1|2" and "2|3" merged without repeating 2
diff --git a/tests/test_string_ppi.py b/tests/test_string_ppi.py
index 86c227a..5d71c89 100644
--- a/tests/test_string_ppi.py
+++ b/tests/test_string_ppi.py
@@ -35,7 +35,7 @@ def test_string_ppi_adapter_applies_default_cutoff_and_skips_self_pairs(tmp_path
         ("ENSEMBL:ENSP0002", "ENSEMBL:ENSP0003", [400]),
         ("ENSEMBL:ENSP0003", "ENSEMBL:ENSP0004", [700]),
     ]
-    assert all(edge.sources == ["STRING"] for edge in edges)
+    assert all(edge.sources == [] for edge in edges)
 
     version = adapter.get_version()
     assert version.version == "12.0"
diff --git a/tests/test_tcrd_output_converter.py b/tests/test_tcrd_output_converter.py
index d25eeac..c9e75fb 100644
--- a/tests/test_tcrd_output_converter.py
+++ b/tests/test_tcrd_output_converter.py
@@ -179,6 +179,28 @@ def test_ppi_converter_joins_multiple_source_labels():
     assert all(row.score == 800 for row in rows)
 
 
+def test_ppi_converter_maps_reactome_pmids_and_interaction_type_downstream():  # feeds one Reactome-style edge dict to ppi_converter
+    converter = TCRDOutputConverter()
+    converter.id_mapping["protein"] = {
+        "IFX123": 123,
+        "IFX456": 456,
+    }
+
+    rows = converter.ppi_converter({
+        "start_id": "IFX123",
+        "end_id": "IFX456",
+        "sources": ["Reactome\t96\t2026-03-24\t2026-04-24"],
+        "pmids": [24243840, 11163199],
+        "contexts": ["reactome:R-HSA-2428940"],
+        "interaction_type": ["physical association"],
+        "provenance": "Reactome\t96\t2026-03-24\t2026-04-24",
+    })
+
+    assert len(rows) == 2  # NOTE(review): presumably one row per direction — confirm in ppi_converter
+    assert all(row.evidence == "24243840|11163199" for row in rows)  # both pmids joined with "|" in input order
+    assert all(row.interaction_type == "physical association" for row in rows)
+
+
 def test_gtex_converter_branches_gtex_details_from_shared_expression_edge():
     converter = TCRDOutputConverter()
     converter.id_mapping["protein"] = {"IFX123": 123}
diff --git a/workflows/pharos.Snakefile b/workflows/pharos.Snakefile
index 386f304..4e31fd5 100644
--- a/workflows/pharos.Snakefile
+++ b/workflows/pharos.Snakefile
@@ -24,7 +24,11 @@ rule all:
         "../input_files/auto/reactome/ReactomePathways.gmt.zip",
         "../input_files/auto/reactome/ReactomePathwaysRelation.txt",
         "../input_files/auto/reactome/UniProt2Reactome_All_Levels.txt",
+        "../input_files/auto/reactome/reactome.homo_sapiens.interactions.tab-delimited.txt",
         "../input_files/auto/reactome/reactome_version.tsv",
+        "../input_files/auto/bioplex/BioPlex_293T_Network_10K_Dec_2019.tsv",
+        "../input_files/auto/bioplex/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv",
+        "../input_files/auto/bioplex/bioplex_version.tsv",
         "../input_files/auto/string/9606.protein.links.v12.0.txt.gz",
         "../input_files/auto/string/string_version.tsv",
         "../input_files/auto/gtex/GTEx_Analysis_2025_08_22_v11_RNASeQCv2.4.3_gene_tpm.gct.gz",
@@ -182,15 +186,37 @@ rule download_reactome:
         "../input_files/auto/reactome/ReactomePathways.gmt.zip",
         "../input_files/auto/reactome/ReactomePathwaysRelation.txt",
         "../input_files/auto/reactome/UniProt2Reactome_All_Levels.txt",
+        "../input_files/auto/reactome/reactome.homo_sapiens.interactions.tab-delimited.txt",
         "../input_files/auto/reactome/reactome_version.tsv"
     shell:
         """
         curl -o {output[0]} https://reactome.org/download/current/ReactomePathways.gmt.zip
         curl -o {output[1]} https://reactome.org/download/current/ReactomePathwaysRelation.txt
         curl -o {output[2]} https://reactome.org/download/current/UniProt2Reactome_All_Levels.txt
-        last_modified=$(curl -fsI https://reactome.org/download/current/ReactomePathways.gmt.zip | awk -F': ' 'tolower($1)=="last-modified"{{print $2}}')
+        curl -fL -o {output[3]} https://reactome.org/download/current/interactors/reactome.homo_sapiens.interactions.tab-delimited.txt
+        last_modified=$(curl -fsI https://reactome.org/download/current/interactors/reactome.homo_sapiens.interactions.tab-delimited.txt | awk -F': ' 'tolower($1)=="last-modified"{{print $2}}')
         version=$(curl -fs https://reactome.org/ContentService/data/database/version)
-        python3 -c "import email.utils,sys; lm=sys.argv[1]; v=sys.argv[2].strip(); out=sys.argv[3]; dt=email.utils.parsedate_to_datetime(lm).date().isoformat(); open(out,'w').write('version\\tversion_date\\n'+v+'\\t'+dt+'\\n')" "$last_modified" "$version" {output[3]}
+        python3 -c "import email.utils,sys; lm=sys.argv[1]; v=sys.argv[2].strip(); out=sys.argv[3]; dt=email.utils.parsedate_to_datetime(lm).date().isoformat(); open(out,'w').write('version\\tversion_date\\n'+v+'\\t'+dt+'\\n')" "$last_modified" "$version" {output[4]}
+        """
+
+rule download_bioplex:
+    output:
+        "../input_files/auto/bioplex/BioPlex_293T_Network_10K_Dec_2019.tsv",  # output[0]: 293T network download
+        "../input_files/auto/bioplex/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv",  # output[1]: HCT116 network download
+        "../input_files/auto/bioplex/bioplex_version.tsv"  # output[2]: dataset/file/version/version_date table, one row per network, dates from the HTTP Last-Modified headers
+    shell:
+        """
+        mkdir -p ../input_files/auto/bioplex
+        p293t_url='https://bioplex.hms.harvard.edu/data/BioPlex_293T_Network_10K_Dec_2019.tsv'
+        hct116_url='https://bioplex.hms.harvard.edu/data/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv'
+
+        curl -fL -o {output[0]} "$p293t_url"
+        curl -fL -o {output[1]} "$hct116_url"
+
+        p293t_lm=$(curl -fsI "$p293t_url" | awk -F': ' 'tolower($1)=="last-modified"{{print $2}}')
+        hct116_lm=$(curl -fsI "$hct116_url" | awk -F': ' 'tolower($1)=="last-modified"{{print $2}}')
+
+        python3 -c "import email.utils,sys; p293t_lm,hct116_lm,out=sys.argv[1:4]; p293t_dt=email.utils.parsedate_to_datetime(p293t_lm).date().isoformat(); hct116_dt=email.utils.parsedate_to_datetime(hct116_lm).date().isoformat(); open(out,'w').write('dataset\\tfile\\tversion\\tversion_date\\nBioPlex 3.0 293T\\tBioPlex_293T_Network_10K_Dec_2019.tsv\\t3.0\\t'+p293t_dt+'\\nBioPlex 3.0 HCT116\\tBioPlex_HCT116_Network_5.5K_Dec_2019.tsv\\t3.0\\t'+hct116_dt+'\\n')" "$p293t_lm" "$hct116_lm" {output[2]}
         """
 
 rule download_string: