diff --git a/designs/ppi/string_ppi_ingest_design.md b/designs/ppi/string_ppi_ingest_design.md new file mode 100644 index 0000000..097639e --- /dev/null +++ b/designs/ppi/string_ppi_ingest_design.md @@ -0,0 +1,246 @@ +# STRING PPI Ingest Design + +## Status + +Implemented and validated in the working graph and working MySQL paths. + +This first pass covers **STRING human protein-protein interactions** only. +BioPlex and Reactome PPI remain follow-up sources. + +## Scope + +Implemented source: + +- STRING human PPI + +Explicitly deferred: + +- BioPlex PPI +- Reactome PPI +- source-specific `interaction_type` / `evidence` population + +## Files Added / Changed + +### New / updated ingest code + +- `src/input_adapters/string/string_ppi.py` +- `src/models/ppi.py` +- `workflows/pharos.Snakefile` +- `src/use_cases/working.yaml` + +### New / updated downstream code + +- `src/input_adapters/pharos_arango/tcrd/ppi.py` +- `src/output_adapters/sql_converters/tcrd.py` +- `src/use_cases/working_mysql.yaml` + +### Tests + +- `tests/test_string_ppi.py` +- `tests/test_ppi_record_merging.py` +- `tests/test_tcrd_output_converter.py` + +## Source Inputs + +Implemented download target: + +- `https://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz` + +Stored under: + +- `input_files/auto/string/9606.protein.links.v12.0.txt.gz` +- `input_files/auto/string/string_version.tsv` + +Version strategy: + +- `version`: hardcoded as `12.0` in the Snakemake rule +- `version_date`: derived from HTTP `Last-Modified` +- `download_date`: adapter file mtime unless explicitly present in version TSV + +## Raw File Profiling + +Observed file shape: + +- header: `protein1 protein2 combined_score` +- IDs are `9606.ENSP...` +- no self-pairs were observed in the raw file + +Observed counts from the local v12.0 file: + +- total rows: `13,715,404` +- rows kept at `score >= 400`: `1,858,944` +- rows filtered out below `400`: `11,856,460` +- percent filtered out by the cutoff: 
`86.45%` + +## Legacy Comparison + +Old TCRD loader reviewed: + +- `https://github.com/unmtransinfo/TCRD/blob/master/loaders/load-STRINGDB.py` + +Confirmed old behavior: + +- input file: `9606.protein.links.v11.0.txt` +- strips `9606.` and works with bare `ENSP...` +- inserts `ppitype='STRINGDB'` +- inserts scalar `score=` +- skips unmapped proteins +- skips self-pairs +- does **not** apply a score cutoff during load + +Confirmed old IFX_ODIN / Pharos readback behavior: + +- `src/input_adapters/pharos_mysql/ppi_adapter.py` keeps only `StringDB` rows with + `score >= 400` +- it filters `protein_id < other_id`, which implies `ncats_ppi` stores reciprocal + rows and the graph readback collapses them to one undirected edge + +## Implemented Graph Mapping + +`StringPPIAdapter` emits: + +- `PPIEdge` + - `start_node`: `Protein(id="ENSEMBL:ENSP...")` + - `end_node`: `Protein(id="ENSEMBL:ENSP...")` + - `sources`: STRING provenance list + - `score`: list-valued, emitted as `[combined_score]` + +Implementation choices: + +- `score_cutoff` is an adapter parameter with default `400` +- rows below the cutoff are discarded before they enter the graph +- self-pairs are discarded before they enter the graph +- `max_rows` is supported for bounded validation runs and counts **kept emitted + edges**, not scanned raw lines + +Intentionally not populated for STRING first pass: + +- `p_int` +- `p_ni` +- `p_wrong` +- `interaction_type` +- `evidence` + +Reason: + +- the selected STRING file only provides `combined_score` +- populating the legacy fields from guessed mappings would be speculative + +## Resolver Path + +STRING emits `ENSEMBL:ENSP...` protein IDs and relies on `tcrd_targets` for +canonicalization into reviewed target-graph proteins. 
+ +Observed partial-run behavior with: + +- `score_cutoff: 400` +- `max_rows: 10000` +- `collapse_reviewed_targets: true` + +Results: + +- adapter emitted `10,000` edges +- `test_pharos` stored `9,425` `PPIEdge` docs +- the difference was mostly edges dropped by the resolver due to gaps in reviewed-target + coverage, not adapter parsing or cutoff logic + +Spot checks performed during validation: + +- no adapter-emitted self-loops +- no graph self-loop `PPIEdge` docs +- graph `PPIEdge` score range: `400..999` + +## Graph Merge Behavior + +`PPIEdge.score` is now list-valued. + +Reason: + +- some final graph edges aggregate multiple STRING rows after target resolution and + reviewed-target collapse +- a scalar `score` was being overwritten under `KeepLast` +- a list-valued `score` preserves all merged values in the graph + +This was validated with a dedicated record-merging test. + +## Downstream MySQL Mapping + +`TCRDOutputConverter` now exports `PPIEdge` to `ncats_ppi`. + +Implemented behavior: + +- export reciprocal rows for parity with `pharos319` +- normalize graph source label `STRING` to legacy `StringDB` in `ppitypes` +- write `score = max(score_list)` for the SQL row + +Why `max(score)`: + +- the graph preserves all merged STRING scores +- the legacy SQL schema stores only one scalar `score` +- `max(score)` is an explicit, merge-order-independent first-pass collapse rule + +## Validation Results + +### Local test coverage + +Executed: + +- `pytest tests/test_string_ppi.py` +- `pytest tests/test_string_ppi.py tests/test_ppi_record_merging.py` +- `pytest tests/test_tcrd_output_converter.py` + +Result: + +- all targeted tests passed + +### Partial graph validation + +Working graph setup used: + +- reviewed UniProt proteins +- `StringPPIAdapter(score_cutoff=400, max_rows=10000)` +- database: `test_pharos` + +Observed graph contents: + +- `Protein`: `20,332` +- `PPIEdge`: `9,425` +- all `PPIEdge` rows carried STRING provenance +- score range in graph: `400..999` +- no graph self-loops + +### 
Working MySQL validation + +`working_mysql.yaml` was reduced to: + +- `ProteinAdapter` +- `ProteinPPIAdapter` + +Observed downstream contents in `pharos400_working`: + +- `ncats_ppi` rows: `18,850` +- score range: `400..999` + +Important note: + +- `18,850` rows correspond to reciprocal export of `9,425` graph edges +- however, not every unordered protein pair appears exactly twice +- some unordered pairs appear four times because multiple graph `PPIEdge` docs + collapse to the same canonical protein pair after reviewed-target resolution, + and each graph edge is then exported in both directions + +This is currently acceptable for parity-oriented validation, but it is a follow-up +cleanup topic if we later want the downstream SQL path to deduplicate canonical +pairs before export. + +## Open Follow-Ups + +- Profile whether STRING `.protein.links.full...` is worth revisiting for richer + channel-specific fields +- Add Reactome PPI ingest + - populate `interaction_type` + - decide whether the Reactome evidence/context column should map to `evidence` +- Add BioPlex PPI ingest +- Decide whether downstream `ncats_ppi` export should collapse duplicate canonical + pairs before reciprocal row generation, or continue to preserve one SQL row pair + per graph edge diff --git a/playbooks/ingest_playbook.md b/playbooks/ingest_playbook.md index 9747301..3b04c48 100644 --- a/playbooks/ingest_playbook.md +++ b/playbooks/ingest_playbook.md @@ -184,3 +184,8 @@ Provide a repeatable workflow for adding a new data source to the target graph i regenerates files on a weekly Sunday schedule — the `Last-Modified` header captures that publish date reliably. Write it to a small TSV in Snakemake so the adapter can read it as `version_date`. - **Use named parameters for `DatasourceVersionInfo`.** This avoids argument-order regressions when version handling evolves. 
+ +### Edge merging and downstream parity +- **If duplicate source rows can collapse onto the same final graph edge, make merge-sensitive edge payloads list-valued in the graph.** STRING PPIs needed `score: List[float]` instead of a scalar because reviewed-target collapse could merge multiple source rows onto one final `PPIEdge`; with a scalar, `KeepLast` would silently overwrite earlier scores. +- **A graph edge and a downstream SQL row do not need the same cardinality.** For PPIs, the graph should keep one conceptual undirected edge with merged payload, while the legacy `ncats_ppi` table expects reciprocal directed rows for query-order parity. Model the graph cleanly first, then adapt row cardinality explicitly in the SQL converter. +- **When a legacy downstream table stores one scalar but the graph preserves many values, choose and document an explicit collapse rule.** For STRING PPI export, `max(score)` was used for `ncats_ppi.score`; do not let scalar selection happen implicitly through merge order. 
diff --git a/src/constants.py b/src/constants.py index 1df535c..746005f 100755 --- a/src/constants.py +++ b/src/constants.py @@ -17,6 +17,7 @@ class DataSourceName(SimpleEnum): CCLE = "Cancer Cell Line Encyclopedia (CCLE)" Cellosaurus = "Cellosaurus" Reactome = "Reactome" + STRING = "STRING" WikiPathways = "WikiPathways" PathwayCommons = "PathwayCommons" CLO = "Cell Line Ontology (CLO)" diff --git a/src/input_adapters/pharos_arango/tcrd/ppi.py b/src/input_adapters/pharos_arango/tcrd/ppi.py new file mode 100644 index 0000000..5a86526 --- /dev/null +++ b/src/input_adapters/pharos_arango/tcrd/ppi.py @@ -0,0 +1,51 @@ +from typing import Generator, List + +from src.input_adapters.pharos_arango.tcrd.protein import PharosArangoAdapter +from src.models.datasource_version_info import DataSourceDetails +from src.models.ppi import PPIEdge +from src.models.protein import Protein + + +def protein_ppi_query(last_key: str = None, limit: int = 10000) -> str: + filter_clause = f'FILTER rel._key > "{last_key}"' if last_key else "" + return f""" + FOR rel IN `PPIEdge` + {filter_clause} + SORT rel._key + LIMIT {limit} + RETURN rel + """ + + +def ppi_version_query() -> str: + return """FOR rel IN `PPIEdge` LIMIT 1 RETURN rel.creation""" + + +class ProteinPPIAdapter(PharosArangoAdapter): + batch_size = 10_000 + + def get_all(self) -> Generator[List[PPIEdge], None, None]: + last_key = None + while True: + rows = list(self.runQuery(protein_ppi_query(last_key=last_key, limit=self.batch_size))) + if not rows: + break + + yield [ + PPIEdge( + start_node=Protein(id=row["start_id"]), + end_node=Protein(id=row["end_id"]), + provenance=row.get("provenance"), + sources=row.get("sources") or [], + p_int=row.get("p_int"), + p_ni=row.get("p_ni"), + p_wrong=row.get("p_wrong"), + score=row.get("score") or [], + ) + for row in rows + ] + last_key = rows[-1]["_key"] + + def get_version_info_query(self) -> DataSourceDetails: + raw_version_info = self.runQuery(ppi_version_query())[0] + return 
DataSourceDetails.parse_tsv(raw_version_info) diff --git a/src/input_adapters/pharos_mysql/ppi_adapter.py b/src/input_adapters/pharos_mysql/ppi_adapter.py index 8da92ee..7651c06 100755 --- a/src/input_adapters/pharos_mysql/ppi_adapter.py +++ b/src/input_adapters/pharos_mysql/ppi_adapter.py @@ -47,6 +47,6 @@ def get_all(self) -> List[Relationship]: p_int=row[3], p_ni=row[4], p_wrong=row[5], - score=row[6] + score=[row[6]] if row[6] is not None else [] ) for row in results] diff --git a/src/input_adapters/string/string_ppi.py b/src/input_adapters/string/string_ppi.py new file mode 100644 index 0000000..9256f4c --- /dev/null +++ b/src/input_adapters/string/string_ppi.py @@ -0,0 +1,100 @@ +import csv +import gzip +from datetime import date +from pathlib import Path +from typing import Generator, List, Optional + +from src.constants import DataSourceName, Prefix +from src.input_adapters.flat_file_adapter import FlatFileAdapter +from src.models.datasource_version_info import DatasourceVersionInfo +from src.models.ppi import PPIEdge +from src.models.protein import Protein + + +class StringPPIAdapter(FlatFileAdapter): + version_info: DatasourceVersionInfo + + def __init__( + self, + file_path: str, + version_file_path: Optional[str] = None, + score_cutoff: int = 400, + max_rows: Optional[int] = None, + ): + FlatFileAdapter.__init__(self, file_path=file_path) + self.score_cutoff = int(score_cutoff) + self.max_rows = max_rows + self.version_info = self._load_version_info(version_file_path) + + def _load_version_info(self, version_file_path: Optional[str]) -> DatasourceVersionInfo: + version = None + version_date = None + download_date = self.download_date + if version_file_path: + with open(version_file_path, "r", encoding="utf-8") as handle: + reader = csv.DictReader(handle, delimiter="\t") + first_row = next(reader, None) + if first_row: + version = first_row.get("version") or None + version_date_str = first_row.get("version_date") or None + download_date_str = 
first_row.get("download_date") or None + version_date = date.fromisoformat(version_date_str) if version_date_str else None + download_date = date.fromisoformat(download_date_str) if download_date_str else download_date + return DatasourceVersionInfo( + version=version, + version_date=version_date, + download_date=download_date, + ) + + def get_datasource_name(self) -> DataSourceName: + return DataSourceName.STRING + + def get_version(self) -> DatasourceVersionInfo: + return self.version_info + + def _open_input(self): + path = Path(self.file_path) + if path.suffix == ".gz": + return gzip.open(path, "rt", encoding="utf-8") + return open(path, "r", encoding="utf-8") + + @staticmethod + def _strip_taxon_prefix(protein_id: str) -> str: + if protein_id.startswith("9606."): + return protein_id.split(".", 1)[1] + return protein_id + + def get_all(self) -> Generator[List[PPIEdge], None, None]: + batch: List[PPIEdge] = [] + kept_rows = 0 + with self._open_input() as handle: + header = handle.readline().strip().split() + for line in handle: + if self.max_rows is not None and kept_rows >= self.max_rows: + break + parts = line.strip().split() + if not parts: + continue + row = dict(zip(header, parts)) + score = int(row["combined_score"]) + if score < self.score_cutoff: + continue + + protein1 = self._strip_taxon_prefix(row["protein1"]) + protein2 = self._strip_taxon_prefix(row["protein2"]) + if protein1 == protein2: + continue + protein1, protein2 = sorted((protein1, protein2)) + + edge = PPIEdge( + start_node=Protein(id=f"{Prefix.ENSEMBL}:{protein1}"), + end_node=Protein(id=f"{Prefix.ENSEMBL}:{protein2}"), + sources=[self.get_datasource_name().value], + score=[score], + ) + batch.append(edge) + kept_rows += 1 + if len(batch) >= self.batch_size: + yield batch + batch = [] + yield batch diff --git a/src/models/ppi.py b/src/models/ppi.py index 87e96d8..4ec3d32 100644 --- a/src/models/ppi.py +++ b/src/models/ppi.py @@ -13,4 +13,4 @@ class PPIEdge(Relationship): p_int: float = 
None p_ni: float = None p_wrong: float = None - score: float = None + score: List[float] = field(default_factory=list) diff --git a/src/output_adapters/sql_converters/tcrd.py b/src/output_adapters/sql_converters/tcrd.py index 43f6839..9246729 100755 --- a/src/output_adapters/sql_converters/tcrd.py +++ b/src/output_adapters/sql_converters/tcrd.py @@ -11,6 +11,7 @@ from src.models.node import EquivalentId from src.models.panther_class import PantherClass, ProteinPantherClassEdge from src.models.pathway import ProteinPathwayEdge +from src.models.ppi import PPIEdge from src.models.protein import Protein from src.models.tcrd_disease_ontology import MondoTerm, MondoTermParentEdge, DOTerm, DOTermParentEdge from src.models.tissue import Tissue, TissueParentEdge @@ -19,7 +20,7 @@ GeneRif, GeneRif2Pubmed, Protein2Pubmed, Ligand as mysqlLigand, LigandActivity, Uberon, UberonParent, Tissue as mysqlTissue, Expression, Gtex, Mondo, MondoParent, MondoXref, Disease as mysqlDisease, DiseaseType, DO, DOParent, - NcatsDisease, NcatsD2DA, Pathway as mysqlPathway, PantherClass as mysqlPantherClass, P2PC, + NcatsDisease, NcatsD2DA, Pathway as mysqlPathway, PantherClass as mysqlPantherClass, P2PC, PPI as mysqlPPI, DTO as mysqlDTO, DTOParent, P2DTO, ) from src.output_adapters.sql_converters.output_converter_base import SQLOutputConverter @@ -64,6 +65,8 @@ def __init__(self): ProteinDiseaseEdge: [self.disease_type_converter, self.disease_converter, self.ncats_d2da_converter], # Pathway ProteinPathwayEdge: [self.pathway_converter], + # PPI + PPIEdge: [self.ppi_converter], # Panther PantherClass: [self.panther_class_converter], ProteinPantherClassEdge: [self.p2pc_converter], @@ -612,6 +615,54 @@ def pathway_converter(self, obj: dict) -> mysqlPathway: provenance=obj['provenance'], ) + # --- PPI --- + + @staticmethod + def _ppi_source_label(source: str) -> str: + if not source: + return source + label = source.split('\t', 1)[0] + return {'STRING': 'StringDB'}.get(label, label) + + 
@staticmethod + def _max_or_value(value): + if isinstance(value, list): + return max(value) if value else None + return value + + def ppi_converter(self, obj: dict) -> List[mysqlPPI]: + protein_id = self.resolve_id('protein', obj['start_id']) + other_id = self.resolve_id('protein', obj['end_id']) + + source_labels = sorted({ + self._ppi_source_label(source) + for source in (obj.get('sources') or []) + if source + }) + + shared = dict( + ppitypes=",".join(source_labels) if source_labels else self._ppi_source_label(obj['provenance']), + p_int=self._max_or_value(obj.get('p_int')), + p_ni=self._max_or_value(obj.get('p_ni')), + p_wrong=self._max_or_value(obj.get('p_wrong')), + evidence=obj.get('evidence'), + interaction_type=obj.get('interaction_type'), + score=self._max_or_value(obj.get('score')), + ) + + return [ + mysqlPPI( + protein_id=protein_id, + other_id=other_id, + **shared, + ), + mysqlPPI( + protein_id=other_id, + other_id=protein_id, + **shared, + ), + ] + # --- Panther --- def panther_class_converter(self, obj: dict) -> mysqlPantherClass: diff --git a/src/use_cases/pharos/TCRD_TODO.md b/src/use_cases/pharos/TCRD_TODO.md index 27ecd57..1059e99 100644 --- a/src/use_cases/pharos/TCRD_TODO.md +++ b/src/use_cases/pharos/TCRD_TODO.md @@ -9,7 +9,7 @@ Status: [ ] not started | [~] in progress | [x] done - [x] Standardize edge/relationship class names — pick one convention (e.g. always end with `Edge`, drop `Relationship`) - [ ] `ProteinAdapter` (UniProt) yields `Pathway` nodes and `ProteinPathwayRelationship` edges in addition to `Protein` — consider splitting into separate adapters to make intent clear -- [ ] `ProteinDiseaseEdgeAdapter` (UniProt) yields `Disease` nodes in addition to edges — name already implies edges only, consider making the Disease node creation explicit +- [ ] Normalize old `pharos_mysql` adapters to use `EquivalentId(...).id_str()` consistently instead of manual `f"{Prefix...}:{...}"` string construction where they emit graph IDs. 
## POUNCE Code Style TODOs @@ -22,7 +22,7 @@ Status: [ ] not started | [~] in progress | [x] done Each row is a protein-facing Pharos/TCRD concept. Data source checkboxes = ingested into the Pharos graph or side-lifted into the protein-oriented Pharos view. MySQL table checkboxes = graph-derived converter output written to TCRD. | Concept | Data Sources (→ graph) | Arango Type | MySQL Tables (graph → TCRD) | -|---------|------------------------|-------------|------------------------------| +|---------|-----------------------|-------------|-----------------------| | **Protein** | [x] target_graph CSV
[x] UniProt reviewed
[x] JensenLab *(pm_score)*
[x] Antibodypedia *(antibody_count)*
[x] old Pharos MySQL *(idg_family)* | `Protein` | [x] `protein`
[x] `target`
[x] `t2tc`
[x] `alias`
[x] `xref`
[x] `tdl_info` | | **GeneRif** | [x] target_graph generif CSV | `GeneRif` | [x] `generif` | | **GeneGeneRifEdge** | [x] target_graph generif CSV | `GeneGeneRifEdge` | [x] `generif`
[x] `generif2pubmed`
[x] `protein2pubmed` | @@ -34,21 +34,21 @@ Each row is a protein-facing Pharos/TCRD concept. Data source checkboxes = inges | **ProteinGoTermEdge** | [x] UniProt GAF
[x] GO GAF | `ProteinGoTermEdge` | [x] `goa` | | **Ligand** | [x] IUPHAR
[x] ChEMBL
[x] DrugCentral | `Ligand` | [x] `ncats_ligands` | | **ProteinLigandEdge** | [x] IUPHAR
[x] ChEMBL
[x] DrugCentral | `ProteinLigandEdge` | [x] `ncats_ligand_activity` | -| **Disease** | [x] MONDO
[x] Disease Ontology
[x] UniProt curated
[x] CTD
[x] JensenLab DISEASES *(promoted in `pharos.yaml` / `target_graph.yaml`)*
[x] DrugCentral Indication | `Disease` | [x] `ncats_disease` | +| **Disease** | [x] MONDO
[x] Disease Ontology
[x] UniProt curated
[x] CTD
[x] JensenLab DISEASES
[x] DrugCentral Indication | `Disease` | [x] `ncats_disease` | | **DiseaseParentEdge** | [x] MONDO | `DiseaseParentEdge` | not exported from merged graph; source-file MONDO tables populate `mondo_parent` / `ancestry_mondo` below | | **DODiseaseParentEdge** | [x] Disease Ontology | `DODiseaseParentEdge` | not exported from merged graph; source-file DO tables populate `do_parent` / `ancestry_do` below | -| **ProteinDiseaseEdge** | [x] UniProt curated
[x] CTD *(side-lifted from gene associations by the TCRD target resolver)*
[x] JensenLab DISEASES *(Knowledge, Experiment/TIGA, and Text Mining; promoted in `pharos.yaml` / `target_graph.yaml`; working/full configs apply `textmining_min_zscore: 6.0` to stay close to historical Pharos text-mining scope)*
[x] DrugCentral Indication | `ProteinDiseaseEdge` | [x] `disease_type`
[x] `disease`
[x] `ncats_d2da` | +| **ProteinDiseaseEdge** | [x] UniProt curated
[x] CTD
[x] JensenLab DISEASES
[x] DrugCentral Indication | `ProteinDiseaseEdge` | [x] `disease_type`
[x] `disease`
[x] `ncats_d2da` | | **Pathway** | [x] UniProt
[x] Reactome
[x] WikiPathways
[x] PathwayCommons | `Pathway` | no standalone TCRD table; pathway content is duplicated via `ProteinPathwayEdge` into `pathway` | | **PathwayParentEdge** | [x] Reactome | `PathwayParentEdge` | not exported to legacy TCRD MySQL | -| **ProteinPathwayEdge** | [x] UniProt
[x] Reactome
[x] WikiPathways *(side-lifted from gene associations by the TCRD target resolver)*
[x] PathwayCommons *(side-lifted from gene associations by the TCRD target resolver)* | `ProteinPathwayEdge` | [x] `pathway` | -| **PantherClass** | [x] PANTHER Classes *(promoted in `pharos.yaml` / `target_graph.yaml`)* | `PantherClass` | [x] `panther_class` *(via `tcrd.yaml`; validated in `working_mysql.yaml` first)* | -| **ProteinPantherClassEdge** | [x] PANTHER Classes *(promoted in `pharos.yaml` / `target_graph.yaml`)* | `ProteinPantherClassEdge` | [x] `p2pc` *(via `tcrd.yaml`; validated in `working_mysql.yaml` first)* | +| **ProteinPathwayEdge** | [x] UniProt
[x] Reactome
[x] WikiPathways
[x] PathwayCommons | `ProteinPathwayEdge` | [x] `pathway` | +| **PPIEdge** | [x] STRING | `PPIEdge` | [x] `ncats_ppi` | +| **PantherClass** | [x] PANTHER Classes | `PantherClass` | [x] `panther_class` | +| **ProteinPantherClassEdge** | [x] PANTHER Classes | `ProteinPantherClassEdge` | [x] `p2pc` | | **DTOClass** | [x] old Pharos MySQL | `DTOClass` | current converter supports `dto`, but DTO is not wired in active `tcrd.yaml` | | **DTOClassParentEdge** | [x] old Pharos MySQL | `DTOClassParentEdge` | current converter supports `dto_parent`, but DTO is not wired in active `tcrd.yaml` | | **ProteinDTOClassEdge** | [x] old Pharos MySQL | `ProteinDTOClassEdge` | current converter supports `p2dto`, but DTO is not wired in active `tcrd.yaml` | | **Keyword** | [x] UniProt | `Keyword` | no standalone TCRD table; keyword content is duplicated via `ProteinKeywordEdge` into `xref` | -| **ProteinKeywordEdge** | [x] UniProt | `ProteinKeywordEdge` | [x] `xref` *(UniProt Keyword xtype)* | -| | *— post-processing (pharos_aql_post.yaml) —* | | | +| **ProteinKeywordEdge** | [x] UniProt | `ProteinKeywordEdge` | [x] `xref` | | | | **SetPreferredSymbolAdapter** | [x] computed from graph | updates `preferred_symbol` on `Protein` | *(via Protein → `protein.preferred_symbol`)* | | **SetLigandActivityFlagAdapter** | [x] computed from graph | updates `meets_idg_cutoff` on `ProteinLigandEdge` | *(via ProteinLigandEdge)* | | **SetGoTermLeafFlagAdapter** | [x] computed from graph | updates `is_leaf` on `GoTerm` | *(via GoTerm)* | @@ -71,7 +71,7 @@ These tables are populated directly from ontology source files during the TCRD b - maybe ClinGen - old pharos didn't have it, but maybe it's useful ### New Concepts -- Protein-Protein Interactions — STRING, BioPlex, Reactome PPI +- Protein-Protein Interactions — BioPlex, Reactome PPI - Orthologs — OMA, EggNOG, Inparanoid - Phenotype — IMPC, JAX/MGI - GWAS @@ -83,9 +83,6 @@ These tables are populated directly from ontology source files during the TCRD b 
- Nearest Tclin (computed from graph) - Publication Statistics (PubMed Score, PubTator) -### Refactoring / Polish -- Normalize old `pharos_mysql` adapters to use `EquivalentId(...).id_str()` consistently instead of manual `f"{Prefix...}:{...}"` string construction where they emit graph IDs. - ### Simple Linkouts - Dark Kinase Knowledgebase — understudied kinases compendium - RESOLUTE — solute carrier (SLC) target class resource diff --git a/src/use_cases/pharos/pharos.yaml b/src/use_cases/pharos/pharos.yaml index 7159259..1b1f768 100644 --- a/src/use_cases/pharos/pharos.yaml +++ b/src/use_cases/pharos/pharos.yaml @@ -138,6 +138,13 @@ input_adapters: file_path: ./input_files/auto/uniprot/uniprot-human-reviewed.json.gz version_file_path: ./input_files/auto/uniprot/uniprot_version.tsv + - import: ./src/input_adapters/string/string_ppi.py + class: StringPPIAdapter + kwargs: + file_path: ./input_files/auto/string/9606.protein.links.v12.0.txt.gz + version_file_path: ./input_files/auto/string/string_version.tsv + score_cutoff: 400 + - import: ./src/input_adapters/target_graph/generif_node.py class: GeneRifNodeAdapter kwargs: diff --git a/src/use_cases/pharos/target_graph.yaml b/src/use_cases/pharos/target_graph.yaml index 23e1745..872caff 100644 --- a/src/use_cases/pharos/target_graph.yaml +++ b/src/use_cases/pharos/target_graph.yaml @@ -150,6 +150,13 @@ input_adapters: file_path: ./input_files/auto/uniprot/uniprot-human.json.gz version_file_path: ./input_files/auto/uniprot/uniprot_version.tsv + - import: ./src/input_adapters/string/string_ppi.py + class: StringPPIAdapter + kwargs: + file_path: ./input_files/auto/string/9606.protein.links.v12.0.txt.gz + version_file_path: ./input_files/auto/string/string_version.tsv + score_cutoff: 400 + - import: ./src/input_adapters/target_graph/protein_nodes_and_edges.py class: IsoformProteinEdgeAdapter kwargs: diff --git a/src/use_cases/pharos/tcrd.yaml b/src/use_cases/pharos/tcrd.yaml index d90187d..ff65f3f 100644 --- 
a/src/use_cases/pharos/tcrd.yaml +++ b/src/use_cases/pharos/tcrd.yaml @@ -54,6 +54,12 @@ input_adapters: kwargs: database_name: *source_database + - import: ./src/input_adapters/pharos_arango/tcrd/ppi.py + class: ProteinPPIAdapter + credentials: *source_credentials + kwargs: + database_name: *source_database + - import: ./src/input_adapters/pharos_arango/tcrd/go_terms.py class: GoTermAdapter credentials: *source_credentials diff --git a/src/use_cases/working.yaml b/src/use_cases/working.yaml index e446415..1d38496 100644 --- a/src/use_cases/working.yaml +++ b/src/use_cases/working.yaml @@ -34,37 +34,13 @@ input_adapters: file_path: ./input_files/manual/target_graph/protein_ids.tsv collapse_reviewed_targets: true - - import: ./src/input_adapters/file_uniprot/protein_adapter.py - class: ProteinAdapter + - import: ./src/input_adapters/string/string_ppi.py + class: StringPPIAdapter kwargs: - file_path: ./input_files/auto/uniprot/uniprot-human-reviewed.json.gz - version_file_path: ./input_files/auto/uniprot/uniprot_version.tsv - - - import: ./src/input_adapters/pharos_arango/set_preferred_symbol.py - class: SetPreferredSymbolAdapter - kwargs: - database_name: test_pharos - credentials: *destination_credentials - - - import: ./src/input_adapters/panther/panther_classes.py - class: PantherClassesAdapter - kwargs: - class_file_path: ./input_files/auto/panther/Protein_Class_19.0 - relationship_file_path: ./input_files/auto/panther/Protein_class_relationship - sequence_classification_file_path: ./input_files/auto/panther/PTHR19.0_human - version_file_path: ./input_files/auto/panther/panther_classes_version.tsv - - - import: ./src/input_adapters/pharos_mysql/dto_adapter.py - class: DTOClassAdapter - credentials: ./src/use_cases/secrets/pharos_credentials.yaml - - - import: ./src/input_adapters/pharos_mysql/dto_adapter.py - class: DTOClassParentEdgeAdapter - credentials: ./src/use_cases/secrets/pharos_credentials.yaml - - - import: 
./src/input_adapters/pharos_mysql/dto_adapter.py - class: ProteinDTOClassAdapter - credentials: ./src/use_cases/secrets/pharos_credentials.yaml + file_path: ./input_files/auto/string/9606.protein.links.v12.0.txt.gz + version_file_path: ./input_files/auto/string/string_version.tsv + score_cutoff: 400 + max_rows: 10000 output_adapters: - import: ./src/output_adapters/arango_output_adapter.py diff --git a/src/use_cases/working_mysql.yaml b/src/use_cases/working_mysql.yaml index cbd62b4..543d1aa 100644 --- a/src/use_cases/working_mysql.yaml +++ b/src/use_cases/working_mysql.yaml @@ -20,100 +20,18 @@ resolvers: - Transcript input_adapters: -# - import: ./src/input_adapters/pharos_source_tcrd/ontology_tables.py -# class: MondoTableAdapter -# kwargs: -# file_path: ./input_files/auto/mondo/mondo.json -# -# - import: ./src/input_adapters/pharos_source_tcrd/ontology_tables.py -# class: MondoTableParentEdgeAdapter -# kwargs: -# file_path: ./input_files/auto/mondo/mondo.json -# -# - import: ./src/input_adapters/pharos_source_tcrd/ontology_tables.py -# class: DOTableAdapter -# kwargs: -# file_path: ./input_files/auto/disease_ontology/doid.json -# -# - import: ./src/input_adapters/pharos_source_tcrd/ontology_tables.py -# class: DOTableParentEdgeAdapter -# kwargs: -# file_path: ./input_files/auto/disease_ontology/doid.json - - import: ./src/input_adapters/pharos_arango/tcrd/protein.py class: ProteinAdapter credentials: *source_credentials kwargs: database_name: *source_database -# - import: ./src/input_adapters/pharos_arango/tcrd/tissue.py -# class: TissueAdapter -# credentials: *source_credentials -# kwargs: -# database_name: *source_database -# -# - import: ./src/input_adapters/pharos_arango/tcrd/tissue.py -# class: ExpressionAdapter -# credentials: *source_credentials -# kwargs: -# database_name: *source_database -# max_rows: 10000 - -# - import: ./src/input_adapters/pharos_arango/tcrd/pathway.py -# class: ProteinPathwayAdapter -# credentials: *source_credentials -# kwargs: -# 
database_name: *source_database -# - - import: ./src/input_adapters/pharos_arango/tcrd/panther.py - class: PantherClassAdapter - credentials: *source_credentials - kwargs: - database_name: *source_database - - - import: ./src/input_adapters/pharos_arango/tcrd/panther.py - class: ProteinPantherClassAdapter - credentials: *source_credentials - kwargs: - database_name: *source_database - - - import: ./src/input_adapters/pharos_arango/tcrd/dto.py - class: DTOClassAdapter + - import: ./src/input_adapters/pharos_arango/tcrd/ppi.py + class: ProteinPPIAdapter credentials: *source_credentials kwargs: database_name: *source_database - - import: ./src/input_adapters/pharos_arango/tcrd/dto.py - class: DTOClassParentAdapter - credentials: *source_credentials - kwargs: - database_name: *source_database - - - import: ./src/input_adapters/pharos_arango/tcrd/dto.py - class: ProteinDTOClassAdapter - credentials: *source_credentials - kwargs: - database_name: *source_database - -# - import: ./src/input_adapters/pharos_arango/tcrd/keyword.py -# class: ProteinKeywordAdapter -# credentials: *source_credentials -# kwargs: -# database_name: *source_database -# -# - import: ./src/input_adapters/pharos_arango/tcrd/disease.py -# class: DiseaseAdapter -# credentials: *source_credentials -# kwargs: -# database_name: *source_database -# associated_only: true -# -# - import: ./src/input_adapters/pharos_arango/tcrd/disease.py -# class: ProteinDiseaseAdapter -# credentials: *source_credentials -# kwargs: -# database_name: *source_database - output_adapters: - import: ./src/output_adapters/mysql_output_adapter.py class: TCRDOutputAdapter diff --git a/tests/test_ppi_record_merging.py b/tests/test_ppi_record_merging.py new file mode 100644 index 0000000..b9916bb --- /dev/null +++ b/tests/test_ppi_record_merging.py @@ -0,0 +1,30 @@ +from src.shared.record_merger import RecordMerger, FieldConflictBehavior + + +def test_ppi_scores_merge_as_list_values(): + merger = 
RecordMerger(field_conflict_behavior=FieldConflictBehavior.KeepLast) + + records = [ + { + "start_id": "IFXProtein:A", + "end_id": "IFXProtein:B", + "score": [475], + "sources": ["STRING\t12.0\t2023-05-16\t2026-04-16"], + "entity_resolution": "STRING\tStringPPIAdapter\tENSEMBL:ENSP1\tENSEMBL:ENSP2", + "provenance": "STRING\t12.0\t2023-05-16\t2026-04-16", + }, + { + "start_id": "IFXProtein:A", + "end_id": "IFXProtein:B", + "score": [477], + "sources": ["STRING\t12.0\t2023-05-16\t2026-04-16"], + "entity_resolution": "STRING\tStringPPIAdapter\tENSEMBL:ENSP3\tENSEMBL:ENSP4", + "provenance": "STRING\t12.0\t2023-05-16\t2026-04-16", + }, + ] + + merged = merger.merge_records(records, {}, nodes_or_edges="edges") + + assert len(merged) == 1 + assert sorted(merged[0]["score"]) == [475, 477] + assert any(line.startswith("score\t1 entries already there\t1 entries being merged") for line in merged[0]["updates"]) diff --git a/tests/test_string_ppi.py b/tests/test_string_ppi.py new file mode 100644 index 0000000..86c227a --- /dev/null +++ b/tests/test_string_ppi.py @@ -0,0 +1,126 @@ +import gzip + +from src.input_adapters.string.string_ppi import StringPPIAdapter + + +def test_string_ppi_adapter_applies_default_cutoff_and_skips_self_pairs(tmp_path): + data_path = tmp_path / "9606.protein.links.v12.0.txt.gz" + version_path = tmp_path / "string_version.tsv" + + with gzip.open(data_path, "wt", encoding="utf-8") as handle: + handle.write( + "\n".join( + [ + "protein1 protein2 combined_score", + "9606.ENSP0001 9606.ENSP0002 399", + "9606.ENSP0001 9606.ENSP0001 900", + "9606.ENSP0002 9606.ENSP0003 400", + "9606.ENSP0003 9606.ENSP0004 700", + ] + ) + ) + + version_path.write_text("version\tversion_date\n12.0\t2025-01-15\n", encoding="utf-8") + + adapter = StringPPIAdapter( + file_path=str(data_path), + version_file_path=str(version_path), + ) + + batches = list(adapter.get_all()) + edges = [edge for batch in batches for edge in batch] + + assert len(edges) == 2 + assert 
[(edge.start_node.id, edge.end_node.id, edge.score) for edge in edges] == [ + ("ENSEMBL:ENSP0002", "ENSEMBL:ENSP0003", [400]), + ("ENSEMBL:ENSP0003", "ENSEMBL:ENSP0004", [700]), + ] + assert all(edge.sources == ["STRING"] for edge in edges) + + version = adapter.get_version() + assert version.version == "12.0" + assert version.version_date.isoformat() == "2025-01-15" + + +def test_string_ppi_adapter_supports_explicit_cutoff_override(tmp_path): + data_path = tmp_path / "9606.protein.links.v12.0.txt" + + data_path.write_text( + "\n".join( + [ + "protein1 protein2 combined_score", + "9606.ENSP0001 9606.ENSP0002 250", + "9606.ENSP0002 9606.ENSP0003 300", + ] + ), + encoding="utf-8", + ) + + adapter = StringPPIAdapter( + file_path=str(data_path), + score_cutoff=300, + ) + + batches = list(adapter.get_all()) + edges = [edge for batch in batches for edge in batch] + + assert len(edges) == 1 + assert edges[0].start_node.id == "ENSEMBL:ENSP0002" + assert edges[0].end_node.id == "ENSEMBL:ENSP0003" + assert edges[0].score == [300] + + +def test_string_ppi_adapter_honors_max_rows_on_kept_edges(tmp_path): + data_path = tmp_path / "9606.protein.links.v12.0.txt" + + data_path.write_text( + "\n".join( + [ + "protein1 protein2 combined_score", + "9606.ENSP0001 9606.ENSP0002 250", + "9606.ENSP0002 9606.ENSP0002 900", + "9606.ENSP0003 9606.ENSP0004 400", + "9606.ENSP0004 9606.ENSP0005 500", + "9606.ENSP0005 9606.ENSP0006 600", + ] + ), + encoding="utf-8", + ) + + adapter = StringPPIAdapter( + file_path=str(data_path), + max_rows=2, + ) + + batches = list(adapter.get_all()) + edges = [edge for batch in batches for edge in batch] + + assert len(edges) == 2 + assert [(edge.start_node.id, edge.end_node.id, edge.score) for edge in edges] == [ + ("ENSEMBL:ENSP0003", "ENSEMBL:ENSP0004", [400]), + ("ENSEMBL:ENSP0004", "ENSEMBL:ENSP0005", [500]), + ] + + +def test_string_ppi_adapter_canonicalizes_pair_direction(tmp_path): + data_path = tmp_path / "9606.protein.links.v12.0.txt" + + 
data_path.write_text( + "\n".join( + [ + "protein1 protein2 combined_score", + "9606.ENSP9999 9606.ENSP0001 500", + ] + ), + encoding="utf-8", + ) + + adapter = StringPPIAdapter(file_path=str(data_path)) + + batches = list(adapter.get_all()) + edges = [edge for batch in batches for edge in batch] + + assert len(edges) == 1 + assert edges[0].start_node.id == "ENSEMBL:ENSP0001" + assert edges[0].end_node.id == "ENSEMBL:ENSP9999" + assert edges[0].score == [500] diff --git a/tests/test_tcrd_output_converter.py b/tests/test_tcrd_output_converter.py index e71c574..7b1715a 100644 --- a/tests/test_tcrd_output_converter.py +++ b/tests/test_tcrd_output_converter.py @@ -96,6 +96,50 @@ def test_pathway_converter_keeps_pwtype_without_lookup_table(): assert row.id_in_source == "R-HSA-199420" +def test_ppi_converter_emits_reciprocal_rows_with_max_score_and_legacy_stringdb_label(): + converter = TCRDOutputConverter() + converter.id_mapping["protein"] = { + "IFX123": 123, + "IFX456": 456, + } + + rows = converter.ppi_converter({ + "start_id": "IFX456", + "end_id": "IFX123", + "sources": ["STRING\t12.0\t2023-05-16\t2026-04-16"], + "score": [475, 477], + "provenance": "STRING\t12.0\t2023-05-16\t2026-04-16", + }) + + assert len(rows) == 2 + assert {(row.protein_id, row.other_id) for row in rows} == {(123, 456), (456, 123)} + assert all(row.ppitypes == "StringDB" for row in rows) + assert all(row.score == 477 for row in rows) + + +def test_ppi_converter_joins_multiple_source_labels(): + converter = TCRDOutputConverter() + converter.id_mapping["protein"] = { + "IFX123": 123, + "IFX456": 456, + } + + rows = converter.ppi_converter({ + "start_id": "IFX123", + "end_id": "IFX456", + "sources": [ + "Reactome\t95\t2025-11-27\t2026-03-23", + "STRING\t12.0\t2023-05-16\t2026-04-16", + ], + "score": [800], + "provenance": "STRING\t12.0\t2023-05-16\t2026-04-16", + }) + + assert len(rows) == 2 + assert all(row.ppitypes == "Reactome,StringDB" for row in rows) + assert all(row.score == 800 for row 
in rows) + + def test_gtex_converter_branches_gtex_details_from_shared_expression_edge(): converter = TCRDOutputConverter() converter.id_mapping["protein"] = {"IFX123": 123} diff --git a/workflows/pharos.Snakefile b/workflows/pharos.Snakefile index 565df34..386f304 100644 --- a/workflows/pharos.Snakefile +++ b/workflows/pharos.Snakefile @@ -25,6 +25,8 @@ rule all: "../input_files/auto/reactome/ReactomePathwaysRelation.txt", "../input_files/auto/reactome/UniProt2Reactome_All_Levels.txt", "../input_files/auto/reactome/reactome_version.tsv", + "../input_files/auto/string/9606.protein.links.v12.0.txt.gz", + "../input_files/auto/string/string_version.tsv", "../input_files/auto/gtex/GTEx_Analysis_2025_08_22_v11_RNASeQCv2.4.3_gene_tpm.gct.gz", "../input_files/auto/gtex/GTEx_Analysis_v11_Annotations_SampleAttributesDS.txt", "../input_files/auto/gtex/GTEx_Analysis_v11_Annotations_SubjectPhenotypesDS.txt", @@ -191,6 +193,19 @@ rule download_reactome: python3 -c "import email.utils,sys; lm=sys.argv[1]; v=sys.argv[2].strip(); out=sys.argv[3]; dt=email.utils.parsedate_to_datetime(lm).date().isoformat(); open(out,'w').write('version\\tversion_date\\n'+v+'\\t'+dt+'\\n')" "$last_modified" "$version" {output[3]} """ +rule download_string: + output: + "../input_files/auto/string/9606.protein.links.v12.0.txt.gz", + "../input_files/auto/string/string_version.tsv" + shell: + """ + mkdir -p ../input_files/auto/string + url='https://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz' + curl -fL -o {output[0]} "$url" + last_modified=$(curl -fsI "$url" | awk -F': ' 'tolower($1)=="last-modified"{{print $2}}') + python3 -c "import email.utils,sys; lm=sys.argv[1]; out=sys.argv[2]; dt=email.utils.parsedate_to_datetime(lm).date().isoformat(); open(out,'w').write('version\\tversion_date\\n12.0\\t'+dt+'\\n')" "$last_modified" {output[1]} + """ + rule download_gtex: output: "../input_files/auto/gtex/GTEx_Analysis_2025_08_22_v11_RNASeQCv2.4.3_gene_tpm.gct.gz",