From 19ec19b1afd65908b2c8f707530d1eea844f95a7 Mon Sep 17 00:00:00 2001 From: kelleherkj Date: Wed, 15 Apr 2026 14:05:23 -0400 Subject: [PATCH 1/2] harmonizome investigation --- designs/harmonizome/investigation.md | 276 +++++++++++++++++++++++++++ designs/harmonizome/sources.csv | 147 ++++++++++++++ src/use_cases/pharos/TCRD_TODO.md | 2 +- 3 files changed, 424 insertions(+), 1 deletion(-) create mode 100644 designs/harmonizome/investigation.md create mode 100644 designs/harmonizome/sources.csv diff --git a/designs/harmonizome/investigation.md b/designs/harmonizome/investigation.md new file mode 100644 index 0000000..eb81c6d --- /dev/null +++ b/designs/harmonizome/investigation.md @@ -0,0 +1,276 @@ +# Harmonizome Investigation Notes + +Date: 2026-04-15 + +## Goal + +Investigate whether the current Harmonizome release should be ingested into IFX_ODIN / Pharos, and if so, what the right model boundary is. + +This started from the `New Concepts` note in [TCRD_TODO.md](/Users/kelleherkj/IdeaProjects/IFX_ODIN/src/use_cases/pharos/TCRD_TODO.md:69). + +## High-Level Conclusion + +Do not start by ingesting Harmonizome wholesale. + +The old Pharos/TCRD use of Harmonizome was mostly a generic summary-attribute layer over targets, not a source-native graph ingest. After inspecting the current Harmonizome catalog, API, and the old `pharos319` landing tables, the cleaner direction is likely: + +1. preserve this investigation as reference +2. avoid a full Harmonizome ingest for now +3. compute analogous high-level summary metrics directly from the IFX_ODIN graph + +Harmonizome may still be useful later as a source of selected refreshed datasets, but it does not currently look like the best path for a first-pass new Pharos concept. 
+ +## What Old Pharos Did + +Historical loader: +- [`load-Harmonizome.py`](https://github.com/unmtransinfo/TCRD/blob/master/loaders/load-Harmonizome.py) + +Observed legacy behavior: +- one `gene_attribute_type` row per Harmonizome dataset +- one `gene_attribute` row per matched `protein_id` and gene set +- provenance recorded for: + - `gene_attribute` + - `gene_attribute_type` + - `hgram_cdf` + +Important implication: +- old Pharos did not use Harmonizome as a native disease/pathway/tissue/PPI graph source +- it used Harmonizome as a target-facing summary layer + +## What Landed In `pharos319` + +Read-only inspection of `pharos319` showed: + +- `gene_attribute_type`: `113` rows +- `gene_attribute`: `65,549,760` rows +- `hgram_cdf`: `1,167,880` rows + +Representative `gene_attribute_type.name` values: +- `GTEx Tissue Gene Expression Profiles` +- `DISEASES Text-mining Gene-Disease Assocation Evidence Scores` +- `Virus MINT Protein-Viral Protein Interactions` +- `Reactome Pathways` + +Representative `gene_attribute` pattern: +- `protein_id` +- `gat_id` +- `name` = gene set name +- `value` = threshold-like integer value + +Representative `hgram_cdf` pattern: +- `protein_id` +- `type` = dataset name +- `attr_count` +- `attr_cdf` + +`hgram_cdf.type` matched `gene_attribute_type.name` directly for most legacy dataset types. + +Interpretation: +- `gene_attribute_type` identified the dataset +- `gene_attribute` stored per-target membership / score against attributes within that dataset +- `hgram_cdf` stored derived summary statistics over those rows + +## Current Harmonizome Surface + +Current public release: +- Harmonizome 3.0 + +Verified current sources: +- [About](https://maayanlab.cloud/Harmonizome/about) +- [What's New](https://maayanlab.cloud/Harmonizome/whatsNew) +- [Download](https://maayanlab.cloud/Harmonizome/download) +- [Documentation](https://maayanlab.cloud/Harmonizome/documentation) + +Key observations: +- The catalog is mixed. 
+ - some refreshed datasets are clearly current (`2023`-`2026`) + - many legacy datasets are still present + - some old entries are explicitly archived +- The download page currently exposes more datasets than the legacy downloader script. + - live catalog extracted into [sources.csv](/Users/kelleherkj/IdeaProjects/IFX_ODIN/designs/harmonizome/sources.csv) + - legacy `harmonizomedownloader.py` only hard-codes `129` datasets +- The current site still uses the same abstract model: + - dataset/resource metadata + - generic `Category` + - generic `Attribute` + - gene-attribute matrices / edge lists / gene set libraries + +## Attribute Vocabulary + +The current `Attribute` vocabulary from the download page is extremely close to the old `gene_attribute_type.attribute_type` vocabulary in old Pharos. + +Main result: +- this appears to be the same conceptual field, carried forward with minor drift + +Observed current-only additions: +- `cell type` +- `glycan` + +Important nuance: +- `Attribute` is not unique to one `Category` +- the same attribute values can appear under multiple categories + +Examples from [sources.csv](/Users/kelleherkj/IdeaProjects/IFX_ODIN/designs/harmonizome/sources.csv): +- `tissue`: `proteomics`, `structural or functional annotations`, `transcriptomics` +- `cell line`: `disease or phenotype associations`, `genomics`, `proteomics`, `transcriptomics` +- `microRNA`: `genomics`, `physical interactions` +- `protein complex`: `proteomics`, `structural or functional annotations` + +Interpretation: +- `Dataset` is still the primary identity +- `Category` and `Attribute` are dataset metadata, not stable graph concept keys by themselves + +## Current API Findings + +Useful endpoints from the docs: +- `GET /api/1.0/gene` +- `GET /api/1.0/gene/{symbol}` +- `GET /api/1.0/gene/{symbol}?showAssociations=true` + +Live fetch inspected: +- `GET https://maayanlab.cloud/Harmonizome/api/1.0/gene/DRD2?showAssociations=true` + +Observed gene payload fields: +- `symbol` +- `synonyms` +- 
`name` +- `description` +- `ncbiEntrezGeneId` +- `ncbiEntrezGeneUrl` +- `proteins` +- `hgncRootFamilies` +- `associations` + +Observed association payload fields: +- `geneSet.name` +- `geneSet.href` +- `thresholdValue` +- `standardizedValue` + +Important identifier result: +- the live payload includes `ncbiEntrezGeneId` +- when ingesting, the intended identifier policy should be: + - use `NCBIGene:` when available + - otherwise use `Symbol:` + +Important shape result: +- `geneSet.name` looks like: + - `{attribute name}/{dataset name}` + +Examples: +- `697/Achilles Cell Line Gene Essentiality Profiles` +- `nucleus accumbens, right/Allen Brain Atlas Adult Human Brain Tissue Gene Expression Profiles` + +Interpretation: +- the API is gene-centric +- the API likely gives enough to recover: + - dataset name + - attribute name + - association sign / standardized score +- but a full API crawl would still be expensive and slow + +## Gene vs Protein + +Current Harmonizome is still fundamentally gene-first. + +Evidence: +- gene pages and API expose `ncbiEntrezGeneId` +- associations are attached to gene entities +- old loader mapped those gene associations onto `protein_id` in TCRD + +Recommended semantic boundary: +- if Harmonizome were ingested, target graph should treat it as gene-level first +- protein-facing Pharos/TCRD behavior would be a later side-lift + +## Modeling Options Considered + +### Option A: Fully model attributes as nodes + +Potential shape: +- `GeneAttributeType` +- `GeneAttribute` +- `GeneGeneAttributeEdge` + +Rejected for first pass because: +- too much graph expansion +- old Pharos mostly treated attribute names as payload, not as reusable graph concepts + +### Option B: Dataset node plus edge details + +Potential shape: +- `GeneAttributeType` node per dataset +- `GeneGeneAttributeEdge` from `Gene` to `GeneAttributeType` +- `details` containing per-attribute entries + +This was the best ingest-style fit we found. 
+ +Potential detail fields: +- `attribute_name` +- `gene_set_name` +- `gene_set_href` +- `threshold_value` +- `standardized_value` + +Benefits: +- much closer to old `gene_attribute` +- much less graph bloat +- preserves dataset identity cleanly + +### Option C: Do not ingest Harmonizome; compute our own summary metrics + +This is the direction that currently looks best. + +Reasoning: +- the real value old Pharos got from Harmonizome was high-level summary statistics +- IFX_ODIN already has a richer graph than old TCRD in many concept areas +- we can compute summary metrics directly from our own graph instead of importing Ma'ayan Lab's summary layer + +Examples of graph-derived summaries we could compute ourselves: +- disease counts / percentiles +- pathway counts / percentiles +- tissue/expression breadth +- GO annotation breadth +- ligand / MoA breadth +- publication / GeneRIF metrics +- future phenotype or PPI breadth if those concepts are added + +## `attr_count` / `attr_cdf` + +Important clarification from the legacy schema: +- `attr_count` and `attr_cdf` are derived summary values +- they are not the raw association payload + +Likely intended meaning: +- `attr_count` = number of associated attributes for one dataset type for one protein +- `attr_cdf` = empirical cumulative distribution value over those counts for that dataset type + +This is not simple min-max scaling. + +If we ever reproduce this behavior: +- compute from graph associations in post-processing +- not during raw adapter ingest + +## Current Recommendation + +Do not implement Harmonizome ingest yet. 
+ +Instead: +- treat this investigation as closed for now +- if we revisit, start from a graph-derived summary-metrics design +- only pull specific Harmonizome datasets later if there is a concrete gap our graph does not already cover + +If Harmonizome is revisited later, the most defensible ingest boundary would be: +- `GeneAttributeType` node per dataset +- `GeneGeneAttributeEdge` with per-attribute `details` +- optional side-lift to protein-facing export for Pharos + +## Artifacts Produced + +- [sources.csv](/Users/kelleherkj/IdeaProjects/IFX_ODIN/designs/harmonizome/sources.csv) +- this note + +## Open Questions If Revisited + +- Which graph-derived summary metrics would actually be most useful in Pharos UX? +- Should those summaries live only in post-processing / MySQL export, or also as graph fields? +- Are there any current Harmonizome datasets that still fill genuine gaps after accounting for direct sources already present in IFX_ODIN? diff --git a/designs/harmonizome/sources.csv b/designs/harmonizome/sources.csv new file mode 100644 index 0000000..bcb4a5a --- /dev/null +++ b/designs/harmonizome/sources.csv @@ -0,0 +1,147 @@ +Resource,Dataset,Description,Category,Attribute,Views,Archived +Achilles,Cell Line Gene Essentiality Profiles,Fitness scores for cell lines following single gene knockdowns,disease or phenotype associations,cell line,27402,FALSE +Allen Brain Atlas,Adult Human Brain Tissue Gene Expression Profiles,mRNA expression profiles for 6 adult human brain tissue samples spanning ~300 brain structures,transcriptomics,tissue,46368,FALSE +Allen Brain Atlas,Adult Mouse Brain Tissue Gene Expression Profiles,mRNA expression profiles for adult mouse brain tissues spanning ~2000 anatomically defined brain structures,transcriptomics,tissue,22736,FALSE +Allen Brain Atlas,Developing Human Brain Tissue Gene Expression Profiles by Microarray,mRNA expression profiles for human brain tissue samples spanning 27 time points and 26 brain 
structures,transcriptomics,tissue sample,11072,FALSE +Allen Brain Atlas,Developing Human Brain Tissue Gene Expression Profiles by RNA-seq,mRNA expression profiles for human brain tissue samples spanning 31 time points and 26 brain structures,transcriptomics,tissue sample,9901,FALSE +Allen Brain Atlas,Prenatal Human Brain Tissue Gene Expression Profiles,mRNA expression profiles for 4 human prenatal brain tissue samples spanning 4 time points and ~300 brain structures,transcriptomics,tissue,11522,FALSE +Biocarta,Pathways,Sets of proteins participating in pathways from Biocarta,structural or functional annotations,pathway,85074,FALSE +BioGPS,Cell Line Gene Expression Profiles,mRNA expression profiles for the NCI-60 panel of cancer cell lines,transcriptomics,cell line,53275,FALSE +BioGPS,Human Cell Type and Tissue Gene Expression Profiles,mRNA expression profiles for human tissues and cell types,transcriptomics,cell type or tissue,23586,FALSE +BioGPS,Mouse Cell Type and Tissue Gene Expression Profiles,mRNA expression profiles for mouse tissues and cell types,transcriptomics,cell type or tissue,29091,FALSE +Cancer Cell Line Encyclopedia (CCLE),Cell Line Gene CNV Profiles,Gene-level copy number variation profiles for cancer cell lines,genomics,cell line,40435,FALSE +Cancer Cell Line Encyclopedia (CCLE),Cell Line Gene Expression Profiles,mRNA microarray expression profiles for cancer cell lines,transcriptomics,cell line,131737,FALSE +Cancer Cell Line Encyclopedia (CCLE),Cell Line Gene Mutation Profiles,Gene-level mutation profiles for cancer cell lines,genomics,cell line,41238,FALSE +ChIP-X Enrichment Analysis (ChEA),Transcription Factor Targets,Target genes of transcription factors from published ChIP-chip ChIP-seq and other transcription factor binding site profiling studies,genomics,transcription factor,109401,FALSE +Connectivity Map (CMAP),Signatures of Differentially Expressed Genes for Small Molecules,mRNA expression profiles for cell lines following chemical 
perturbation,transcriptomics,small molecule perturbation,17592,FALSE +CORUM,Protein Complexes,Proteins participating in complexes by manual literature curation,structural or functional annotations,protein complex,21336,FALSE +Catalogue of Somatic Mutations In Cancer (COSMIC),Cell Line Gene CNV Profiles,Gene-level copy number variation profiles for cancer cell lines,genomics,cell line,16497,FALSE +Catalogue of Somatic Mutations In Cancer (COSMIC),Cell Line Gene Mutation Profiles,Gene mutations in cancer cell lines from low-throughput or high-throughput studies,genomics,cell line,31096,FALSE +Comparative Toxicogenomics Database (CTD),Gene-Chemical Interactions,Chemical-gene interactions curated from literature,physical interactions,chemical,139178,FALSE +Comparative Toxicogenomics Database (CTD),Gene-Disease Associations,Disease-gene interactions from manually curated literature,disease or phenotype associations,disease,43154,FALSE +Database of Genotypes and Phenotypes (dbGAP),Gene-Trait Associations,Gene-trait associations curated from genetic association studies,disease or phenotype associations,trait,10329,FALSE +Dephosphorylation Database (DEPOD),Substrates of Phosphatases,Phosphatase-substrate interactions manually curated from literature and databases of protein annotations or protein interactions,physical interactions,phosphatase,5543,FALSE +DrugBank,Drug Targets,Sets of proteins targeted by drugs by manual literature curation,physical interactions,drug,119120,FALSE +Encyclopedia of DNA Elements (ENCODE),Histone Modification Site Profiles,Histone site modification profiles for cell lines from ENCODE,genomics,histone modification site profile,10798,FALSE +Encyclopedia of DNA Elements (ENCODE),Transcription Factor Binding Site Profiles,Transcription factor binding site profiles for cell lines,genomics,transcription factor binding site profile,24381,FALSE +Encyclopedia of DNA Elements (ENCODE),Transcription Factor Targets,Target genes of transcription factors 
from transcription factor binding site profiles,genomics,transcription factor,152744,FALSE +ESCAPE,Omics Signatures of Genes and Proteins for Stem Cells,Sets of target genes of transcription factors from published ChIP-chip ChIP-seq and other transcription factor binding site profiling studies;sets of differentially expressed genes following perturbation of a protein from gene expression data in GEO; sets of interacting proteins from high- and low-throughput protein-protein interaction studies; sets of targets of microRNAs from public databases computationally predicted or experimentally verified,transcriptomics,PubMedID,7022,FALSE +Genetic Association Database (GAD),Gene-Disease Associations,Gene-disease associations curated from genetic association studies,disease or phenotype associations,disease,45396,FALSE +Genetic Association Database (GAD),High Level Gene-Disease Associations,Gene-disease associations curated from genetic association studies,disease or phenotype associations,disease,9514,FALSE +Genomics of Drug Sensitivity in Cancer (GDSC),Cell Line Gene Expression Profiles,mRNA microarray expression profiles for cancer cell lines,transcriptomics,cell line,21317,FALSE +Gene Reference Into Function (GeneRIF),Biological Term Annotations,Statements describing functions of genes distilled from biomedical publications,structural or functional annotations,biological term,38260,FALSE +GeneSigDB,Published Gene Signatures,Gene signatures reported in the literature that were derived from analysis of transcriptomic or proteomic data,transcriptomics,PubMedID,9969,FALSE +Gene Expression Omnibus (GEO),Signatures of Differentially Expressed Genes for Diseases,mRNA expression profiles for cell lines or tissues following disease perturbation,transcriptomics,disease perturbation,18110,FALSE +Gene Expression Omnibus (GEO),Signatures of Differentially Expressed Genes for Gene Perturbations,mRNA expression profiles for cell lines or tissues following genetic perturbation 
(knockdown knockout over-expression mutation),transcriptomics,gene perturbation,10904,FALSE +Gene Expression Omnibus (GEO),Signatures of Differentially Expressed Genes for Kinase Perturbations,mRNA expression profiles for cell lines or tissues following kinase perturbation (inhibition activation knockdown knockout over-expression mutation),transcriptomics,kinase perturbation,10221,FALSE +Gene Expression Omnibus (GEO),Signatures of Differentially Expressed Genes for Small Molecules,mRNA expression profiles for cell lines or tissues following small molecule perturbation,transcriptomics,small molecule perturbation,8817,FALSE +Gene Expression Omnibus (GEO),Signatures of Differentially Expressed Genes for Transcription Factor Perturbations,mRNA expression profiles for cell lines or tissues following transcription factor perturbation (inhibition activation knockdown knockout over-expression mutation),transcriptomics,transcription factor perturbation,14623,FALSE +Gene Expression Omnibus (GEO),Signatures of Differentially Expressed Genes for Viral Infections,mRNA expression profiles for cell lines or tissues following viral infection,transcriptomics,virus perturbation,10599,FALSE +Genotype Tissue Expression (GTEx),Tissue Sample Gene Expression Profiles,RNA-seq gene expression profiles for tissue samples from GTEx,transcriptomics,tissue sample,15278,FALSE +Guide to Pharmacology,Chemical Ligands of Receptors,Chemical ligand-receptor interactions curated by experts,physical interactions,ligand (chemical),31564,FALSE +Guide to Pharmacology,Protein Ligands of Receptors,Protein ligand-receptor interactions curated by experts,physical interactions,ligand (protein),4165,FALSE +GWASdb,SNP-Disease Associations,SNP-disease association p-values curated from published GWAS,disease or phenotype associations,disease,33894,FALSE +GWASdb,SNP-Phenotype Associations,SNP-phenotype association p-values curated from published GWAS,disease or phenotype associations,phenotype,11955,FALSE +Heiser 
et al. PNAS 2011,Cell Line Gene Expression Profiles,mRNA expression profiles for breast cancer cell lines measured by microarray,transcriptomics,cell line,7207,FALSE +Human Metabolome Database,Metabolites of Enzymes,Biomolecular interactions between metabolites and proteins such as processing enzymes curated from literature,physical interactions,metabolite,15374,FALSE +Human Protein Atlas (HPA),Cell Line Gene Expression Profiles,RNA-seq gene expression profiles for cell lines from HPA,transcriptomics,cell line,14693,FALSE +Human Protein Atlas (HPA),Tissue Gene Expression Profiles,RNA-seq gene expression profiles for tissues from HPA,transcriptomics,tissue,15269,FALSE +Human Protein Atlas (HPA),Tissue Protein Expression Profiles,Semiquantitative protein expression profiles for tissues,proteomics,tissue,7142,FALSE +Human Protein Atlas (HPA),Tissue Sample Gene Expression Profiles,RNA-seq gene expression profiles for tissue samples from HPA,transcriptomics,tissue sample,6667,FALSE +Human Proteome Map (HPM),Cell Type and Tissue Protein Expression Profiles,Protein expression profiles for tissues and cell types,proteomics,cell type or tissue,5399,FALSE +Human Phenotype Ontology (HPO),Gene-Disease Associations,Phenotype-causing gene mutations of human phenotypes from disease knowledgebases,disease or phenotype associations,phenotype,41843,FALSE +Hub Proteins,Protein-Protein Interactions,Sets of proteins interacting with hub proteins aggregated from data,physical interactions,hub protein,14664,FALSE +HuGE Navigator,Gene-Phenotype Associations,Gene-phenotype associations extracted from published GWAS by automated text-mining,disease or phenotype associations,phenotype,14962,FALSE +HumanCyc,Pathways,Sets of proteins participating in pathways from HumanCyc,structural or functional annotations,pathway,6521,FALSE +InterPro,Predicted Protein Domain Annotations,Protein domains predicted for gene products based on sequence similarity to known domain signatures,structural or 
functional annotations,protein domain,23220,FALSE +Kinase Enrichment Analysis (KEA),Substrates of Kinases,Protein substrates of kinases from published low-throughput and high-throughput phosphoproteomics studies,physical interactions,kinase,11864,FALSE +LINCS Kinativ,Kinase Inhibitor Bioactivity Profiles,Percent inhibition of kinases by small molecules measured in cell lysates,physical interactions,chemical bioactivity profile,3853,FALSE +LINCS KinomeScan,Kinase Inhibitor Targets,Kinase inhibitor targets from percent inhibition of kinases by small molecules measured using purified kinases,physical interactions,small molecule,6705,FALSE +Klijn et al. Nat. Biotechnol. 2015,Cell Line Gene CNV Profiles,Gene-level copy number variation profiles for cancer cell lines,genomics,cell line,10068,FALSE +Klijn et al. Nat. Biotechnol. 2015,Cell Line Gene Expression Profiles,RNA-seq gene expression profiles for cancer cell lines,transcriptomics,cell line,15315,FALSE +Klijn et al. Nat. Biotechnol. 2015,Cell Line Gene Mutation Profiles,SNP gene mutations in cancer cell lines identified by microarray,genomics,cell line,9699,FALSE +LOCATE,Curated Protein Localization Annotations,Subcellular localization of proteins from low-throughput or high-throughput protein localization assays,proteomics,cellular component,9006,FALSE +MiRTarBase,microRNA Targets,Target genes of microRNAs from published experiments,physical interactions,microRNA,24342,FALSE +MotifMap,Predicted Transcription Factor Targets,Target genes of transcription factors predicted using known transcription factor binding site motifs,genomics,transcription factor,30714,FALSE +Molecular Signatures Database (MSigDB),Cancer Gene Co-expression Modules,Computational signatures of genes co-expressed with cancer related genes,transcriptomics,co-expressed gene,9717,FALSE +Molecular Signatures Database (MSigDB),Signatures of Differentially Expressed Genes for Cancer Gene Perturbations,Oncogenic signatures of genes differentially 
expressed following cancer gene perturbations,transcriptomics,gene perturbation,8749,FALSE +Nuclear Receptor Signaling Atlas (NURSA),Protein Complexes,Proteins identified in complexes isolated from cultured cells,proteomics,protein complex,13205,FALSE +Nuclear Receptor Signaling Atlas (NURSA),Protein-Protein Interactions,Protein-protein interactions inferred from membership in complexes,proteomics,interacting protein,10222,FALSE +Online Mendelian Inheritance in Man (OMIM),Gene-Disease Associations,Disease- or phenotype-causing gene mutations for heritable human diseases or phenotypes curated from biomedical publications,disease or phenotype associations,phenotype,29083,FALSE +PANTHER,Pathways,Sets of proteins participating in pathways from PANTHER,structural or functional annotations,pathway,13028,FALSE +Pathway Commons (PC),Protein-Protein Interactions,Protein-protein interactions from low-throughput or high-throughput studies aggregated by Pathway Commons from the following databases: Reactome NCI Pathways PhosphoSite HumanCyc HPRD PANTHER DIP BioGRID IntAct BIND Transfac MiRTarBase Drugbank Recon X Comparative Toxicogenomics Database and KEGG,physical interactions,interacting protein,81664,FALSE +Phosphosite Textmining,Biological Term Annotations,Occurrence frequencies for biological terms in abstracts of publications describing phosphosites,structural or functional annotations,biological term,4115,FALSE +PhosphoSitePlus,Substrates of Kinases,Kinase-substrate interactions curated from low-throughput or high-throughput phosphoproteomics studies,physical interactions,kinase,16361,FALSE +Pathway Interaction Database (PID),Pathways,Sets of proteins participating in pathways from PID,structural or functional annotations,pathway,18482,FALSE +Proteomics Database (ProteomicsDB),Cell Type and Tissue Protein Expression Profiles,Protein expression profiles for tissues and cell types reprocessed from many proteomics datasets,proteomics,cell type or tissue,16719,FALSE 
+Roadmap Epigenomics,Cell and Tissue DNA Methylation Profiles,DNA methylation profiles for primary cell types and tissues,genomics,cell type or tissue,11555,FALSE +Roadmap Epigenomics,Cell and Tissue Gene Expression Profiles,mRNA expression profiles for primary cell types and tissues,transcriptomics,cell type or tissue,7754,FALSE +Roadmap Epigenomics,Histone Modification Site Profiles,Histone modification profiles for primary cells and tissues,genomics,histone modification site profile,8798,FALSE +SILAC Phosphoproteomics,Signatures of Differentially Phosphorylated Proteins for Drugs,Phosphorylation levels of proteins in cell lines following drug treatment,proteomics,drug perturbation,5187,FALSE +SILAC Phosphoproteomics,Signatures of Differentially Phosphorylated Proteins for Gene Perturbations,Phosphorylation levels of proteins in cell lines following genetic perturbation (knockdown knockout over-expression mutation),proteomics,gene perturbation,3149,FALSE +SILAC Phosphoproteomics,Signatures of Differentially Phosphorylated Proteins for Protein Ligands,Phosphorylation levels of proteins in cell lines following ligand treatment,proteomics,ligand (protein) perturbation,4446,FALSE +TargetScan,Predicted Conserved microRNA Targets,Target genes of microRNAs predicted by searching genes for sites matching conserved miRNA seed regions,genomics,microRNA,8990,FALSE +The Cancer Genome Atlas (TCGA),Signatures of Differentially Expressed Genes for Tumors,mRNA expression profiles for tumor and normal tissue samples,transcriptomics,tissue sample,21862,FALSE +Virus MINT,Protein-Viral Protein Interactions,Interactions between viral and human proteins manually curated from literature,physical interactions,viral protein,7940,FALSE +Virus MINT,Protein-Virus Interactions,Interactions between viruses and human proteins manually curated from literature,physical interactions,virus,5341,FALSE +LOCATE,Predicted Protein Localization Annotations,Subcellular localization of proteins by 
sequence similarity to localization sequences,proteomics,cellular component,5904,FALSE +TargetScan,Predicted Nonconserved microRNA Targets,Target genes of microRNAs predicted by searching genes for sites matching nonconserved miRNA seed regions,genomics,microRNA,7651,FALSE +PhosphoSitePlus,Phosphosite-Disease Associations,Disease-phosphosite associations curated from literature,disease or phenotype associations,disease,5006,FALSE +Sanger Cancer Dependency Map (DepMap),Cancer Cell Line Proteomics,Protein intensity values acquired using data-independent acquisition mass spectrometry (DIA-MS).,proteomics,cell line,4215,FALSE +knockTF,Gene Expression Profiles with Transcription Factor Perturtbations,Gene expression profiles for cell lines or tissues following transcription factor perturbation (knockdown/knockout),transcriptomics,transcription factor perturbation,4397,FALSE +Kinase Library,Serine Threonine Kinome Atlas,A phosphoproteomics atlas detailing phosphorylation of protein substrates by 303 serine/threonine kinases in the human kinome.,proteomics,kinase,4235,FALSE +DeepCoverMOA,Drug Mechanisms of Action,Protein expression profiles in HCT116 cell line following drug treatment,proteomics,small molecule perturbation,3778,FALSE +Tabula Sapiens,Gene-Cell Associations,Gene expression data from Tabula Sapiens scRNA-seq counts,transcriptomics,cell type,4844,FALSE +DisGeNET,Gene-Disease Associations,Gene-disease associations sourced from curated repositories GWAS catalogues animal models and the scientific literature,disease or phenotype associations,disease,35106,FALSE +DisGeNET,Gene-Phenotype Associations,Gene-phenotype associations sourced from curated repositories GWAS catalogues animal models and the scientific literature,disease or phenotype associations,phenotype,13614,FALSE +Cancer Cell Line Encyclopedia (CCLE),Cell Line Proteomics,Protein intensity values acquired using mass spectrometry across human cancerous cell lines,proteomics,cell line,7452,FALSE +Cancer 
Dependency Map (DepMap),CRISPR Gene Dependency,Dependency scores for cell lines following single gene knockdowns,disease or phenotype associations,cell line,9182,FALSE +ChIP-X Enrichment Analysis (ChEA),Transcription Factor Targets 2022,Target genes of transcription factors from published ChIP-chip ChIP-seq and other transcription factor binding site profiling studies,genomics,transcription factor,11824,FALSE +Mammalian Phenotype Ontology (MPO),Mouse Phenotype Associations 2023,Observed phenotypes of transgenic mice collected from mouse phenotyping studies,disease or phenotype associations,phenotype,10232,FALSE +LINCS L1000 Connectivity Map (L1000 CMAP),CRISPR Knockout Consensus Signatures,Gene association consensus signatures following CRISPR gene knockout,transcriptomics,gene perturbation,33071,FALSE +Molecular Transducers of Physical Activity Consortium (MoTrPAC),Rat Endurance Exercise Training Transcriptomics,RNA-seq gene expression profiles for rat tissue samples across 4 time points and 19 tissues,transcriptomics,tissue sample,1728,FALSE +GlyGen,Glycosylated Proteins,Proteins glycosylated by saccharide ligands from glycosylation site citations,proteomics,glycan,2766,FALSE +International Mouse Phenotyping Consortium (IMPC),Knockout Mouse Phenotypes,Observed phenotypes of mice following gene knockout,disease or phenotype associations,phenotype,4107,FALSE +Metabolomics Workbench (MW),Enzyme Metabolite Associations,Biomolecular interactions between metabolites and proteins curated from experimental studies,physical interactions,metabolite,1701,FALSE +LINCS L1000 Connectivity Map (L1000 CMAP),Chemical Perturbations Consensus Signatures,Gene association consensus signatures following small molecule perturbation,transcriptomics,small molecule perturbation,7838,FALSE +Genotype Tissue Expression (GTEx),Tissue Gene Expression Profiles 2023,Gene expression profiles for tissues from GTEx by RNA-seq,transcriptomics,tissue,5124,FALSE +Genotype Tissue Expression 
(GTEx),Tissue-Specific Aging Signatures,Tissue-specific aging signatures created from GTEx RNA-seq gene expression profiles,transcriptomics,tissue sample,5299,FALSE +The Human BioMolecular Atlas Program (HuBMAP),Azimuth Cell Type Annotations,Gene-cell type annotations from integrated reference scRNA-seq gene expression profiles,transcriptomics,cell type,4734,FALSE +SynGO,Synaptic Gene Annotations,Curated annotations of genes with synaptic processes and components,structural or functional annotations,biological term,2421,FALSE +CellMarker,Gene-Cell Type Associations,Cell type markers across various tissues from human and mouse scRNA-seq,transcriptomics,cell type,8697,FALSE +Kinase Library,Tyrosine Kinome Atlas,A phosphoproteomics atlas detailing phosphorylation of protein substrates by 93 canonical and non-canonical tyrosine kinases in the human kinome.,proteomics,kinase,1852,FALSE +Pathway Figure Optical Character Recognition (PFOCR),Pathway Figure Associations 2024,Sets of genes/proteins extracted from pathway figures in research publications using optical character recognition updated for 2024,structural or functional annotations,pathway,20023,FALSE +WikiPathways,Pathways 2024,Sets of proteins participating in pathways from WikiPathways updated for 2024,structural or functional annotations,pathway,3596,FALSE +PerturbAtlas,Signatures of Differentially Expressed Genes for Gene Perturbations,Gene expression profiles for cell lines cell types tissues and models following genetic perturbation (knockdown knockout knockin over-expression mutation and multi-condition),transcriptomics,gene perturbation,6487,FALSE +PerturbAtlas,Signatures of Differentially Expressed Genes for Mouse Gene Perturbations,Gene expression profiles for cell lines cell types tissues and models following genetic perturbation (knockdown knockout knockin over-expression mutation and multi-condition),transcriptomics,gene perturbation,1580,FALSE +Reactome,Pathways 2024,Sets of proteins participating 
in pathways from Reactome updated for 2024,structural or functional annotations,pathway,11023,FALSE +The Human BioMolecular Atlas Program (HuBMAP),ASCT+B Annotations,Anatomical structure and cell type biomarker annotations from the HuBMAP ASCT+B tables,structural or functional annotations,cell type,2068,FALSE +RummaGEO,Gene Perturbation Signatures,Single gene perturbation signatures produced by querying RummaGEO metadata for knockouts knockdowns and over-expression conditions,transcriptomics,gene perturbation,1196,FALSE +RummaGEO,Drug Perturbation Signatures,Drug perturbation signatures produced from automatically mined RNA-seq samples from GEO.,transcriptomics,drug perturbation,1567,FALSE +The Human BioMolecular Atlas Program (HuBMAP),ASCT+B Augmented with RNA-seq Coexpression,Anatomical structure and cell type biomarker annotations from the HuBMAP ASCT+B tables augmented with RNA-seq coexpression data from ARCHS4,structural or functional annotations,cell type,1321,FALSE +Gene Ontology (GO),Biological Process Annotations 2025,curated annotations of genes with biological processes,structural or functional annotations,biological process,5279,FALSE +Gene Ontology (GO),Cellular Component Annotations 2025,curated annotations of genes with cellular components,structural or functional annotations,cellular component,2545,FALSE +Gene Ontology (GO),Molecular Function Annotations 2025,curated annotations of genes with molecular functions,structural or functional annotations,molecular function,797,FALSE +DISEASES,Curated Gene-Disease Assocation Evidence Scores 2025,Disease gene evidence scores by manual literature curation,disease or phenotype associations,disease,10970,FALSE +DISEASES,Experimental Gene-Disease Association Evidence Scores 2025,Disease gene evidence scores by integrating experimental data (GWAS),disease or phenotype associations,disease,13578,FALSE +DISEASES,Text-mining Gene-Disease Association Evidence Scores 2025,Gene-disease co-occurrence scores from 
text-mining biomedical abstracts,disease or phenotype associations,disease,49658,FALSE +Novartis Institutes for Biomedical Research (NIBR),DRUG-seq U2OS MoA Box,Drug perturbation signatures created by profiling 4343 small molecules in the U2-OS cell line with the DRUG-seq platform,transcriptomics,drug perturbation,28722,FALSE +Genotype Tissue Expression (GTEx),eQTL 2025,Significance values for all gene-SNP pairs testing likelihood that SNP affects gene expression,genomics,SNP,8554,FALSE +TISSUES,Curated Tissue Protein Expression Evidence Scores 2025,Protein tissue expression evidence scores by manual literature curation,structural or functional annotations,tissue,10389,FALSE +TISSUES,Experimental Tissue Protein Expression Evidence Scores 2025,Protein tissue expression evidence scores by integrating experimental data,proteomics,cell type or tissue,8374,FALSE +TISSUES,Text-mining Tissue Protein Expression Evidence Scores 2025,Gene-tissue co-occurrence scores from text-mining biomedical abstracts,structural or functional annotations,cell type or tissue,31613,FALSE +COMPARTMENTS,Curated Protein Localization Evidence Scores 2025,Protein subcellular localization evidence scores by manual literature curation,structural or functional annotations,cellular component,1704,FALSE +COMPARTMENTS,Experimental Protein Localization Evidence Scores 2025,Protein subcellular localization evidence scores by integrating experimental data,proteomics,cellular component,867,FALSE +COMPARTMENTS,Text-mining Protein Localization Evidence Scores 2025,Gene-cellular compartment co-occurrence scores from text-mining biomedical abstracts,structural or functional annotations,cellular component,10913,FALSE +Replogle et al. Cell 2022,K562 Genome-wide Perturb-seq Gene Perturbation Signatures,Gene expression profiles in the K562 cell line following CRISPRi genetic perturbation,transcriptomics,gene perturbation,4640,FALSE +Replogle et al. 
Cell 2022,K562 Essential Perturb-seq Gene Perturbation Signatures,Gene expression profiles in the K562 cell line following CRISPRi genetic perturbation of essential genes,transcriptomics,gene perturbation,6222,FALSE +Replogle et al. Cell 2022,RPE1 Essential Perturb-seq Gene Perturbation Signatures,Gene expression profiles in the RPE1 cell line following CRISPRi genetic perturbation of essential genes,transcriptomics,gene perturbation,4346,FALSE +Tahoe Therapeutics,Tahoe 100M Perturbation Atlas,Single-cell expression data from drug perturbation across cell lines in over 95 million cells,transcriptomics,drug perturbation,8291,FALSE +ClinVar,Gene-Phenotype Associations 2025,SNP-phenotype associations curated by ClinVar users from various sources,disease or phenotype associations,phenotype,23808,FALSE +Jaspar PWMs,Predicted Human Transcription Factor Targets 2025,Target genes of human transcription factors predicted using known transcription factor binding site motifs,genomics,transcription factor,48056,FALSE +Jaspar PWMs,Predicted Mouse Transcription Factor Targets 2025,Target genes of mouse transcription factors predicted using known transcription factor binding site motifs,genomics,transcription factor,2133,FALSE +Cell Maps for AI (CM4AI),U2OS Cell Map Protein Localization Assemblies,Protein localization assemblies constructed from integrating AP-MS biomolecular interaction and IF imaging data,structural or functional annotations,biological term,660,FALSE +GWAS Catalog,SNP-Phenotype Associations 2025,SNP-phenotype association p-values curated from published GWAS,disease or phenotype associations,phenotype,18130,FALSE +Allen Brain Atlas,Aging Dementia and Traumatic Brain Injury Tissue Sample Gene Expression Profiles,mRNA expression profiles for dementia and traumatic brain injuries (TBI) from aging brain tissue,transcriptomics,tissue sample,1157,FALSE +Sci-Plex,Drug Perturbation Signatures,sciRNA-seq expression profiles for A549 K562 and MCF7 cells treated with 188 
compounds at 4 doses,transcriptomics,drug perturbation,980,FALSE +Kyoto Encyclopedia of Genes and Genomes (KEGG),Pathways 2026,Sets of proteins participating in pathways from KEGG,structural or functional annotations,pathway,22865,FALSE \ No newline at end of file diff --git a/src/use_cases/pharos/TCRD_TODO.md b/src/use_cases/pharos/TCRD_TODO.md index 2eb609c..9b9ae10 100644 --- a/src/use_cases/pharos/TCRD_TODO.md +++ b/src/use_cases/pharos/TCRD_TODO.md @@ -73,7 +73,6 @@ These tables are populated directly from ontology source files during the TCRD b - Phenotype — IMPC, JAX/MGI - GWAS - Protein & Disease Novelty (this might be TINx, I'm not sure) -- Harmonizome - updated version maybe - P-HIPSTer Viral PPIs - Publications — NCBI, JensenLab - NIH Target Lists @@ -101,6 +100,7 @@ These tables are populated directly from ontology source files during the TCRD b - ERAM *(punt for now: public download appears stale/legacy; if we need ERAM coverage, prefer copying or migrating the legacy `eRAM` rows from `pharos319` rather than building a fresh ingest from the public files)* - Expression Atlas *(punt for now: old TCRD used a bulk Atlas export plus custom preprocessing, but current Atlas appears to require per-experiment harvesting from FTP; revisit only as a larger dedicated project, not a quick ingest)* - Monarch as a standalone disease-association source *(do not ingest the current dump as `Monarch`; the public file is a Translator-style aggregate whose primary sources are `infores:omim` and `infores:clingen`)* +- Harmonizome: pharos shows high-level summary stats for different types of data - it's basically a summary of relations in their KG, when we should probably just use summary stats from our own KG ### Findings From Investigation - OMIM is not a legacy Pharos target-disease association source From d6e63156eb04065c4eab3ecfa2cafb479b78ea24 Mon Sep 17 00:00:00 2001 From: kelleherkj Date: Thu, 16 Apr 2026 12:31:28 -0400 Subject: [PATCH 2/2] Add pharos319 DTO
ingest/export path and fix protein symbol mapping for local MySQL build --- designs/dto/investigation.md | 150 ++++++++++++++++++ .../file_uniprot/protein_adapter.py | 2 +- .../pharos_arango/set_preferred_symbol.py | 49 ++++++ src/input_adapters/pharos_arango/tcrd/dto.py | 92 +++++++++++ .../pharos_arango/tcrd/protein.py | 18 +-- .../pharos_mysql/ab_count_adapter.py | 5 +- src/input_adapters/pharos_mysql/base.py | 23 +++ .../pharos_mysql/dto_adapter.py | 89 +++++++++++ .../pharos_mysql/idg_family_adapter.py | 17 +- .../pharos_mysql/ppi_adapter.py | 5 +- .../pharos_mysql/protein_adapter.py | 5 +- src/input_adapters/sql_adapter.py | 8 +- .../target_graph/protein_nodes_and_edges.py | 2 +- src/models/dto_class.py | 26 +++ src/models/protein.py | 2 + src/output_adapters/mysql_output_adapter.py | 22 +++ src/output_adapters/sql_converters/tcrd.py | 37 ++++- src/qa_browser/static/style.css | 24 +++ .../templates/pounce_existing_project.html | 2 +- .../sqlalchemy_tables/pharos_tables_new.py | 13 +- src/shared/uniprot_parser.py | 7 + src/use_cases/pharos/TCRD_TODO.md | 18 ++- src/use_cases/pharos/pharos.yaml | 12 ++ src/use_cases/pharos/pharos_aql_post.yaml | 6 + src/use_cases/pharos/target_graph.yaml | 12 ++ .../pharos/target_graph_aql_post.yaml | 6 + src/use_cases/pharos/tcrd.yaml | 18 +++ src/use_cases/working.yaml | 24 +++ src/use_cases/working_mysql.yaml | 18 +++ 29 files changed, 658 insertions(+), 54 deletions(-) create mode 100644 designs/dto/investigation.md create mode 100644 src/input_adapters/pharos_arango/set_preferred_symbol.py create mode 100644 src/input_adapters/pharos_arango/tcrd/dto.py create mode 100644 src/input_adapters/pharos_mysql/base.py create mode 100644 src/input_adapters/pharos_mysql/dto_adapter.py create mode 100644 src/models/dto_class.py diff --git a/designs/dto/investigation.md b/designs/dto/investigation.md new file mode 100644 index 0000000..8c7f0c3 --- /dev/null +++ b/designs/dto/investigation.md @@ -0,0 +1,150 @@ +# DTO Investigation 
Notes + +Date: 2026-04-15 + +## Goal + +Investigate how Pharos 3.19 handled Drug Target Ontology (DTO) protein classes and decide on an initial IFX_ODIN ingest path. + +## High-Level Conclusion + +Use `pharos319` as the source of truth for the first DTO pass. + +The public DTO GitHub repository appears stale relative to what `pharos319` actually loaded. Legacy Pharos includes explicit dataset provenance showing that the DTO content came from newer Schurer Group handoff files in 2019 and 2020, not just from the older public GitHub release surface. + +## Evidence From `pharos319` + +Legacy schema tables already present: +- `dto` +- `p2dto` +- `ancestry_dto` +- direct protein columns: `protein.dtoid`, `protein.dtoclass` + +Legacy schema refs: +- [pharos_tables_old.py](/Users/kelleherkj/IdeaProjects/IFX_ODIN/src/shared/sqlalchemy_tables/pharos_tables_old.py:604) +- [pharos_tables_old.py](/Users/kelleherkj/IdeaProjects/IFX_ODIN/src/shared/sqlalchemy_tables/pharos_tables_old.py:617) +- [pharos_tables_old.py](/Users/kelleherkj/IdeaProjects/IFX_ODIN/src/shared/sqlalchemy_tables/pharos_tables_old.py:933) + +Observed row counts in `pharos319`: +- `dto`: `17,779` +- `p2dto`: `43,006` +- `ancestry_dto`: `43,376` +- proteins with non-null `dtoid`: `9,232` + +Representative mapped classes: +- `DTO:05007624` `Enzyme` +- `DTO:05007405` `Transporter` +- `DTO:02300001` `G-protein coupled receptor` +- `DTO:03300101` `Kinase` +- `DTO:01300327` `Ion channel` + +## Provenance Found In `pharos319` + +`dataset` rows in `pharos319` show two DTO-related loads: + +1. `Drug Target Ontology IDs and Classifications` + - source: `Files DTO2UniProt_DTOv2.csv, Final_ProteomeClassification_Sep232019.csv from Schurer Group` + - app: `load-DTO_Classifications.py` + - datetime: `2019-10-17 17:49:08` + +2. 
`Drug Target Ontology` + - source: `File ../data/UMiami/dto_proteome_classification_only.owl from Schurer Group at UMiami` + - app: `load-DTO.py` + - app_version: `3.0.0` + - datetime: `2020-01-20 15:31:48` + +`provenance` confirms `dataset_id=89` for table `dto`. + +Interpretation: +- `pharos319` uses DTO content newer than the obvious public GitHub release metadata +- first-pass IFX_ODIN ingest should therefore prefer `pharos319` over the public DTO repo + +## Public DTO Repo Status + +Public repo: +- [DrugTargetOntology/DTO](https://github.com/DrugTargetOntology/DTO) + +Observed issue: +- the repo/release surface looks old +- latest GitHub release shown there is `Drug Target Ontology V1.1` from 2017-12-06 + +That is older than the 2019/2020 provenance in `pharos319`. + +## First-Pass IFX_ODIN Modeling Choice + +Initial graph model added: +- `DTOClass` +- `DTOClassParentEdge` +- `ProteinDTOClassEdge` + +This mirrors the existing Pharos/TCRD DTO tables directly enough for a first pass: +- `dto` -> `DTOClass` +- `dto.parent_id` -> `DTOClassParentEdge` +- `p2dto` -> `ProteinDTOClassEdge` + +For now, the direct `protein.dtoid` / `protein.dtoclass` legacy columns are treated as derived/denormalized legacy fields rather than the primary ingest path. 
+ +## Implementation Notes + +Added reusable source base: +- [base.py](/Users/kelleherkj/IdeaProjects/IFX_ODIN/src/input_adapters/pharos_mysql/base.py) + +Added DTO adapters: +- [dto_adapter.py](/Users/kelleherkj/IdeaProjects/IFX_ODIN/src/input_adapters/pharos_mysql/dto_adapter.py) + +Added DTO graph model: +- [dto_class.py](/Users/kelleherkj/IdeaProjects/IFX_ODIN/src/models/dto_class.py) + +Wired into: +- [working.yaml](/Users/kelleherkj/IdeaProjects/IFX_ODIN/src/use_cases/working.yaml) + +`working.yaml` now: +- writes to local ArangoDB +- writes dataset artifacts to local MinIO +- pulls DTO from `pharos_credentials.yaml` + +## Recommendation + +For the first DTO pass: +- use `pharos319` DTO content directly +- validate graph shape and counts in `working.yaml` +- defer any attempt to replace this with a public DTO OWL ingest until there is a concrete reason to do so + +## MySQL Conversion Decision + +For the graph-to-MySQL path, keep DTO normalized instead of reproducing the old ancestor-expanded `p2dto` behavior: +- `dto`: ontology terms +- `dto_parent`: direct DTO parent edges +- `p2dto`: direct protein-to-DTO assignments only +- `ancestry_dto`: transitive closure derived in MySQL post-processing + +Legacy compatibility fields still belong on `protein`: +- `protein.dtoid` +- `protein.dtoclass` + +That keeps the graph and MySQL output aligned: +- direct assignments stay direct +- the tree is reconstructed through `dto_parent` and `ancestry_dto` +- downstream GraphQL/UI code can derive the displayed DTO lineage without needing ancestor-expanded `p2dto` +- the legacy `p2dto.generation` column is intentionally dropped in the new schema because it is always implicit in this normalized model + +If revisited later: +- compare `pharos319.dto` against current public DTO OWL files +- decide whether the public ontology has caught up or whether DTO still requires a handoff/source outside the repo + +## Local Validation Outcome + +Validated end to end with: +- local Arango 
graph build +- Pharos/target-graph post-processing +- local MySQL export via `working_mysql.yaml` + +Observed outcomes: +- `DTOClass`, `DTOClassParentEdge`, and direct `ProteinDTOClassEdge` loaded into the graph +- `dto`, `dto_parent`, `p2dto`, and `ancestry_dto` populated in local MySQL +- `dto.parent_id` was correctly repopulated from `dto_parent` +- `p2dto` contains direct assignments only +- `ProteinDTOClassEdge` no longer carries a legacy `generation` field +- `p2dto.generation` is intentionally removed from the new MySQL schema + +This confirms the normalized DTO path works without recreating the legacy ancestor-expanded `p2dto` layout. diff --git a/src/input_adapters/file_uniprot/protein_adapter.py b/src/input_adapters/file_uniprot/protein_adapter.py index 79760fb..1469844 100755 --- a/src/input_adapters/file_uniprot/protein_adapter.py +++ b/src/input_adapters/file_uniprot/protein_adapter.py @@ -55,7 +55,7 @@ def get_all(self) -> Generator[List[Union[Node, Relationship]], None, None]: sequence=UniProtParser.get_sequence(row), secondary_uniprot_ids=UniProtParser.get_secondary_accessions(row), gene_name=UniProtParser.get_gene_name(row), - symbol=';'.join(UniProtParser.get_symbols(row)) if UniProtParser.get_symbols(row) else None, + symbol=UniProtParser.get_primary_symbol(row), name=UniProtParser.get_full_name(row) ) proteins.append(protein) diff --git a/src/input_adapters/pharos_arango/set_preferred_symbol.py b/src/input_adapters/pharos_arango/set_preferred_symbol.py new file mode 100644 index 0000000..3c62bb3 --- /dev/null +++ b/src/input_adapters/pharos_arango/set_preferred_symbol.py @@ -0,0 +1,49 @@ +from typing import Generator, List + +from src.constants import DataSourceName +from src.interfaces.input_adapter import InputAdapter +from src.models.datasource_version_info import DatasourceVersionInfo +from src.models.protein import Protein +from src.shared.arango_adapter import ArangoAdapter + + +class SetPreferredSymbolAdapter(InputAdapter, ArangoAdapter): 
+ + def get_datasource_name(self) -> DataSourceName: + return DataSourceName.PostProcessing + + def get_version(self) -> DatasourceVersionInfo: + return DatasourceVersionInfo() + + def get_all(self) -> Generator[List[Protein], None, None]: + rows = self.runQuery(preferred_symbol_query) + yield [ + Protein(id=row["id"], preferred_symbol=row["preferred_symbol"]) + for row in rows + if row.get("preferred_symbol") + ] + + +preferred_symbol_query = """ +LET symbol_counts = ( + FOR p IN Protein + FILTER p.symbol != null AND p.symbol != "" + COLLECT symbol = p.symbol WITH COUNT INTO count + RETURN {symbol, count} +) +FOR p IN Protein + LET symbol_count = FIRST( + FOR sc IN symbol_counts + FILTER sc.symbol == p.symbol + RETURN sc.count + ) + LET preferred_symbol = ( + p.symbol != null AND p.symbol != "" AND symbol_count == 1 + ? p.symbol + : p.uniprot_id + ) + RETURN { + id: p.id, + preferred_symbol: preferred_symbol + } +""" diff --git a/src/input_adapters/pharos_arango/tcrd/dto.py b/src/input_adapters/pharos_arango/tcrd/dto.py new file mode 100644 index 0000000..25447ea --- /dev/null +++ b/src/input_adapters/pharos_arango/tcrd/dto.py @@ -0,0 +1,92 @@ +from typing import Generator, List + +from src.input_adapters.pharos_arango.tcrd.protein import PharosArangoAdapter +from src.models.datasource_version_info import DataSourceDetails +from src.models.dto_class import DTOClass, DTOClassParentEdge, ProteinDTOClassEdge +from src.models.protein import Protein + + +def dto_class_query() -> str: + return """FOR d IN `DTOClass` RETURN d""" + + +def dto_class_parent_query() -> str: + return """FOR rel IN `DTOClassParentEdge` RETURN rel""" + + +def protein_dto_class_query(last_key: str = None, limit: int = 10000) -> str: + filter_clause = f'FILTER rel._key > "{last_key}"' if last_key else "" + return f""" + FOR rel IN `ProteinDTOClassEdge` + {filter_clause} + SORT rel._key + LIMIT {limit} + RETURN rel + """ + + +def dto_version_query() -> str: + return """FOR d IN `DTOClass` LIMIT 1 
RETURN d.creation""" + + +class DTOClassAdapter(PharosArangoAdapter): + def get_all(self) -> Generator[List[DTOClass], None, None]: + rows = [ + DTOClass( + id=row["id"], + source_id=row.get("source_id"), + name=row.get("name"), + description=row.get("description"), + provenance=row.get("provenance"), + sources=row.get("sources") or [], + ) + for row in self.runQuery(dto_class_query()) + ] + yield rows + + def get_version_info_query(self) -> DataSourceDetails: + raw_version_info = self.runQuery(dto_version_query())[0] + return DataSourceDetails.parse_tsv(raw_version_info) + + +class DTOClassParentAdapter(PharosArangoAdapter): + def get_all(self) -> Generator[List[DTOClassParentEdge], None, None]: + yield [ + DTOClassParentEdge( + start_node=DTOClass(id=row["start_id"]), + end_node=DTOClass(id=row["end_id"]), + provenance=row.get("provenance"), + sources=row.get("sources") or [], + ) + for row in self.runQuery(dto_class_parent_query()) + ] + + def get_version_info_query(self) -> DataSourceDetails: + raw_version_info = self.runQuery(dto_version_query())[0] + return DataSourceDetails.parse_tsv(raw_version_info) + + +class ProteinDTOClassAdapter(PharosArangoAdapter): + batch_size = 10_000 + + def get_all(self) -> Generator[List[ProteinDTOClassEdge], None, None]: + last_key = None + while True: + rows = list(self.runQuery(protein_dto_class_query(last_key=last_key, limit=self.batch_size))) + if not rows: + break + + yield [ + ProteinDTOClassEdge( + start_node=Protein(id=row["start_id"]), + end_node=DTOClass(id=row["end_id"]), + provenance=row.get("provenance"), + sources=row.get("sources") or [], + ) + for row in rows + ] + last_key = rows[-1]["_key"] + + def get_version_info_query(self) -> DataSourceDetails: + raw_version_info = self.runQuery(dto_version_query())[0] + return DataSourceDetails.parse_tsv(raw_version_info) diff --git a/src/input_adapters/pharos_arango/tcrd/protein.py b/src/input_adapters/pharos_arango/tcrd/protein.py index d7aba70..b20501e 100644 --- 
a/src/input_adapters/pharos_arango/tcrd/protein.py +++ b/src/input_adapters/pharos_arango/tcrd/protein.py @@ -54,20 +54,4 @@ def get_version_info_query(self) -> DataSourceDetails: return DataSourceDetails.parse_tsv(raw_version_info) def get_all(self) -> Generator[List[Node], None, None]: - proteins = self.runQuery(protein_query()) - symbol_count = {} - for p in proteins: - symbol = p.get('symbol', None) - if symbol is not None: - if symbol not in symbol_count: - symbol_count[symbol] = 0 - symbol_count[symbol] += 1 - - for p in proteins: - symbol = p.get('symbol', None) - if symbol is not None and symbol_count[symbol] == 1: - p['preferred_symbol'] = symbol - else: - p['preferred_symbol'] = p['uniprot_id'] - - yield [Protein.from_dict(row) for row in proteins] + yield [Protein.from_dict(row) for row in self.runQuery(protein_query())] diff --git a/src/input_adapters/pharos_mysql/ab_count_adapter.py b/src/input_adapters/pharos_mysql/ab_count_adapter.py index 0e9529e..3d13236 100755 --- a/src/input_adapters/pharos_mysql/ab_count_adapter.py +++ b/src/input_adapters/pharos_mysql/ab_count_adapter.py @@ -1,12 +1,11 @@ from abc import ABC, abstractmethod from src.constants import Prefix -from src.input_adapters.sql_adapter import MySqlAdapter +from src.input_adapters.pharos_mysql.base import Pharos319Adapter from src.shared.sqlalchemy_tables.pharos_tables_old import Protein as mysql_Protein, TDL_info as mysql_tdl_info -from src.interfaces.input_adapter import InputAdapter from src.models.protein import Protein -class TdlTableAdapter(InputAdapter, MySqlAdapter, ABC): +class TdlTableAdapter(Pharos319Adapter, ABC): column = None field = None itype = None diff --git a/src/input_adapters/pharos_mysql/base.py b/src/input_adapters/pharos_mysql/base.py new file mode 100644 index 0000000..849a2a6 --- /dev/null +++ b/src/input_adapters/pharos_mysql/base.py @@ -0,0 +1,23 @@ +from datetime import date + +from src.constants import DataSourceName +from src.input_adapters.sql_adapter 
import MySqlAdapter +from src.interfaces.input_adapter import InputAdapter +from src.models.datasource_version_info import DatasourceVersionInfo +from src.shared.db_credentials import DBCredentials + + +class Pharos319Adapter(InputAdapter, MySqlAdapter): + version = DatasourceVersionInfo( + version="3.19", + version_date=date.fromisoformat("2024-02-15"), + ) + + def __init__(self, credentials: DBCredentials): + super().__init__(credentials) + + def get_datasource_name(self) -> DataSourceName: + return DataSourceName.OldPharos + + def get_version(self) -> DatasourceVersionInfo: + return self.version diff --git a/src/input_adapters/pharos_mysql/dto_adapter.py b/src/input_adapters/pharos_mysql/dto_adapter.py new file mode 100644 index 0000000..efcaf83 --- /dev/null +++ b/src/input_adapters/pharos_mysql/dto_adapter.py @@ -0,0 +1,89 @@ +from typing import Generator, List + +from src.constants import Prefix +from src.input_adapters.pharos_mysql.base import Pharos319Adapter +from src.models.dto_class import DTOClass, DTOClassParentEdge, ProteinDTOClassEdge +from src.models.node import EquivalentId +from src.models.protein import Protein +from src.shared.sqlalchemy_tables.pharos_tables_old import ( + DTO as mysql_DTO, + P2DTO as mysql_P2DTO, + Protein as mysql_Protein, +) + + +def _dto_node_id(dtoid: str) -> str: + return dtoid.replace("_", ":") + + +class DTOClassAdapter(Pharos319Adapter): + def get_all(self) -> Generator[List[DTOClass], None, None]: + rows = self.get_session().query( + mysql_DTO.dtoid, + mysql_DTO.name, + mysql_DTO.def_, + ) + + yield [ + DTOClass( + id=_dto_node_id(row[0]), + source_id=row[0], + name=row[1], + description=row[2], + ) + for row in rows + ] + + +class DTOClassParentEdgeAdapter(Pharos319Adapter): + def get_all(self) -> Generator[List[DTOClassParentEdge], None, None]: + rows = ( + self.get_session().query( + mysql_DTO.dtoid, + mysql_DTO.parent_id, + ) + .filter(mysql_DTO.parent_id.is_not(None)) + ) + + yield [ + DTOClassParentEdge( + 
start_node=DTOClass(id=_dto_node_id(row[0])), + end_node=DTOClass(id=_dto_node_id(row[1])), + ) + for row in rows + ] + + +class ProteinDTOClassAdapter(Pharos319Adapter): + def get_all(self) -> Generator[List[Protein | ProteinDTOClassEdge], None, None]: + rows = ( + self.get_session().query( + mysql_Protein.uniprot, + mysql_P2DTO.dtoid, + mysql_P2DTO.generation, + mysql_DTO.name, + ) + .join(mysql_Protein, mysql_Protein.id == mysql_P2DTO.protein_id) + .join(mysql_DTO, mysql_DTO.dtoid == mysql_P2DTO.dtoid) + .filter(mysql_P2DTO.generation == 0) + ) + + objects: List[Protein | ProteinDTOClassEdge] = [] + for row in rows: + protein_id = EquivalentId(id=row[0], type=Prefix.UniProtKB).id_str() + dtoid = _dto_node_id(row[1]) + objects.append( + Protein( + id=protein_id, + dtoid=dtoid, + dtoclass=row[3], + ) + ) + objects.append( + ProteinDTOClassEdge( + start_node=Protein(id=protein_id), + end_node=DTOClass(id=dtoid), + ) + ) + + yield objects diff --git a/src/input_adapters/pharos_mysql/idg_family_adapter.py b/src/input_adapters/pharos_mysql/idg_family_adapter.py index 95d49f2..8dfb9b8 100644 --- a/src/input_adapters/pharos_mysql/idg_family_adapter.py +++ b/src/input_adapters/pharos_mysql/idg_family_adapter.py @@ -1,25 +1,14 @@ -from datetime import datetime from typing import List, Generator -from src.constants import Prefix, DataSourceName -from src.input_adapters.sql_adapter import MySqlAdapter +from src.constants import Prefix +from src.input_adapters.pharos_mysql.base import Pharos319Adapter from src.shared.sqlalchemy_tables.pharos_tables_old import Protein as mysql_Protein, Target as mysql_Target, T2TC as mysql_t2tc -from src.interfaces.input_adapter import InputAdapter -from src.models.datasource_version_info import DatasourceVersionInfo from src.models.node import EquivalentId from src.models.protein import Protein, IDGFamily -class IDGFamilyAdapter(InputAdapter, MySqlAdapter): +class IDGFamilyAdapter(Pharos319Adapter): batch_size: int = 1000 - def 
get_datasource_name(self) -> DataSourceName: - return DataSourceName.OldPharos - - def get_version(self) -> DatasourceVersionInfo: - return DatasourceVersionInfo( - version="3.19", - version_date=datetime.fromisoformat("2024-02-15"), - ) def get_all(self) -> Generator[List[Protein], None, None]: results = (self.get_session().query( diff --git a/src/input_adapters/pharos_mysql/ppi_adapter.py b/src/input_adapters/pharos_mysql/ppi_adapter.py index 920dd51..8da92ee 100755 --- a/src/input_adapters/pharos_mysql/ppi_adapter.py +++ b/src/input_adapters/pharos_mysql/ppi_adapter.py @@ -4,15 +4,14 @@ from sqlalchemy.orm import aliased from src.constants import Prefix -from src.input_adapters.sql_adapter import MySqlAdapter -from src.interfaces.input_adapter import InputAdapter +from src.input_adapters.pharos_mysql.base import Pharos319Adapter from src.models.node import Relationship from src.shared.sqlalchemy_tables.pharos_tables_old import Protein as mysql_Protein, PPI as mysql_ppi from src.models.ppi import PPIEdge from src.models.protein import Protein -class ProteinProteinInteractionAdapter(InputAdapter, MySqlAdapter): +class ProteinProteinInteractionAdapter(Pharos319Adapter): def get_all(self) -> List[Relationship]: protein_alias1 = aliased(mysql_Protein) diff --git a/src/input_adapters/pharos_mysql/protein_adapter.py b/src/input_adapters/pharos_mysql/protein_adapter.py index c850e09..de0df92 100755 --- a/src/input_adapters/pharos_mysql/protein_adapter.py +++ b/src/input_adapters/pharos_mysql/protein_adapter.py @@ -1,12 +1,11 @@ from typing import List from src.constants import Prefix -from src.input_adapters.sql_adapter import MySqlAdapter +from src.input_adapters.pharos_mysql.base import Pharos319Adapter from src.shared.sqlalchemy_tables.pharos_tables_old import Protein as mysql_Protein, Target as mysql_Target, T2TC as mysql_t2tc -from src.interfaces.input_adapter import InputAdapter from src.models.protein import Protein, IDGFamily -class ProteinAdapter(InputAdapter, 
MySqlAdapter): +class ProteinAdapter(Pharos319Adapter): def get_all(self): results = (self.get_session().query( mysql_Protein.name, diff --git a/src/input_adapters/sql_adapter.py b/src/input_adapters/sql_adapter.py index b0c1bf2..a37f137 100644 --- a/src/input_adapters/sql_adapter.py +++ b/src/input_adapters/sql_adapter.py @@ -1,5 +1,6 @@ from sqlalchemy import create_engine, text from sqlalchemy.orm import sessionmaker, Session +from dataclasses import replace from src.shared.db_credentials import DBCredentials @@ -59,7 +60,11 @@ def __init__(self, credentials: DBCredentials): super().__init__(credentials, dialect="mysql+pymysql") def recreate_mysql_db(self, db_name, truncate_tables = True): - engine = self.get_engine() + server_credentials = replace(self.credentials, schema=None) + engine = create_engine( + HostedSqlAdapter(server_credentials, dialect="mysql+pymysql").get_connection_string(), + pool_pre_ping=True + ) with engine.connect() as conn: if truncate_tables: @@ -69,6 +74,7 @@ def recreate_mysql_db(self, db_name, truncate_tables = True): print(f"Created empty MySQL database: {db_name}") else: print(f"Ensured MySQL database exists: {db_name}") + engine.dispose() self.update_database(db_name) diff --git a/src/input_adapters/target_graph/protein_nodes_and_edges.py b/src/input_adapters/target_graph/protein_nodes_and_edges.py index 95620f0..98afc94 100644 --- a/src/input_adapters/target_graph/protein_nodes_and_edges.py +++ b/src/input_adapters/target_graph/protein_nodes_and_edges.py @@ -20,7 +20,7 @@ def build_protein_obj(line): protein_obj.created = TargetGraphProteinParser.get_creation_date(line) protein_obj.updated = TargetGraphProteinParser.get_updated_time(line) protein_obj.name = TargetGraphProteinParser.get_name(line) - protein_obj.symbol = TargetGraphProteinParser.get_symbol(line) + # protein_obj.symbol = TargetGraphProteinParser.get_symbol(line) # don't use this, it's sometimes a multi-valued symbol protein_obj.ensembl_id = 
TargetGraphProteinParser.get_ensembl_id(line) protein_obj.refseq_id = TargetGraphProteinParser.get_refseq_id(line) diff --git a/src/models/dto_class.py b/src/models/dto_class.py new file mode 100644 index 0000000..6fe1a21 --- /dev/null +++ b/src/models/dto_class.py @@ -0,0 +1,26 @@ +from dataclasses import dataclass +from typing import Optional + +from src.core.decorators import search +from src.models.node import Node, Relationship +from src.models.protein import Protein + + +@dataclass +@search(text_fields=["name", "description"]) +class DTOClass(Node): + source_id: Optional[str] = None + name: Optional[str] = None + description: Optional[str] = None + + +@dataclass +class DTOClassParentEdge(Relationship): + start_node: DTOClass + end_node: DTOClass + + +@dataclass +class ProteinDTOClassEdge(Relationship): + start_node: Protein + end_node: DTOClass diff --git a/src/models/protein.py b/src/models/protein.py index 5158962..20894ba 100644 --- a/src/models/protein.py +++ b/src/models/protein.py @@ -117,6 +117,8 @@ class Protein(Audited, Analyte): protein_name_method: Optional[str] = None uniprot_isoform: Optional[str] = None calculated_properties: Optional[Dict[str, float]] = None + dtoid: Optional[str] = None + dtoclass: Optional[str] = None @dataclass diff --git a/src/output_adapters/mysql_output_adapter.py b/src/output_adapters/mysql_output_adapter.py index 209ef9b..9f22f11 100644 --- a/src/output_adapters/mysql_output_adapter.py +++ b/src/output_adapters/mysql_output_adapter.py @@ -16,11 +16,14 @@ from src.shared.db_credentials import DBCredentials from src.shared.sqlalchemy_tables.pharos_tables_new import ( AncestryDO, + AncestryDTO, AncestryMONDO, AncestryUBERON, DataSourceVersion, DO, DOParent, + DTO, + DTOParent, ETLRun, Mondo, MondoParent, @@ -354,11 +357,22 @@ def _populate_ancestry_table(self, session, node_cls, parent_cls, ancestry_cls, [{"oid": oid, "ancestor_id": ancestor_id} for oid, ancestor_id in closure], ) + @staticmethod + def 
_populate_dto_parent_column(session): + session.query(DTO).update({DTO.parent_id: None}) + direct_edges = session.query(DTOParent.dtoid, DTOParent.parent_id).all() + if direct_edges: + session.bulk_update_mappings( + DTO, + [{"dtoid": dtoid, "parent_id": parent_id} for dtoid, parent_id in direct_edges], + ) + def do_post_processing(self, clean_edges: bool = True) -> None: session = self.get_session() try: self._populate_data_source_version_table(session) self._populate_etl_run_table(session) + self._populate_dto_parent_column(session) self._populate_ancestry_table( session=session, node_cls=DO, @@ -367,6 +381,14 @@ def do_post_processing(self, clean_edges: bool = True) -> None: node_key="doid", parent_key="parent_id", ) + self._populate_ancestry_table( + session=session, + node_cls=DTO, + parent_cls=DTOParent, + ancestry_cls=AncestryDTO, + node_key="dtoid", + parent_key="parent_id", + ) self._populate_ancestry_table( session=session, node_cls=Mondo, diff --git a/src/output_adapters/sql_converters/tcrd.py b/src/output_adapters/sql_converters/tcrd.py index d306750..43f6839 100755 --- a/src/output_adapters/sql_converters/tcrd.py +++ b/src/output_adapters/sql_converters/tcrd.py @@ -2,6 +2,7 @@ from typing import Union, List, Optional from src.constants import Prefix from src.models.disease import Disease, DiseaseParentEdge, DODiseaseParentEdge, ProteinDiseaseEdge +from src.models.dto_class import DTOClass, DTOClassParentEdge, ProteinDTOClassEdge from src.models.expression import ProteinTissueExpressionEdge from src.models.generif import GeneGeneRifEdge from src.models.go_term import GoType, GoTerm, GoTermHasParent, ProteinGoTermEdge @@ -19,6 +20,7 @@ Uberon, UberonParent, Tissue as mysqlTissue, Expression, Gtex, Mondo, MondoParent, MondoXref, Disease as mysqlDisease, DiseaseType, DO, DOParent, NcatsDisease, NcatsD2DA, Pathway as mysqlPathway, PantherClass as mysqlPantherClass, P2PC, + DTO as mysqlDTO, DTOParent, P2DTO, ) from 
src.output_adapters.sql_converters.output_converter_base import SQLOutputConverter from src.shared.sqlalchemy_tables.pharos_tables_new import Base as TCRDBase @@ -65,6 +67,10 @@ def __init__(self): # Panther PantherClass: [self.panther_class_converter], ProteinPantherClassEdge: [self.p2pc_converter], + # DTO + DTOClass: [self.dto_converter], + DTOClassParentEdge: [self.dto_parent_converter], + ProteinDTOClassEdge: [self.p2dto_converter], # Keyword ProteinKeywordEdge: [self.keyword_xref_converter], } @@ -100,14 +106,17 @@ def protein_converter(self, obj: dict) -> mysqlProtein: return mysqlProtein( id=self.resolve_id('protein', obj['id']), ifx_id=obj['id'], - description=obj['name'], + name=obj.get('gene_name'), + description=obj.get('description'), uniprot=obj['uniprot_id'], - sym=obj['symbol'], + sym=obj.get('symbol'), geneid=gene_id, stringid=string_id, seq=obj['sequence'], + dtoid=(obj.get('dtoid') or '').replace(':', '_') or None, + dtoclass=obj.get('dtoclass'), provenance=obj['provenance'], - preferred_symbol=obj['preferred_symbol'] + preferred_symbol=obj.get('preferred_symbol') ) def tdl_info_converter(self, obj: dict) -> List[TDL_info]: @@ -620,6 +629,28 @@ def p2pc_converter(self, obj: dict) -> P2PC: protein_id=self.resolve_id('protein', obj['start_id']), ) + # --- DTO --- + + def dto_converter(self, obj: dict) -> mysqlDTO: + return mysqlDTO( + dtoid=obj['id'], + name=obj.get('name') or '', + parent_id=None, + def_=obj.get('description'), + ) + + def dto_parent_converter(self, obj: dict) -> DTOParent: + return DTOParent( + dtoid=obj['start_id'], + parent_id=obj['end_id'], + ) + + def p2dto_converter(self, obj: dict) -> P2DTO: + return P2DTO( + dtoid=obj['end_id'], + protein_id=self.resolve_id('protein', obj['start_id']), + ) + # --- Keyword --- def keyword_xref_converter(self, obj: dict) -> Xref: diff --git a/src/qa_browser/static/style.css b/src/qa_browser/static/style.css index 020a72e..e57e6b7 100644 --- a/src/qa_browser/static/style.css +++ 
b/src/qa_browser/static/style.css @@ -744,6 +744,30 @@ td.mono { font-family: var(--mono); font-size: 12px; } font-size: 12px; } +.existing-project-load-btn { + background: color-mix(in srgb, var(--accent) 92%, black 8%); + color: #fff; + min-width: 5.25rem; + padding: 0.55rem 1rem; + font-size: 13px; + font-weight: 700; + box-shadow: 0 1px 0 rgba(0, 0, 0, 0.12), 0 0 0 1px color-mix(in srgb, var(--accent) 65%, transparent); + transition: transform 0.12s ease, box-shadow 0.12s ease, opacity 0.15s ease; +} + +.existing-project-load-btn:hover { + opacity: 1; + transform: translateY(-1px); + box-shadow: 0 4px 10px color-mix(in srgb, var(--accent) 22%, transparent), + 0 0 0 1px color-mix(in srgb, var(--accent) 75%, transparent); +} + +.existing-project-load-btn:focus-visible { + outline: none; + box-shadow: 0 0 0 3px var(--accent-light), + 0 0 0 1px color-mix(in srgb, var(--accent) 75%, transparent); +} + /* Mermaid */ .mermaid { background: var(--bg-alt); diff --git a/src/qa_browser/templates/pounce_existing_project.html b/src/qa_browser/templates/pounce_existing_project.html index d387245..006766b 100644 --- a/src/qa_browser/templates/pounce_existing_project.html +++ b/src/qa_browser/templates/pounce_existing_project.html @@ -76,7 +76,7 @@

Edit Existing Project

- +
diff --git a/src/shared/sqlalchemy_tables/pharos_tables_new.py b/src/shared/sqlalchemy_tables/pharos_tables_new.py index 7f1816c..37a1b23 100644 --- a/src/shared/sqlalchemy_tables/pharos_tables_new.py +++ b/src/shared/sqlalchemy_tables/pharos_tables_new.py @@ -613,12 +613,23 @@ class DTO(Base): ) +class DTOParent(Base): + __tablename__ = "dto_parent" + + dtoid = Column(String(255), ForeignKey("dto.dtoid"), primary_key=True, nullable=False) + parent_id = Column(String(255), ForeignKey("dto.dtoid"), primary_key=True, nullable=False) + + __table_args__ = ( + Index("dto_parent_idx1", "dtoid"), + Index("dto_parent_idx2", "parent_id"), + ) + + class P2DTO(Base): __tablename__ = "p2dto" dtoid = Column(String(255), ForeignKey("dto.dtoid"), primary_key=True, nullable=False) protein_id = Column(Integer, ForeignKey("protein.id"), primary_key=True, nullable=False) - generation = Column(Integer, nullable=False) __table_args__ = ( Index("p2dto_dtoid_foreign", "dtoid"), diff --git a/src/shared/uniprot_parser.py b/src/shared/uniprot_parser.py index a00388d..bdae907 100755 --- a/src/shared/uniprot_parser.py +++ b/src/shared/uniprot_parser.py @@ -112,6 +112,13 @@ def get_symbols(uniprot_obj): return None return symbols + @staticmethod + def get_primary_symbol(uniprot_obj): + symbols = UniProtParser.get_symbols(uniprot_obj) + if not symbols: + return None + return symbols[0] + @staticmethod def get_sequence(uniprot_obj): return uniprot_obj['sequence']['value'] diff --git a/src/use_cases/pharos/TCRD_TODO.md b/src/use_cases/pharos/TCRD_TODO.md index 9b9ae10..27ecd57 100644 --- a/src/use_cases/pharos/TCRD_TODO.md +++ b/src/use_cases/pharos/TCRD_TODO.md @@ -35,17 +35,21 @@ Each row is a protein-facing Pharos/TCRD concept. Data source checkboxes = inges | **Ligand** | [x] IUPHAR
[x] ChEMBL
[x] DrugCentral | `Ligand` | [x] `ncats_ligands` | | **ProteinLigandEdge** | [x] IUPHAR
[x] ChEMBL
[x] DrugCentral | `ProteinLigandEdge` | [x] `ncats_ligand_activity` | | **Disease** | [x] MONDO
[x] Disease Ontology
[x] UniProt curated
[x] CTD
[x] JensenLab DISEASES *(promoted in `pharos.yaml` / `target_graph.yaml`)*
[x] DrugCentral Indication | `Disease` | [x] `ncats_disease` | -| **DiseaseParentEdge** | [x] MONDO | `DiseaseParentEdge` | [x] `mondo_parent`
[x] `ancestry_mondo` | -| **DODiseaseParentEdge** | [x] Disease Ontology | `DODiseaseParentEdge` | [x] `do_parent`
[x] `ancestry_do` | +| **DiseaseParentEdge** | [x] MONDO | `DiseaseParentEdge` | not exported from merged graph; source-file MONDO tables populate `mondo_parent` / `ancestry_mondo` below | +| **DODiseaseParentEdge** | [x] Disease Ontology | `DODiseaseParentEdge` | not exported from merged graph; source-file DO tables populate `do_parent` / `ancestry_do` below | | **ProteinDiseaseEdge** | [x] UniProt curated
[x] CTD *(side-lifted from gene associations by the TCRD target resolver)*
[x] JensenLab DISEASES *(Knowledge, Experiment/TIGA, and Text Mining; promoted in `pharos.yaml` / `target_graph.yaml`; working/full configs apply `textmining_min_zscore: 6.0` to stay close to historical Pharos text-mining scope)*
[x] DrugCentral Indication | `ProteinDiseaseEdge` | [x] `disease_type`
[x] `disease`
[x] `ncats_d2da` | -| **Pathway** | [x] UniProt
[x] Reactome
[x] WikiPathways
[x] PathwayCommons | `Pathway` | [x] `pathway` | +| **Pathway** | [x] UniProt
[x] Reactome
[x] WikiPathways
[x] PathwayCommons | `Pathway` | no standalone TCRD table; pathway content is duplicated via `ProteinPathwayEdge` into `pathway` | | **PathwayParentEdge** | [x] Reactome | `PathwayParentEdge` | not exported to legacy TCRD MySQL | | **ProteinPathwayEdge** | [x] UniProt
[x] Reactome
[x] WikiPathways *(side-lifted from gene associations by the TCRD target resolver)*
[x] PathwayCommons *(side-lifted from gene associations by the TCRD target resolver)* | `ProteinPathwayEdge` | [x] `pathway` | | **PantherClass** | [x] PANTHER Classes *(promoted in `pharos.yaml` / `target_graph.yaml`)* | `PantherClass` | [x] `panther_class` *(via `tcrd.yaml`; validated in `working_mysql.yaml` first)* | | **ProteinPantherClassEdge** | [x] PANTHER Classes *(promoted in `pharos.yaml` / `target_graph.yaml`)* | `ProteinPantherClassEdge` | [x] `p2pc` *(via `tcrd.yaml`; validated in `working_mysql.yaml` first)* | -| **Keyword** | [x] UniProt | `Keyword` | [x] `xref` *(UniProt Keyword xtype)* | +| **DTOClass** | [x] old Pharos MySQL | `DTOClass` | current converter supports `dto`, but DTO is not wired in active `tcrd.yaml` | +| **DTOClassParentEdge** | [x] old Pharos MySQL | `DTOClassParentEdge` | current converter supports `dto_parent`, but DTO is not wired in active `tcrd.yaml` | +| **ProteinDTOClassEdge** | [x] old Pharos MySQL | `ProteinDTOClassEdge` | current converter supports `p2dto`, but DTO is not wired in active `tcrd.yaml` | +| **Keyword** | [x] UniProt | `Keyword` | no standalone TCRD table; keyword content is duplicated via `ProteinKeywordEdge` into `xref` | | **ProteinKeywordEdge** | [x] UniProt | `ProteinKeywordEdge` | [x] `xref` *(UniProt Keyword xtype)* | | | *— post-processing (pharos_aql_post.yaml) —* | | | +| **SetPreferredSymbolAdapter** | [x] computed from graph | updates `preferred_symbol` on `Protein` | *(via Protein → `protein.preferred_symbol`)* | | **SetLigandActivityFlagAdapter** | [x] computed from graph | updates `meets_idg_cutoff` on `ProteinLigandEdge` | *(via ProteinLigandEdge)* | | **SetGoTermLeafFlagAdapter** | [x] computed from graph | updates `is_leaf` on `GoTerm` | *(via GoTerm)* | | **TDLInputAdapter** | [x] computed from graph | updates `tdl`, `tdl_meta` on `Protein` | *(via Protein)* | @@ -57,7 +61,7 @@ These tables are populated directly from ontology source files during the TCRD b | Source Concept | Source Files 
| TCRD Tables | |---------|------------------------|-------------| -| **MONDO ontology** | [x] `input_files/auto/mondo/mondo.json` | [x] `mondo`
[x] `mondo_parent`
[x] `ancestry_mondo` *(post-processing from `mondo_parent`)* | +| **MONDO ontology** | [x] `input_files/auto/mondo/mondo.json` | [x] `mondo`
[x] `mondo_xref`
[x] `mondo_parent`
[x] `ancestry_mondo` *(post-processing from `mondo_parent`)* | | **Disease Ontology** | [x] `input_files/auto/disease_ontology/doid.json` | [x] `do`
[x] `do_parent`
[x] `ancestry_do` *(post-processing from `do_parent`)* | --- @@ -69,7 +73,6 @@ These tables are populated directly from ontology source files during the TCRD b ### New Concepts - Protein-Protein Interactions — STRING, BioPlex, Reactome PPI - Orthologs — OMA, EggNOG, Inparanoid -- Protein Classes - DTO - Phenotype — IMPC, JAX/MGI - GWAS - Protein & Disease Novelty (this might be TINx, I'm not sure) @@ -80,6 +83,9 @@ These tables are populated directly from ontology source files during the TCRD b - Nearest Tclin (computed from graph) - Publication Statistics (PubMed Score, PubTator) +### Refactoring / Polish +- Normalize old `pharos_mysql` adapters to use `EquivalentId(...).id_str()` consistently instead of manual `f"{Prefix...}:{...}"` string construction where they emit graph IDs. + ### Simple Linkouts - Dark Kinase Knowledgebase — understudied kinases compendium - RESOLUTE — solute carrier (SLC) target class resource diff --git a/src/use_cases/pharos/pharos.yaml b/src/use_cases/pharos/pharos.yaml index cf792d2..7159259 100644 --- a/src/use_cases/pharos/pharos.yaml +++ b/src/use_cases/pharos/pharos.yaml @@ -337,6 +337,18 @@ input_adapters: sequence_classification_file_path: ./input_files/auto/panther/PTHR19.0_human version_file_path: ./input_files/auto/panther/panther_classes_version.tsv + - import: ./src/input_adapters/pharos_mysql/dto_adapter.py + class: DTOClassAdapter + credentials: ./src/use_cases/secrets/pharos_credentials.yaml + + - import: ./src/input_adapters/pharos_mysql/dto_adapter.py + class: DTOClassParentEdgeAdapter + credentials: ./src/use_cases/secrets/pharos_credentials.yaml + + - import: ./src/input_adapters/pharos_mysql/dto_adapter.py + class: ProteinDTOClassAdapter + credentials: ./src/use_cases/secrets/pharos_credentials.yaml + output_adapters: - import: ./src/output_adapters/arango_output_adapter.py class: ArangoOutputAdapter diff --git a/src/use_cases/pharos/pharos_aql_post.yaml b/src/use_cases/pharos/pharos_aql_post.yaml index 
593f463..4de6638 100644 --- a/src/use_cases/pharos/pharos_aql_post.yaml +++ b/src/use_cases/pharos/pharos_aql_post.yaml @@ -21,6 +21,12 @@ resolvers: input_adapters: + - import: ./src/input_adapters/pharos_arango/set_preferred_symbol.py + class: SetPreferredSymbolAdapter + kwargs: + database_name: *database_name + credentials: *source_credentials + - import: ./src/input_adapters/pharos_arango/set_ligand_activity_flag.py class: SetLigandActivityFlagAdapter kwargs: diff --git a/src/use_cases/pharos/target_graph.yaml b/src/use_cases/pharos/target_graph.yaml index f240502..23e1745 100644 --- a/src/use_cases/pharos/target_graph.yaml +++ b/src/use_cases/pharos/target_graph.yaml @@ -379,6 +379,18 @@ input_adapters: sequence_classification_file_path: ./input_files/auto/panther/PTHR19.0_human version_file_path: ./input_files/auto/panther/panther_classes_version.tsv + - import: ./src/input_adapters/pharos_mysql/dto_adapter.py + class: DTOClassAdapter + credentials: ./src/use_cases/secrets/pharos_credentials.yaml + + - import: ./src/input_adapters/pharos_mysql/dto_adapter.py + class: DTOClassParentEdgeAdapter + credentials: ./src/use_cases/secrets/pharos_credentials.yaml + + - import: ./src/input_adapters/pharos_mysql/dto_adapter.py + class: ProteinDTOClassAdapter + credentials: ./src/use_cases/secrets/pharos_credentials.yaml + output_adapters: - import: ./src/output_adapters/arango_output_adapter.py class: ArangoOutputAdapter diff --git a/src/use_cases/pharos/target_graph_aql_post.yaml b/src/use_cases/pharos/target_graph_aql_post.yaml index a53d995..7f0b011 100644 --- a/src/use_cases/pharos/target_graph_aql_post.yaml +++ b/src/use_cases/pharos/target_graph_aql_post.yaml @@ -14,6 +14,12 @@ resolvers: - Protein input_adapters: + - import: ./src/input_adapters/pharos_arango/set_preferred_symbol.py + class: SetPreferredSymbolAdapter + kwargs: + database_name: *database_name + credentials: *source_credentials + - import: ./src/input_adapters/pharos_arango/expand_IDG_families.py 
class: ExpandIDGFamilies kwargs: diff --git a/src/use_cases/pharos/tcrd.yaml b/src/use_cases/pharos/tcrd.yaml index 0686df9..d90187d 100644 --- a/src/use_cases/pharos/tcrd.yaml +++ b/src/use_cases/pharos/tcrd.yaml @@ -117,6 +117,24 @@ input_adapters: kwargs: database_name: *source_database + - import: ./src/input_adapters/pharos_arango/tcrd/dto.py + class: DTOClassAdapter + credentials: *source_credentials + kwargs: + database_name: *source_database + + - import: ./src/input_adapters/pharos_arango/tcrd/dto.py + class: DTOClassParentAdapter + credentials: *source_credentials + kwargs: + database_name: *source_database + + - import: ./src/input_adapters/pharos_arango/tcrd/dto.py + class: ProteinDTOClassAdapter + credentials: *source_credentials + kwargs: + database_name: *source_database + - import: ./src/input_adapters/pharos_arango/tcrd/keyword.py class: ProteinKeywordAdapter credentials: *source_credentials diff --git a/src/use_cases/working.yaml b/src/use_cases/working.yaml index 9809511..e446415 100644 --- a/src/use_cases/working.yaml +++ b/src/use_cases/working.yaml @@ -34,6 +34,18 @@ input_adapters: file_path: ./input_files/manual/target_graph/protein_ids.tsv collapse_reviewed_targets: true + - import: ./src/input_adapters/file_uniprot/protein_adapter.py + class: ProteinAdapter + kwargs: + file_path: ./input_files/auto/uniprot/uniprot-human-reviewed.json.gz + version_file_path: ./input_files/auto/uniprot/uniprot_version.tsv + + - import: ./src/input_adapters/pharos_arango/set_preferred_symbol.py + class: SetPreferredSymbolAdapter + kwargs: + database_name: test_pharos + credentials: *destination_credentials + - import: ./src/input_adapters/panther/panther_classes.py class: PantherClassesAdapter kwargs: @@ -42,6 +54,18 @@ input_adapters: sequence_classification_file_path: ./input_files/auto/panther/PTHR19.0_human version_file_path: ./input_files/auto/panther/panther_classes_version.tsv + - import: ./src/input_adapters/pharos_mysql/dto_adapter.py + class: 
DTOClassAdapter + credentials: ./src/use_cases/secrets/pharos_credentials.yaml + + - import: ./src/input_adapters/pharos_mysql/dto_adapter.py + class: DTOClassParentEdgeAdapter + credentials: ./src/use_cases/secrets/pharos_credentials.yaml + + - import: ./src/input_adapters/pharos_mysql/dto_adapter.py + class: ProteinDTOClassAdapter + credentials: ./src/use_cases/secrets/pharos_credentials.yaml + output_adapters: - import: ./src/output_adapters/arango_output_adapter.py class: ArangoOutputAdapter diff --git a/src/use_cases/working_mysql.yaml b/src/use_cases/working_mysql.yaml index c5dd6d8..cbd62b4 100644 --- a/src/use_cases/working_mysql.yaml +++ b/src/use_cases/working_mysql.yaml @@ -77,6 +77,24 @@ input_adapters: kwargs: database_name: *source_database + - import: ./src/input_adapters/pharos_arango/tcrd/dto.py + class: DTOClassAdapter + credentials: *source_credentials + kwargs: + database_name: *source_database + + - import: ./src/input_adapters/pharos_arango/tcrd/dto.py + class: DTOClassParentAdapter + credentials: *source_credentials + kwargs: + database_name: *source_database + + - import: ./src/input_adapters/pharos_arango/tcrd/dto.py + class: ProteinDTOClassAdapter + credentials: *source_credentials + kwargs: + database_name: *source_database + # - import: ./src/input_adapters/pharos_arango/tcrd/keyword.py # class: ProteinKeywordAdapter # credentials: *source_credentials