diff --git a/.gitignore b/.gitignore index c54895d..e3cdb38 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,4 @@ coverage.xml lib .pybiomart.sqlite proxy.log +pounce_audit \ No newline at end of file diff --git a/designs/drugcentral_indications_discovery_2026_04.md b/designs/drugcentral_indications_discovery_2026_04.md new file mode 100644 index 0000000..02feec1 --- /dev/null +++ b/designs/drugcentral_indications_discovery_2026_04.md @@ -0,0 +1,129 @@ +# DrugCentral Indications Summary + +Date: 2026-04-14 + +## Goal + +Add DrugCentral indications to the Pharos working graph and validate the TCRD MySQL conversion path. + +## Scope Chosen + +- Source: current DrugCentral PostgreSQL `omop_relationship_doid_view` +- Relationship scope: `indication` only +- Protein gating: only drugs with human target activity in `act_table_full` +- Graph outputs: + - `Disease` + - `ProteinDiseaseEdge` +- Working config only: + - graph in `src/use_cases/working.yaml` + - MySQL in `src/use_cases/working_mysql.yaml` + +Reference counts from discovery: + +- Raw source counts: + - current DrugCentral `indication` rows: `12047` + - current indication structures: `2723` + - current indication rows with `UMLS`: `9562` +- Legacy downstream count: + - `pharos319.disease` rows with `dtype='DrugCentral Indication'`: `13919` +- Working graph count: + - `test_pharos.ProteinDiseaseEdge` rows: `41663` +- Initial working MySQL count: + - `pharos400_working.disease` rows with `dtype='DrugCentral Indication'`: `62140` + +Note: + +- the raw DrugCentral source count is not directly comparable to the MySQL `disease` table count +- the source count is pre-expansion +- the MySQL `disease` count is post-expansion across protein targets + +Deferred: + +- contraindication / off-label use +- approval metadata +- full Pharos config promotion + +## Key Design Decisions + +- Use `UMLS` as the primary source disease ID. + - Node Normalizer coverage was stronger for `UMLS` than `SNOMEDCT`. 
+ - `SNOMEDCT` rows were already paired with `UMLS`, so it was not needed as a fallback. +- Do not rely on DrugCentral `DOID` as a fallback. + - It did not add useful extra coverage once `UMLS` / `SNOMEDCT` were considered. +- Preserve text-only indication concepts. + - If a row has `UMLS`, emit `Disease.id = UMLS:<CUI>`. + - If a row has no `UMLS`, emit a stable local ID `DrugCentral:INDICATION:<hash>` (the first 16 hex characters of the SHA-1 of the trimmed, lower-cased concept name). +- Preserve source metadata in edge details: + - `drug_name` + - `snomed_id` + - `doid` + +Resolver investigation summary: + +- We checked the current Node Normalizer integration used by Pharos / target_graph. +- `UMLS` resolved at a higher rate than `SNOMEDCT` for DrugCentral indication IDs. +- Where both existed, `UMLS` and `SNOMEDCT` usually normalized to the same concept, but not always. +- That was enough to justify a deterministic rule: + - use `UMLS` as the emitted disease ID + - keep `SNOMEDCT` as metadata + - avoid mixing the two at adapter time + +## Legacy Comparison + +Legacy `pharos319` DrugCentral indications: + +- lived in `disease` with `dtype = 'DrugCentral Indication'` +- populated `name` and `drug_name` +- sometimes populated `did` +- often had no discrete disease ID at all + +This mattered for first-pass design because legacy Pharos did preserve text-only indication names downstream. 
+ +Additional legacy comparison: + +- among the Pharos disease association sources reviewed in `pharos319`, DrugCentral was the only one with true name-only disease association rows that had neither `did` nor `mondoid` + +## What Was Implemented + +Code/config changes: + +- new DrugCentral indication adapter +- `DiseaseAssociationDetail` extended for DrugCentral metadata +- `working.yaml` updated to ingest DrugCentral indications into `test_pharos` +- `working_mysql.yaml` updated to read disease data from `test_pharos` +- `pharos.yaml` and `target_graph.yaml` also register the new adapter (this goes beyond the working-config-only scope listed under "Scope Chosen") +- TCRD converter updated so: + - `disease.did = detail.source_id` (no fallback to the resolved graph ID, so associations without a source ID leave `did` empty) + - `disease.drug_name = detail.drug_name` + +## Graph Validation + +Validation of `test_pharos` showed: + +- DrugCentral indication diseases were loaded +- many `UMLS` diseases normalized to `MONDO` +- text-only indication concepts were preserved as local `DrugCentral:INDICATION:*` disease nodes + +Representative outcomes: + +- `Anesthesia for cesarean section` preserved +- `Local anesthesia` preserved +- `Alcoholism` normalized to `MONDO` +- `Metastatic Breast Carcinoma` preserved as a local DrugCentral indication concept + +## MySQL Validation + +Initial `pharos400_working` run showed the main converter issues: + +- `disease.drug_name` was not being populated +- local graph IDs were leaking into `disease.did` + +Those converter issues were patched. + +## Remaining Follow-Up + +- Recheck `pharos400_working` after the latest converter rerun: + - `disease.did` + - `disease.drug_name` + - `disease.mondoid` + - `ncats_disease` preservation of text-only names +- Compare this pattern with other disease association ingests such as CTD to decide whether the local-ID strategy should remain DrugCentral-specific or become a broader convention. 
diff --git a/designs/graph_views_metadata.md b/designs/graph_views_metadata.md index 963d753..8f4d28f 100644 --- a/designs/graph_views_metadata.md +++ b/designs/graph_views_metadata.md @@ -5,6 +5,7 @@ Add graph-owned live views/exports that `qa_browser` can discover and execute generically. + The first target is `current_tdls`. Because TDL fields are populated in post-processing, that view should be declared in post-processing YAML such as [`pharos_aql_post.yaml`](/Users/kelleherkj/IdeaProjects/IFX_ODIN/src/use_cases/pharos/pharos_aql_post.yaml). diff --git a/playbooks/ingest_playbook.md b/playbooks/ingest_playbook.md index d2de6e0..d6de4d0 100644 --- a/playbooks/ingest_playbook.md +++ b/playbooks/ingest_playbook.md @@ -58,18 +58,29 @@ Provide a repeatable workflow for adding a new data source to the target graph i - Note that the TCRD format is not always a natural fit, but often captures important historical scope. - For Pharos-related sources, inspect the old loader implementation in the TCRD repository when it helps explain legacy field choices or filtering. -6) **Review data that makes it into TCRD** +6) **Check identifier normalization coverage** + - Before adapter implementation, inspect how the configured resolver path will normalize the source IDs. + - For Pharos / target_graph disease ingest, check the current Node Normalizer integration in `src/id_resolvers/node_normalizer.py`. + - When the source offers multiple disease identifier families, profile each candidate family separately (for example `UMLS`, `SNOMEDCT`, `DOID`) rather than assuming the most ontology-like one is best. + - Query the resolver service metadata when helpful, for example Node Normalizer `GET /get_curie_prefixes`, to confirm accepted prefixes. + - Measure real coverage on distinct source IDs, not just a few spot checks. 
+ - Record both: + - percent of source IDs that resolve at all + - representative canonical prefixes returned by the resolver + - Use these findings to choose what raw source ID the adapter should emit and leave canonicalization to the resolver layer whenever possible. + +7) **Review data that makes it into TCRD** - Currently Pharos uses pharos319. - Review the relevant tables and row counts to understand what was ingested previously. - Compare previous ingest output against the current raw payload to separate legacy limitations from current source reality. - When relevant, also inspect `pharos400` to understand what the newer MySQL path already captures or still misses. -7) **Pause and propose the implementation plan** +8) **Pause and propose the implementation plan** - Summarize the intended adapter scope, node/edge model, resolver dependencies, and validation plan. - Keep the first pass intentionally minimal. - Get user confirmation before making code changes. -8) **Implement an InputAdapter** +9) **Implement an InputAdapter** - Inherit from `src/interfaces/input_adapter.py` (or `FlatFileAdapter`). - Implement: - `get_all` @@ -78,18 +89,18 @@ Provide a repeatable workflow for adding a new data source to the target graph i - Emit `Node` / `Relationship` models that match the schema. - Keep adapters focused on source parsing and structural graph emission. -9) **Map to the data model** +10) **Map to the data model** - Confirm existing node/edge classes or add new ones in `src/models/`. - Use stable IDs and consistent prefixes. - Avoid speculative parsing when source text is ambiguous; preserve the source text when parsing would be lossy. - Keep source-specific payload that may merge later inside `details` structures instead of flattening it into top-level edge fields. -10) **Wire configuration into YAML** +11) **Wire configuration into YAML** - Add the adapter to `src/use_cases/working.yaml` first. - Pass file paths and version metadata file paths via `kwargs`. 
- Only after the working ingest is validated, promote the finalized configuration into `src/use_cases/pharos/target_graph.yaml`. -11) **Validate the working ingest** +12) **Validate the working ingest** - Ask the user to run the working ETL path. - Validate that counts, labels, IDs, provenance, and key edge endpoints look correct. - Validate that representative input-file records land where expected in the working graph and, when available, in the working MySQL output. @@ -100,7 +111,7 @@ Provide a repeatable workflow for adding a new data source to the target graph i - which source-specific columns are populated in `pharos319` but still empty in the working MySQL output - whether graph data is present in the working graph but not yet mapped into downstream tables -12) **Update the design document** +13) **Update the design document** - Revise the design doc to reflect what actually ended up in the code: - Final field mappings and any decisions that changed during implementation - Actual node/edge counts produced diff --git a/src/input_adapters/drug_central/drug_indication.py b/src/input_adapters/drug_central/drug_indication.py new file mode 100644 index 0000000..78a2e5a --- /dev/null +++ b/src/input_adapters/drug_central/drug_indication.py @@ -0,0 +1,118 @@ +from collections import OrderedDict +import hashlib +from typing import Generator, List, Union + +from sqlalchemy import text + +from src.constants import DataSourceName, Prefix +from src.input_adapters.drug_central.drug_node import DrugCentralAdapter +from src.interfaces.input_adapter import InputAdapter +from src.models.datasource_version_info import DatasourceVersionInfo +from src.models.disease import Disease, DiseaseAssociationDetail, ProteinDiseaseEdge +from src.models.node import EquivalentId, Node, Relationship +from src.models.protein import Protein + + +class DrugCentralIndicationAdapter(InputAdapter, DrugCentralAdapter): + batch_size = 10000 + + def get_datasource_name(self) -> DataSourceName: + 
return DataSourceName.DrugCentral + + def get_version(self) -> DatasourceVersionInfo: + return self.version_info + + def get_all(self) -> Generator[List[Union[Node, Relationship]], None, None]: + diseases_by_id: OrderedDict[str, Disease] = OrderedDict() + edge_by_key: OrderedDict[tuple[str, str], ProteinDiseaseEdge] = OrderedDict() + seen_detail_keys: set[tuple[str, str, str, str | None, str | None]] = set() + + with self.get_session() as session: + rows = session.execute(text(""" + select distinct + o.struct_id, + s.name as drug_name, + o.concept_name, + o.umls_cui, + o.snomed_conceptid, + o.doid, + a.accession + from omop_relationship_doid_view o + join structures s + on s.id = o.struct_id + join act_table_full a + on a.struct_id = o.struct_id + where o.relationship_name = 'indication' + and a.organism = 'Homo sapiens' + and a.accession is not null + and trim(a.accession) <> '' + order by o.struct_id, o.concept_name, o.umls_cui, a.accession + """)).mappings() + + for row in rows: + disease_name = (row["concept_name"] or "").strip() + if not disease_name: + continue + umls_cui = (row["umls_cui"] or "").strip() or None + disease_id = self._disease_id(disease_name, umls_cui) + + diseases_by_id.setdefault( + disease_id, + Disease( + id=disease_id, + name=disease_name, + ), + ) + + snomed_id = ( + EquivalentId(id=str(row["snomed_conceptid"]).strip(), type=Prefix.SNOMEDCT).id_str() + if row["snomed_conceptid"] is not None and str(row["snomed_conceptid"]).strip() + else None + ) + doid = (row["doid"] or "").strip() or None + drug_name = (row["drug_name"] or "").strip() or None + + for accession in self._split_accessions(row["accession"]): + protein_id = EquivalentId(id=accession, type=Prefix.UniProtKB).id_str() + edge_key = (protein_id, disease_id) + detail_key = (protein_id, disease_id, drug_name or "", snomed_id, doid) + if detail_key in seen_detail_keys: + continue + seen_detail_keys.add(detail_key) + + detail = DiseaseAssociationDetail( + source="DrugCentral 
Indication", + source_id=EquivalentId(id=umls_cui, type=Prefix.UMLS).id_str() if umls_cui else None, + drug_name=drug_name, + snomed_id=snomed_id, + doid=doid, + ) + + if edge_key not in edge_by_key: + edge_by_key[edge_key] = ProteinDiseaseEdge( + start_node=Protein(id=protein_id), + end_node=diseases_by_id[disease_id], + details=[detail], + ) + else: + edge_by_key[edge_key].details.append(detail) + + yield list(diseases_by_id.values()) + edge_values = list(edge_by_key.values()) + for i in range(0, len(edge_values), self.batch_size): + yield edge_values[i:i + self.batch_size] + + @staticmethod + def _split_accessions(raw_accessions: str) -> List[str]: + return [ + token.strip() + for token in raw_accessions.split("|") + if token and token.strip() + ] + + @staticmethod + def _disease_id(disease_name: str, umls_cui: str | None) -> str: + if umls_cui: + return EquivalentId(id=umls_cui, type=Prefix.UMLS).id_str() + digest = hashlib.sha1(disease_name.strip().lower().encode("utf-8")).hexdigest()[:16] + return f"DrugCentral:INDICATION:{digest}" diff --git a/src/input_adapters/pharos_arango/tcrd/disease.py b/src/input_adapters/pharos_arango/tcrd/disease.py index f08db65..b6ac9b8 100644 --- a/src/input_adapters/pharos_arango/tcrd/disease.py +++ b/src/input_adapters/pharos_arango/tcrd/disease.py @@ -63,6 +63,11 @@ def get_all(self) -> Generator[List[Union[Disease, DiseaseParentEdge]], None, No rows.append(disease) yield rows + db = self.get_db() + if not db.has_collection("DiseaseParentEdge"): + yield [] + return + parents = self.runQuery(disease_parent_query()) yield [ DiseaseParentEdge( diff --git a/src/input_adapters/pounce_sheets/pounce_node_builder.py b/src/input_adapters/pounce_sheets/pounce_node_builder.py index 48b2d2a..209cc8f 100644 --- a/src/input_adapters/pounce_sheets/pounce_node_builder.py +++ b/src/input_adapters/pounce_sheets/pounce_node_builder.py @@ -388,7 +388,7 @@ def _experiment_nodes( meta_sheet=ExperimentWorkbook.ProteinDataMetaSheet.name, 
data_sheet=ExperimentWorkbook.ProteinDataSheet.name, analyte_id_col=analyte_id_col, - data_type="protein data", + data_type="raw data", parser=exp_parser ) diff --git a/src/models/disease.py b/src/models/disease.py index e87a275..dd60b91 100644 --- a/src/models/disease.py +++ b/src/models/disease.py @@ -48,6 +48,9 @@ class DiseaseAssociationDetail: confidence: Optional[float] = None zscore: Optional[float] = None url: Optional[str] = None + drug_name: Optional[str] = None + snomed_id: Optional[str] = None + doid: Optional[str] = None def to_dict(self): return asdict(self) diff --git a/src/output_adapters/sql_converters/tcrd.py b/src/output_adapters/sql_converters/tcrd.py index c37100a..e9e4e0a 100755 --- a/src/output_adapters/sql_converters/tcrd.py +++ b/src/output_adapters/sql_converters/tcrd.py @@ -552,7 +552,6 @@ def disease_converter(self, obj: dict) -> List[mysqlDisease]: disease_name = self._disease_name(obj) rows = [] for ordinal, detail in enumerate(self._iter_disease_details(obj)): - source_disease_id = detail.get('source_id') or resolved_disease_id assoc_key = self._disease_assoc_key(obj['start_id'], resolved_disease_id, detail, ordinal) rows.append(mysqlDisease( id=self.resolve_id('disease_assoc', assoc_key), @@ -560,11 +559,12 @@ def disease_converter(self, obj: dict) -> List[mysqlDisease]: protein_id=self.resolve_id('protein', obj['start_id']), name=disease_name, ncats_name=disease_name, - did=source_disease_id, + did=detail.get('source_id'), evidence="|".join(detail.get('evidence_terms') or detail.get('evidence_codes') or []) or None, zscore=detail.get('zscore'), conf=detail.get('confidence'), reference=detail.get('url'), + drug_name=detail.get('drug_name'), mondoid=mondoid, provenance=obj['provenance'], )) diff --git a/src/use_cases/pharos/TCRD_TODO.md b/src/use_cases/pharos/TCRD_TODO.md index 743bfd4..567c4a2 100644 --- a/src/use_cases/pharos/TCRD_TODO.md +++ b/src/use_cases/pharos/TCRD_TODO.md @@ -24,7 +24,7 @@ Each row is a protein-facing 
Pharos/TCRD concept. Data source checkboxes = inges | Concept | Data Sources (→ graph) | Arango Type | MySQL Tables (graph → TCRD) | |---------|------------------------|-------------|------------------------------| | **Protein** | [x] target_graph CSV
[x] UniProt reviewed
[x] JensenLab *(pm_score)*
[x] Antibodypedia *(antibody_count)*
[x] old Pharos MySQL *(idg_family)* | `Protein` | [x] `protein`
[x] `target`
[x] `t2tc`
[x] `alias`
[x] `xref`
[x] `tdl_info` | -| **GeneRif** | [x] target_graph generif CSV | `GeneRif` | [ ] TBD | +| **GeneRif** | [x] target_graph generif CSV | `GeneRif` | [x] `generif` | | **GeneGeneRifEdge** | [x] target_graph generif CSV | `GeneGeneRifEdge` | [x] `generif`
[x] `generif2pubmed`
[x] `protein2pubmed` | | **Tissue** | [x] Uberon OBO | `Tissue` | [x] `uberon` | | **TissueParentEdge** | [x] Uberon OBO | `TissueParentEdge` | [x] `uberon_parent` | @@ -35,13 +35,13 @@ Each row is a protein-facing Pharos/TCRD concept. Data source checkboxes = inges | **Ligand** | [x] IUPHAR
[x] ChEMBL
[x] DrugCentral | `Ligand` | [x] `ncats_ligands` | | **ProteinLigandEdge** | [x] IUPHAR
[x] ChEMBL
[x] DrugCentral | `ProteinLigandEdge` | [x] `ncats_ligand_activity` | | **Disease** | [x] MONDO
[x] Disease Ontology
[x] UniProt curated
[x] CTD
[x] JensenLab DISEASES *(promoted in `pharos.yaml` / `target_graph.yaml`)* | `Disease` | [x] `ncats_disease` | -| **DiseaseParentEdge** | [x] MONDO | `DiseaseParentEdge` | [ ] TBD | -| **DODiseaseParentEdge** | [x] Disease Ontology | `DODiseaseParentEdge` | [ ] TBD | +| **DiseaseParentEdge** | [x] MONDO | `DiseaseParentEdge` | [x] `mondo_parent`
[x] `ancestry_mondo` | +| **DODiseaseParentEdge** | [x] Disease Ontology | `DODiseaseParentEdge` | [x] `do_parent`
[x] `ancestry_do` | | **ProteinDiseaseEdge** | [x] UniProt curated
[x] CTD *(side-lifted from gene associations by the TCRD target resolver)*
[x] JensenLab DISEASES *(Knowledge, Experiment/TIGA, and Text Mining; promoted in `pharos.yaml` / `target_graph.yaml`; working/full configs apply `textmining_min_zscore: 6.0` to stay close to historical Pharos text-mining scope)* | `ProteinDiseaseEdge` | [x] `disease_type`
[x] `disease`
[x] `ncats_d2da` | -| **Pathway** | [x] UniProt
[x] Reactome
[x] WikiPathways
[x] PathwayCommons | `Pathway` | [ ] TBD | -| **PathwayParentEdge** | [x] Reactome | `PathwayParentEdge` | [ ] TBD | +| **Pathway** | [x] UniProt
[x] Reactome
[x] WikiPathways
[x] PathwayCommons | `Pathway` | [x] `pathway` | +| **PathwayParentEdge** | [x] Reactome | `PathwayParentEdge` | not exported to legacy TCRD MySQL | | **ProteinPathwayEdge** | [x] UniProt
[x] Reactome
[x] WikiPathways *(side-lifted from gene associations by the TCRD target resolver)*
[x] PathwayCommons *(side-lifted from gene associations by the TCRD target resolver)* | `ProteinPathwayEdge` | [x] `pathway` | -| **Keyword** | [x] UniProt | `Keyword` | [ ] TBD | +| **Keyword** | [x] UniProt | `Keyword` | [x] `xref` *(UniProt Keyword xtype)* | | **ProteinKeywordEdge** | [x] UniProt | `ProteinKeywordEdge` | [x] `xref` *(UniProt Keyword xtype)* | | | *— post-processing (pharos_aql_post.yaml) —* | | | | **SetLigandActivityFlagAdapter** | [x] computed from graph | updates `meets_idg_cutoff` on `ProteinLigandEdge` | *(via ProteinLigandEdge)* | @@ -55,8 +55,8 @@ These tables are populated directly from ontology source files during the TCRD b | Source Concept | Source Files | TCRD Tables | |---------|------------------------|-------------| -| **MONDO ontology** | [x] `input_files/auto/mondo/mondo.json` | [x] `mondo`
[x] `mondo_parent` | -| **Disease Ontology** | [x] `input_files/auto/disease_ontology/doid.json` | [x] `do`
[x] `do_parent` | +| **MONDO ontology** | [x] `input_files/auto/mondo/mondo.json` | [x] `mondo`
[x] `mondo_parent`
[x] `ancestry_mondo` *(post-processing from `mondo_parent`)* | +| **Disease Ontology** | [x] `input_files/auto/disease_ontology/doid.json` | [x] `do`
[x] `do_parent`
[x] `ancestry_do` *(post-processing from `do_parent`)* | --- diff --git a/src/use_cases/pharos/pharos.yaml b/src/use_cases/pharos/pharos.yaml index 917e0a4..b51bb0d 100644 --- a/src/use_cases/pharos/pharos.yaml +++ b/src/use_cases/pharos/pharos.yaml @@ -209,6 +209,10 @@ input_adapters: class: ProteinDrugEdgeAdapter credentials: ./src/use_cases/secrets/drugcentral_credentials.yaml + - import: ./src/input_adapters/drug_central/drug_indication.py + class: DrugCentralIndicationAdapter + credentials: ./src/use_cases/secrets/drugcentral_credentials.yaml + - import: ./src/input_adapters/jensenlab/total_pmscore.py class: TotalPMScoreAdapter kwargs: diff --git a/src/use_cases/pharos/target_graph.yaml b/src/use_cases/pharos/target_graph.yaml index 64c4434..439bfea 100644 --- a/src/use_cases/pharos/target_graph.yaml +++ b/src/use_cases/pharos/target_graph.yaml @@ -251,6 +251,10 @@ input_adapters: class: ProteinDrugEdgeAdapter credentials: ./src/use_cases/secrets/drugcentral_credentials.yaml + - import: ./src/input_adapters/drug_central/drug_indication.py + class: DrugCentralIndicationAdapter + credentials: ./src/use_cases/secrets/drugcentral_credentials.yaml + - import: ./src/input_adapters/jensenlab/total_pmscore.py class: TotalPMScoreAdapter kwargs: diff --git a/src/use_cases/working.yaml b/src/use_cases/working.yaml index 8136cf8..d39f966 100644 --- a/src/use_cases/working.yaml +++ b/src/use_cases/working.yaml @@ -3,50 +3,45 @@ minio_credentials: &minio_credentials ./src/use_cases/secrets/ifxdev_minio.yaml ramp_sqlite_file: &ramp_sqlite_file ./input_files/auto/ramp/RaMP_SQLite_v3.0.12.sqlite resolvers: -# - label: ramp_resolver -# import: ./src/id_resolvers/ramp_metabolite_resolver.py -# class: RampMetaboliteIdResolver -# kwargs: -# types: -# - Metabolite -# sqlite_file: *ramp_sqlite_file -# -# - label: ensembl_resolver -# import: ./src/id_resolvers/ensembl_gene_resolver.py -# class: EnsemblGeneResolver -# kwargs: -# types: -# - Gene + - label: translator_nn + import: 
./src/id_resolvers/node_normalizer.py + class: TranslatorNodeNormResolver + kwargs: + types: + - Disease - - label: uniprot_proteins - import: ./src/id_resolvers/uniprot_resolver.py - class: UniProtResolver + - label: tcrd_targets + import: ./src/id_resolvers/target_graph_resolver.py + class: TCRDTargetResolver kwargs: - uniprot_json_path: ./input_files/auto/uniprot/uniprot-human-reviewed.json.gz + canonical_type: Protein + collapse_reviewed_targets: true + no_match_behavior: Skip + gene_file_path: ./input_files/manual/target_graph/gene_ids.tsv + transcript_file_path: ./input_files/manual/target_graph/transcript_ids.tsv + protein_file_paths: + - ./input_files/manual/target_graph/protein_ids.tsv + additional_ids: ./input_files/manual/target_graph/uniprotkb_mapping_20260315.csv types: - Protein + - Gene + - Transcript input_adapters: - - import: ./src/input_adapters/file_uniprot/protein_adapter.py - class: ProteinAdapter + - import: ./src/input_adapters/target_graph/protein_nodes_and_edges.py + class: ProteinNodeAdapter kwargs: - file_path: ./input_files/auto/uniprot/uniprot-human-reviewed.json.gz - version_file_path: ./input_files/auto/uniprot/uniprot_version.tsv + file_path: ./input_files/manual/target_graph/protein_ids.tsv + collapse_reviewed_targets: true - - import: ./src/input_adapters/pounce_sheets/pounce_input_adapter.py - class: PounceInputAdapter - kwargs: - project_file: ./input_files/manual/pounce/dingyin_proteomics/POUNCE_Project_Proteomics_DingyinDT_v2.xlsx - experiment_files: - - ./input_files/manual/pounce/dingyin_proteomics/POUNCE_Experiment_Proteomics_DingyinDT_v2.xlsx - stats_results_files: - - ./input_files/manual/pounce/dingyin_proteomics/POUNCE_StatsResults_Proteomics_DingyinDT_v2.xlsx - validators_config: ./src/use_cases/pounce/pounce_validators.yaml + - import: ./src/input_adapters/drug_central/drug_indication.py + class: DrugCentralIndicationAdapter + credentials: ./src/use_cases/secrets/drugcentral_credentials.yaml output_adapters: - import: 
./src/output_adapters/arango_output_adapter.py class: ArangoOutputAdapter kwargs: - database_name: test_pounce + database_name: test_pharos minio_credentials: *minio_credentials credentials: *destination_credentials diff --git a/src/use_cases/working_mysql.yaml b/src/use_cases/working_mysql.yaml index 0c11d92..2cafd79 100644 --- a/src/use_cases/working_mysql.yaml +++ b/src/use_cases/working_mysql.yaml @@ -20,26 +20,26 @@ resolvers: - Transcript input_adapters: -# - import: ./src/input_adapters/pharos_source_tcrd/ontology_tables.py -# class: MondoTableAdapter -# kwargs: -# file_path: ./input_files/auto/mondo/mondo.json -# -# - import: ./src/input_adapters/pharos_source_tcrd/ontology_tables.py -# class: MondoTableParentEdgeAdapter -# kwargs: -# file_path: ./input_files/auto/mondo/mondo.json -# -# - import: ./src/input_adapters/pharos_source_tcrd/ontology_tables.py -# class: DOTableAdapter -# kwargs: -# file_path: ./input_files/auto/disease_ontology/doid.json -# -# - import: ./src/input_adapters/pharos_source_tcrd/ontology_tables.py -# class: DOTableParentEdgeAdapter -# kwargs: -# file_path: ./input_files/auto/disease_ontology/doid.json -# + - import: ./src/input_adapters/pharos_source_tcrd/ontology_tables.py + class: MondoTableAdapter + kwargs: + file_path: ./input_files/auto/mondo/mondo.json + + - import: ./src/input_adapters/pharos_source_tcrd/ontology_tables.py + class: MondoTableParentEdgeAdapter + kwargs: + file_path: ./input_files/auto/mondo/mondo.json + + - import: ./src/input_adapters/pharos_source_tcrd/ontology_tables.py + class: DOTableAdapter + kwargs: + file_path: ./input_files/auto/disease_ontology/doid.json + + - import: ./src/input_adapters/pharos_source_tcrd/ontology_tables.py + class: DOTableParentEdgeAdapter + kwargs: + file_path: ./input_files/auto/disease_ontology/doid.json + - import: ./src/input_adapters/pharos_arango/tcrd/protein.py class: ProteinAdapter credentials: *source_credentials @@ -71,18 +71,18 @@ input_adapters: # kwargs: # 
database_name: *source_database # -# - import: ./src/input_adapters/pharos_arango/tcrd/disease.py -# class: DiseaseAdapter -# credentials: *source_credentials -# kwargs: -# database_name: *source_database -# associated_only: true -# -# - import: ./src/input_adapters/pharos_arango/tcrd/disease.py -# class: ProteinDiseaseAdapter -# credentials: *source_credentials -# kwargs: -# database_name: *source_database + - import: ./src/input_adapters/pharos_arango/tcrd/disease.py + class: DiseaseAdapter + credentials: *source_credentials + kwargs: + database_name: *source_database + associated_only: true + + - import: ./src/input_adapters/pharos_arango/tcrd/disease.py + class: ProteinDiseaseAdapter + credentials: *source_credentials + kwargs: + database_name: *source_database output_adapters: - import: ./src/output_adapters/mysql_output_adapter.py