diff --git a/docs/parser-yaml-redesign.md b/docs/parser-yaml-redesign.md new file mode 100644 index 00000000..cd610709 --- /dev/null +++ b/docs/parser-yaml-redesign.md @@ -0,0 +1,490 @@ +# Parser YAML DSL Rethink + +## Summary + +The current `parser.yaml` design is too close to an interpreter AST: + +- simple field reads require `op: field` +- literals are mixed between raw scalars and `op: literal` +- iteration uses an implicit `item` context +- aggregation exposes engine internals like `group_key[3]` and `aggregate_value` +- many specs read like serialized execution plans rather than parser recipes + +That is the wrong abstraction level. The YAML should describe **what the parser does**, not **how the evaluator walks an expression tree**. + +The right redesign is to make the spec look like a small ETL recipe: + +- row input selection from Croissant +- row-level filters +- named derived values +- named repeated structures +- node/edge emission +- optional grouping with named keys and named aggregate outputs + +## What Is Wrong With The Current DSL + +### 1. It leaks runtime internals + +`item`, `group_key`, and `aggregate_value` are execution-engine concepts. They are not parser-domain concepts. + +This: + +```yaml +predicate: + op: group_key + index: 3 +``` + +is much harder to understand than: + +```yaml +predicate: $group.predicate +``` + +### 2. It is verbose in the common case + +This: + +```yaml +id: + op: field + name: hgnc_id +``` + +should not exist. The common case should be one token, not three lines. + +### 3. It has too many low-level combinators + +The current design has a growing collection of small operators: + +- `field` +- `literal` +- `item` +- `template` +- `coalesce` +- `prefix` +- `prefix_if_present` +- `split` +- `split_prefix` +- `explode_zip` +- `map_lookup` +- `map_each` +- `fanout_measurements` +- `aggregate_value` +- `group_key` + +That is already a programming language. It just happens to be one with poor ergonomics. + +### 4. 
It hides the dataflow + +In Python, you can usually read top to bottom and see what values are being named and reused. + +In the current YAML: + +- there are few named intermediate concepts +- context switches are implicit +- nested `op` blocks obscure intent +- repeated transforms are duplicated instead of named once + +### 5. It tries to make one generic DSL cover very different parser shapes + +HGNC and BindingDB are not the same parser shape: + +- HGNC is row-local with one zipped fanout +- BindingDB is row fanout plus grouping plus reduction + +Trying to represent both with one uniform expression-tree DSL is what causes the YAML to become interpreter-shaped. + +## Design Principles For A Better Spec + +### 1. Optimize for readability over genericity + +If a human cannot skim the YAML and explain the parser in one pass, the spec is too low-level. + +### 2. Use named variables, not positional references + +Never expose `item.0` or `group_key[2]` when the same concept can be named `family.id` or `group.parameter`. + +### 3. Make the common case scalar-friendly + +- bare scalars are literals +- `$name` is a reference +- `"prefix:${name}"` is a template + +That removes most uses of `field`, `literal`, and `template`. + +### 4. Prefer recipe primitives over expression primitives + +Keep a small number of parser concepts: + +- `split` +- `zip_split` +- `measurement_fields` +- `parse_qualified_float` +- `mean` +- `neglog10_nm` +- `list` +- `unique` + +Avoid generic AST glue like `item`, `group_key`, `aggregate_value`, and `map_each`. + +### 5. Separate row stage from group stage + +There are really two execution modes: + +- row-local parsing +- grouped aggregation + +The spec should model those directly instead of pretending everything is one kind of transform tree. + +### 6. 
Keep hard parsers in Python + +If a source needs: + +- nested transforms more than one or two levels deep +- custom control flow +- joins across files +- domain-specific exception handling +- nontrivial numeric logic that is unique to one source + +then it should remain a handwritten parser. + +The DSL should only exist where it is materially simpler than Python. + +## Proposed Shape + +### Core Syntax + +- Bare scalar: literal value +- `$field_name`: reference to a row field or previously named value +- `${field_name}` inside a string: template interpolation + +### Top-Level Sections + +```yaml +source_id: +provenance_id: +parsing_version: + +from: + croissant: + dataset_id: + version_from: + distribution: + record_set: + format: + delimiter: + archive_member: + member_pattern: + test_mode_limit: + +where: + - ... + +let: + ... + +emit: + nodes: + edges: + +group: + foreach: + as: + by: + collect: + let: + having: + emit: +``` + +### Semantics + +- `where`: row filters +- `let`: row-scoped derived values +- `emit`: row-scoped nodes and edges +- `group`: optional grouped aggregation stage +- `group.by`: named keys, not ordered tuples +- `group.collect`: reducers +- `group.let`: aggregate-scoped derived values +- `group.emit`: emission from grouped records + +## Proposed HGNC Spec + +```yaml +source_id: HGNC +provenance_id: infores:hgnc +parsing_version: "3.0" + +from: + croissant: hgnc_croissant.json + dataset_id: hgnc + version_from: dataset.version + distribution: hgnc/hgnc_complete_set_tsv + record_set: hgnc/hgnc_complete_set + format: tsv + delimiter: "\t" + test_mode_limit: 5000 + +where: + - exists: gene_group_id + +let: + families: + zip_split: + separator: "|" + fields: + id: gene_group_id + name: gene_group + +emit: + nodes: + - id: $hgnc_id + name: $name + category: biolink:Gene + props: + symbol: $symbol + locus_group: $locus_group + location: $location + + - foreach: families + as: family + id: "HGNC.FAMILY:${family.id}" + name: $family.name + category: 
biolink:GeneFamily + + edges: + - foreach: families + as: family + subject: $hgnc_id + predicate: RO:0002350 + object: "HGNC.FAMILY:${family.id}" + primary_knowledge_source: infores:hgnc + props: + publications: + split: + value: $pubmed_id + separator: "|" + prefix: "PMID:" + knowledge_level: knowledge_assertion + agent_type: manual_agent +``` + +Why this is better: + +- no `op: field` +- no `op: template` +- no `item.0` +- the repeated structure is named `families` +- the loop variable is named `family` + +## Proposed BindingDB Spec + +```yaml +source_id: BINDING-DB +provenance_id: infores:bindingdb +parsing_version: "3.0" + +from: + croissant: bindingdb_croissant.json + dataset_id: bindingdb + version_from: dataset.version + distribution: bindingdb/all_tsv_fileset + record_set: bindingdb/binding_data + format: tsv + delimiter: "\t" + archive_member: BindingDB_All.tsv + test_mode_limit: 10000 + +where: + - exists: pubchem_cid + - exists: chain1_swissprot_primary_id + +let: + ligand_id: "PUBCHEM.COMPOUND:${pubchem_cid}" + protein_id: "UniProtKB:${chain1_swissprot_primary_id}" + measurements: + measurement_fields: + - field: ki_nm + parameter: pKi + predicate: biolink:inhibits + - field: ic50_nm + parameter: pIC50 + predicate: CTD:decreases_activity_of + - field: kd_nm + parameter: pKd + predicate: RO:0002436 + - field: ec50_nm + parameter: pEC50 + predicate: CTD:increases_activity_of + +group: + foreach: measurements + as: measurement + + by: + ligand: $ligand_id + protein: $protein_id + parameter: $measurement.parameter + predicate: $measurement.predicate + + collect: + supporting_affinities_nm: + list: + parse_qualified_float: + value: $measurement.value + reject_operators: [">"] + strip_operators: ["<"] + minimum_exclusive: 0 + + publications: + unique: + prefix_if_present: + value: $pmid + prefix: "PMID:" + when: + parse_qualified_float: + value: $measurement.value + reject_operators: [">"] + strip_operators: ["<"] + minimum_exclusive: 0 + + pubchem_assay_ids: 
+ unique: + prefix_if_present: + value: $pubchem_aid + prefix: "PUBCHEM.AID:" + when: + parse_qualified_float: + value: $measurement.value + reject_operators: [">"] + strip_operators: ["<"] + minimum_exclusive: 0 + + patent_ids: + unique: + prefix_if_present: + value: $patent_number + prefix: "PATENT:" + when: + parse_qualified_float: + value: $measurement.value + reject_operators: [">"] + strip_operators: ["<"] + minimum_exclusive: 0 + + let: + average_affinity_nm: + mean: $supporting_affinities_nm + + affinity: + neglog10_nm: + value: $average_affinity_nm + precision: 2 + + having: + - non_empty: supporting_affinities_nm + + emit: + nodes: + - id: $group.ligand + category: biolink:SmallMolecule + + - id: $group.protein + category: biolink:Protein + + edges: + - subject: $group.ligand + predicate: $group.predicate + object: $group.protein + primary_knowledge_source: infores:bindingdb + props: + affinity_parameter: $group.parameter + affinity: $affinity + average_affinity_nm: $average_affinity_nm + supporting_affinities_nm: $supporting_affinities_nm + publications: $publications + pubchem_assay_ids: $pubchem_assay_ids + patent_ids: $patent_ids + knowledge_level: knowledge_assertion + agent_type: manual_agent +``` + +Why this is better: + +- group keys are named, not positional +- aggregate outputs are referenced by name directly +- the BindingDB-specific fanout is expressed as a named parser concept +- the spec reads like the old Python algorithm + +## Specific Changes I Would Make + +### Remove + +- `op: field` +- `op: literal` +- `op: item` +- `op: template` +- `op: aggregate_value` +- `op: group_key` +- `op: map_each` + +### Rename + +- `derived_fields` -> `let` +- `aggregate` -> `group` +- `row_filters` -> `where` +- `properties` -> `props` + +### Restrict + +- keep only a small set of transforms that correspond to repeated parser concepts +- require every loop variable and group key to be named +- reject specs that require more than one layer of nested 
transform blocks unless explicitly supported + +## Implementation Consequences + +The implementation should stop centering around one generic `evaluate_transform()` that recursively interprets AST nodes. + +Instead, it should have typed handlers for a small number of spec shapes: + +- scalar reference resolution +- template interpolation +- row `let` +- `zip_split` +- `measurement_fields` +- group reducers +- simple aggregate calculations + +That makes validation and error messages much better: + +- unknown reference names can be caught statically +- missing loop variables become schema errors +- invalid group references can be rejected before parsing starts + +## Decision Rule For Future Parsers + +Before adding a new DSL feature, ask: + +1. Does this make at least two parsers simpler? +2. Is the resulting YAML still easier to read than the equivalent Python? +3. Is the construct a parser-domain concept rather than an evaluator-domain concept? + +If any answer is no, keep that parser in Python. + +## Recommendation + +Yes, the DSL should be rethought. + +The main problem is not that it is YAML. The problem is that it currently encodes an execution tree. + +I would move to: + +- concise references with `$name` +- string interpolation for IDs +- `let` for named row-scoped values +- `foreach ... as ...` with named variables +- `group.by` with named keys +- a very small set of high-level parser primitives + +That would preserve the benefits of metadata-driven parsing while making the specs readable by people who are comfortable with ordinary data pipelines, even if they never learn an internal expression language. 
diff --git a/docs/parser-yaml-semantic-table-design.md b/docs/parser-yaml-semantic-table-design.md new file mode 100644 index 00000000..26d2b6ec --- /dev/null +++ b/docs/parser-yaml-semantic-table-design.md @@ -0,0 +1,630 @@ +# Semantic Table Parser Design + +## Summary + +This note explores an alternative to the cleaned-up DSL proposed in `parser-yaml-redesign.md`. + +The central idea is: + +- treat the input as a table +- give the input columns explicit semantics +- reshape the table through a small set of relational operations +- project the resulting rows into nodes and edges + +This is a better fit for ORION's tabular sources than a generic expression-tree DSL. + +The parser spec should read like: + +1. what the source table is +2. what each column means +3. how the table is reshaped into useful row sets +4. how those row sets become graph objects + +## The Problem With The Current Direction + +The current metadata-driven parser design is still too execution-oriented. + +Even after simplifying the syntax, it still tends to answer questions like: + +- what transform operator runs here? +- what is the current loop variable? +- is this value row-scoped or aggregate-scoped? +- what positional index in the group key is this? + +That is not how people think about tables. + +For tabular sources, people think in terms of: + +- columns +- typed values +- lists and aligned lists +- filtering rows +- expanding rows +- grouping rows +- selecting the columns that matter + +That should be the organizing model of the spec. 
+ +## Core Design + +The spec should have four layers: + +- `from`: bind a Croissant record set to a logical source table +- `fields`: assign semantics to source columns and define typed semantic values +- `views`: derive named row sets from the source table or from prior views +- `graph`: define nodes and edges as projections from a named row set + +The important structural rule is: + +- `fields` describe what values mean +- `views` describe how rows are reshaped +- `graph` describes how reshaped rows become graph records + +## Design Principles + +### 1. Put semantics before transformation + +If a column is the local identifier for a gene, that should be declared once in `fields`. + +If a column is a pipe-delimited list of family IDs, that should also be declared once in `fields`. + +The graph section should not have to rediscover those facts. + +### 2. Keep graph definitions thin + +Node and edge definitions should be the easiest part of the spec to read. + +They should mostly answer: + +- which row set is this built from? +- what is the subject ID? +- what is the object ID? +- what is the category or predicate? +- which row values become properties? + +They should not contain grouping logic, list zipping, or custom parsing machinery. + +### 3. Make row reshaping explicit and named + +Any nontrivial parser creates intermediate conceptual tables, even if the old Python never named them. + +For example: + +- HGNC has a row set of `gene_family_memberships` +- BindingDB has a row set of `measurement_rows` +- BindingDB also has a grouped row set of `aggregated_measurements` + +Those should be named views. + +### 4. Prefer a fixed relational toolbox over a generic expression language + +The view layer should support a constrained set of operations such as: + +- `where` +- `select` +- `unnest` +- `unnest_zip` +- `unpivot` +- `group_by` +- `aggregates` +- `distinct` + +This is still declarative, but it matches how tabular parsers are actually written. + +### 5. 
Keep arbitrary logic out of the spec + +If a parser needs complex control flow, custom joins, or deeply nested special cases, it should remain handwritten Python. + +This design is for the large class of row-oriented tabular sources that can be described as table reshaping plus graph projection. + +## Proposed Top-Level Shape + +```yaml +source_id: +provenance_id: +parsing_version: + +from: + croissant: + dataset_id: + version_from: + distribution: + record_set: + format: + delimiter: + archive_member: + member_pattern: + test_mode_limit: + +fields: + ... + +views: + ... + +graph: + nodes: + edges: +``` + +## `from`: Source Table Binding + +`from` should bind the parser to exactly one Croissant-backed source table: + +- Croissant path or URL +- dataset ID +- version extraction policy +- distribution +- record set +- reader options + +This section is structural, not semantic. + +## `fields`: Semantic Model Of The Input Table + +`fields` should describe the meaning of the table columns and expose them as logical semantic values. + +The important shift is that field entries are not just aliases. They are typed semantic declarations. 
+ +Examples of useful field kinds: + +- scalar property +- identifier +- label +- optional identifier +- list +- zipped list +- measurement value columns +- normalized numeric value + +### Field examples + +```yaml +fields: + gene_id: + column: hgnc_id + kind: identifier + prefix: HGNC + + gene_name: + column: name + kind: label + + symbol: + column: symbol + kind: property + + publications: + column: pubmed_id + kind: list + separator: "|" + prefix: PMID: + + families: + kind: zipped_list + separator: "|" + columns: + id: + column: gene_group_id + kind: identifier + prefix: HGNC.FAMILY + name: + column: gene_group + kind: label +``` + +This says: + +- `gene_id` is not just a raw column; it is a graph identifier +- `publications` is not just a string; it is a normalized list +- `families` is not just two columns; it is a repeated aligned record structure + +That is the level of abstraction the parser author actually thinks in. + +## `views`: Named Relational Row Sets + +`views` are where parser logic lives. + +Each view: + +- starts from `source` or a prior view +- applies a small set of supported row-shaping operations +- exposes a new row schema + +The key point is that a view should describe a table, not a control-flow program. 
+ +### Recommended operations + +- `where`: row filtering +- `select`: choose or rename columns +- `unnest`: expand a repeated field into one row per item +- `unnest_zip`: expand aligned lists into structured repeated rows +- `unpivot`: turn multiple measure columns into repeated measurement rows +- `group_by`: define named grouping keys +- `aggregates`: list, unique, count, mean, first_non_null +- `let`: aggregate-scoped derived values +- `distinct`: remove duplicate rows if needed + +### Validation expectations + +Because views are typed row sets, the engine can validate: + +- referenced fields exist in the source row schema +- `unnest_zip` inputs have compatible field definitions +- `group_by` names are unique +- aggregate outputs do not shadow reserved names +- graph sections only reference columns actually produced by the view + +## `graph`: Projection To Nodes And Edges + +`graph` should be simple. + +Node and edge specs should each name a row source and map columns from that row source into KGX objects. + +### Node definition + +A node definition should answer: + +- what row set does this come from? +- which column is the node ID? +- what category does it have? +- which column is the human-readable name? +- which columns become properties? + +### Edge definition + +An edge definition should answer: + +- what row set does this come from? +- which columns are subject and object IDs? +- what is the predicate? +- what is the primary knowledge source? +- which columns become edge properties? + +The graph section should not perform row grouping, fanout, or low-level parsing. + +## Proposed Reference Style + +To keep references obvious, use `$name` for row-value references and `${name}` inside strings for interpolation. + +Examples: + +- `$gene_id` +- `$family.name` +- `"HGNC.FAMILY:${family.id}"` + +This avoids the old `op: field`, `op: item`, and `op: template` patterns without introducing arbitrary embedded code. 
+ +## Worked Example: HGNC + +HGNC is a good fit for this design because the row logic is simple: + +- each source row describes one gene +- one pair of pipe-delimited columns describes repeated family membership rows +- graph projection is then straightforward + +```yaml +source_id: HGNC +provenance_id: infores:hgnc +parsing_version: "3.0" + +from: + croissant: hgnc_croissant.json + dataset_id: hgnc + version_from: dataset.version + distribution: hgnc/hgnc_complete_set_tsv + record_set: hgnc/hgnc_complete_set + format: tsv + delimiter: "\t" + test_mode_limit: 5000 + +fields: + gene_id: + column: hgnc_id + kind: identifier + prefix: HGNC + + gene_name: + column: name + kind: label + + symbol: + column: symbol + kind: property + + locus_group: + column: locus_group + kind: property + + location: + column: location + kind: property + + families: + kind: zipped_list + separator: "|" + columns: + id: + column: gene_group_id + kind: identifier + prefix: HGNC.FAMILY + name: + column: gene_group + kind: label + + publications: + column: pubmed_id + kind: list + separator: "|" + prefix: PMID: + +views: + gene_family_memberships: + from: source + where: + - exists: families + unnest: families as family + select: + gene_id: $gene_id + family_id: $family.id + family_name: $family.name + publications: $publications + +graph: + nodes: + - from: source + id: $gene_id + category: biolink:Gene + name: $gene_name + props: + symbol: $symbol + locus_group: $locus_group + location: $location + + - from: gene_family_memberships + id: $family_id + category: biolink:GeneFamily + name: $family_name + + edges: + - from: gene_family_memberships + subject: $gene_id + predicate: RO:0002350 + object: $family_id + primary_knowledge_source: infores:hgnc + props: + publications: $publications + knowledge_level: knowledge_assertion + agent_type: manual_agent +``` + +### Why the HGNC version is better + +- the family structure is declared once in `fields` +- the exploded relationship rows are named 
in `views` +- the graph section is easy to scan +- there are no runtime concepts like `item` or `foreach` + +## Worked Example: BindingDB + +BindingDB is a harder case, but it still fits this model if the parser is treated as two derived tables: + +- `measurement_rows` +- `aggregated_measurements` + +```yaml +source_id: BINDING-DB +provenance_id: infores:bindingdb +parsing_version: "3.0" + +from: + croissant: bindingdb_croissant.json + dataset_id: bindingdb + version_from: dataset.version + distribution: bindingdb/all_tsv_fileset + record_set: bindingdb/binding_data + format: tsv + delimiter: "\t" + archive_member: BindingDB_All.tsv + test_mode_limit: 10000 + +fields: + ligand_id: + column: pubchem_cid + kind: identifier + prefix: PUBCHEM.COMPOUND + + protein_id: + column: chain1_swissprot_primary_id + kind: identifier + prefix: UniProtKB + + publication: + column: pmid + kind: optional_identifier + prefix: PMID: + + pubchem_assay_id: + column: pubchem_aid + kind: optional_identifier + prefix: PUBCHEM.AID: + + patent_id: + column: patent_number + kind: optional_identifier + prefix: PATENT: + + measurements: + kind: value_columns + unit: nM + columns: + ki_nm: + parameter: pKi + predicate: biolink:inhibits + ic50_nm: + parameter: pIC50 + predicate: CTD:decreases_activity_of + kd_nm: + parameter: pKd + predicate: RO:0002436 + ec50_nm: + parameter: pEC50 + predicate: CTD:increases_activity_of + +views: + measurement_rows: + from: source + where: + - exists: ligand_id + - exists: protein_id + unpivot: measurements as measurement + select: + ligand_id: $ligand_id + protein_id: $protein_id + parameter: $measurement.parameter + predicate: $measurement.predicate + affinity_nm: + parse_qualified_float: + value: $measurement.value + reject_operators: [">"] + strip_operators: ["<"] + minimum_exclusive: 0 + publication: $publication + pubchem_assay_id: $pubchem_assay_id + patent_id: $patent_id + + aggregated_measurements: + from: measurement_rows + where: + - exists: 
affinity_nm + group_by: + ligand_id: $ligand_id + protein_id: $protein_id + parameter: $parameter + predicate: $predicate + aggregates: + supporting_affinities_nm: + list: $affinity_nm + publications: + unique: $publication + pubchem_assay_ids: + unique: $pubchem_assay_id + patent_ids: + unique: $patent_id + average_affinity_nm: + mean: $affinity_nm + let: + affinity: + neglog10_nm: + value: $average_affinity_nm + precision: 2 + +graph: + nodes: + - from: aggregated_measurements + id: $ligand_id + category: biolink:SmallMolecule + + - from: aggregated_measurements + id: $protein_id + category: biolink:Protein + + edges: + - from: aggregated_measurements + subject: $ligand_id + predicate: $predicate + object: $protein_id + primary_knowledge_source: infores:bindingdb + props: + affinity_parameter: $parameter + affinity: $affinity + average_affinity_nm: $average_affinity_nm + supporting_affinities_nm: $supporting_affinities_nm + publications: $publications + pubchem_assay_ids: $pubchem_assay_ids + patent_ids: $patent_ids + knowledge_level: knowledge_assertion + agent_type: manual_agent +``` + +### Why the BindingDB version is better + +- the measurement columns are declared once as a semantic field group +- `unpivot` makes the fanout explicit +- `group_by` uses named columns, not tuple positions +- graph emission is a direct projection from a named grouped view + +## Why This Is Better Than A Generic DSL + +This approach has a much better division of labor. 
+ +### The parser author thinks in the right concepts + +The author writes: + +- what columns mean +- what intermediate row sets exist +- how those row sets become graph objects + +The author does not write: + +- evaluator op trees +- loop-context plumbing +- group-key indexes +- ad hoc expression nesting + +### Validation gets stronger + +Because the schema of each row set is explicit, the engine can validate: + +- references to unknown columns +- illegal graph references +- incompatible `unnest` inputs +- impossible `group_by` keys +- unsupported aggregates for a field kind + +### The graph layer becomes stable + +Once the view layer is correct, node and edge definitions are simple and likely reusable. + +That is a good architectural property. + +## What To Avoid + +This design should still stay constrained. + +Avoid: + +- arbitrary SQL strings +- free-form expressions embedded in YAML +- joins unless a real source requires them +- view pipelines so complex that they amount to hidden code + +If a parser needs too many special cases, keep it in Python. + +## Implementation Consequences + +This design implies a different engine structure than the current `evaluate_transform()` model. + +The engine should be organized around: + +- source-table binding +- field normalization +- row-set schema tracking +- view execution with a fixed relational operator set +- graph projection from named row sets + +That is likely easier to validate and easier to explain than a recursive transform interpreter. + +## Recommendation + +If ORION wants a metadata-driven parser system that people can actually read, this semantic-table approach is stronger than the current DSL direction. + +The best shape is: + +- one source table +- one semantic field model +- zero or more named relational views +- one graph projection section + +That keeps the parser spec aligned with how tabular graph loaders are actually understood by humans. 
diff --git a/orion/croissant_resolver.py b/orion/croissant_resolver.py new file mode 100644 index 00000000..293fa8e6 --- /dev/null +++ b/orion/croissant_resolver.py @@ -0,0 +1,174 @@ +from __future__ import annotations + +import json +from dataclasses import dataclass, field +from typing import Any + +import requests + + +def _normalize_croissant_type(raw_type: str | None) -> str | None: + if raw_type is None: + return None + if "/" in raw_type: + return raw_type.rsplit("/", 1)[-1] + return raw_type + + +@dataclass(frozen=True) +class ResolvedDistribution: + identifier: str + distribution_type: str | None + name: str | None + content_url: str | None + encoding_format: str | None + version: str | None + md5: str | None + contained_in: tuple[str, ...] = () + includes: str | None = None + raw: dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class ResolvedField: + identifier: str + name: str + description: str | None + data_type: str | None + source_distribution_id: str | None + source_column: str | None + raw: dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class ResolvedRecordSet: + identifier: str + name: str + description: str | None + fields_by_name: dict[str, ResolvedField] + raw: dict[str, Any] = field(default_factory=dict) + + +class CroissantResolver: + def __init__(self, document: dict[str, Any], source_location: str | None = None): + self.document = document + self.source_location = source_location + + self.dataset_id: str | None = document.get("@id") + self.dataset_name: str | None = document.get("name") + self.dataset_version: str | None = document.get("version") + self.dataset_modified: str | None = document.get("dateModified") + + self.distributions = self._parse_distributions(document.get("distribution", [])) + self.record_sets = self._parse_record_sets(document.get("recordSet", [])) + + @classmethod + def from_path(cls, path: str) -> "CroissantResolver": + with open(path, "r") as croissant_file: + 
document = json.load(croissant_file) + return cls(document=document, source_location=path) + + @classmethod + def from_url(cls, url: str, timeout: int = 30) -> "CroissantResolver": + response = requests.get(url, timeout=timeout) + response.raise_for_status() + return cls(document=response.json(), source_location=url) + + def _parse_distributions(self, distributions: list[dict[str, Any]]) -> dict[str, ResolvedDistribution]: + resolved: dict[str, ResolvedDistribution] = {} + for distribution in distributions: + identifier = distribution.get("@id") + if not identifier: + raise ValueError("Croissant distribution is missing @id.") + + contained_in_ids = tuple( + item["@id"] for item in distribution.get("containedIn", []) if "@id" in item + ) + resolved[identifier] = ResolvedDistribution( + identifier=identifier, + distribution_type=_normalize_croissant_type(distribution.get("@type")), + name=distribution.get("name"), + content_url=distribution.get("contentUrl"), + encoding_format=distribution.get("encodingFormat"), + version=distribution.get("version"), + md5=distribution.get("md5"), + contained_in=contained_in_ids, + includes=distribution.get("includes"), + raw=distribution, + ) + return resolved + + def _parse_record_sets(self, record_sets: list[dict[str, Any]]) -> dict[str, ResolvedRecordSet]: + resolved: dict[str, ResolvedRecordSet] = {} + for record_set in record_sets: + identifier = record_set.get("@id") + name = record_set.get("name") + if not identifier or not name: + raise ValueError("Croissant recordSet is missing @id or name.") + + fields_by_name: dict[str, ResolvedField] = {} + for field_obj in record_set.get("field", []): + field_name = field_obj.get("name") + field_id = field_obj.get("@id") + if not field_name or not field_id: + raise ValueError(f"Croissant field is missing @id or name in record set {identifier}.") + source_info = field_obj.get("source", {}) + source_distribution = None + if "fileObject" in source_info: + source_distribution = 
source_info["fileObject"].get("@id") + elif "fileSet" in source_info: + source_distribution = source_info["fileSet"].get("@id") + + source_column = None + extract_info = source_info.get("extract", {}) + if isinstance(extract_info, dict): + source_column = extract_info.get("column") + + fields_by_name[field_name] = ResolvedField( + identifier=field_id, + name=field_name, + description=field_obj.get("description"), + data_type=field_obj.get("dataType"), + source_distribution_id=source_distribution, + source_column=source_column, + raw=field_obj, + ) + + resolved[identifier] = ResolvedRecordSet( + identifier=identifier, + name=name, + description=record_set.get("description"), + fields_by_name=fields_by_name, + raw=record_set, + ) + return resolved + + def get_distribution(self, identifier: str) -> ResolvedDistribution: + try: + return self.distributions[identifier] + except KeyError as exc: + raise KeyError(f"Croissant distribution not found: {identifier}") from exc + + def get_record_set(self, identifier: str) -> ResolvedRecordSet: + try: + return self.record_sets[identifier] + except KeyError as exc: + raise KeyError(f"Croissant recordSet not found: {identifier}") from exc + + def get_field(self, record_set_id: str, field_name: str) -> ResolvedField: + record_set = self.get_record_set(record_set_id) + try: + return record_set.fields_by_name[field_name] + except KeyError as exc: + raise KeyError( + f"Croissant field '{field_name}' not found in record set '{record_set_id}'." 
+ ) from exc + + def get_field_column_map(self, record_set_id: str) -> dict[str, str]: + record_set = self.get_record_set(record_set_id) + return { + field_name: field.source_column + for field_name, field in record_set.fields_by_name.items() + if field.source_column + } + diff --git a/orion/data_sources.py b/orion/data_sources.py index 32714784..4d833edc 100644 --- a/orion/data_sources.py +++ b/orion/data_sources.py @@ -2,6 +2,7 @@ import importlib BINDING_DB = 'BINDING-DB' +BINDING_DB_CROISSANT = 'BINDING-DB-Croissant' CAM_KP = 'CAM-KP' CCIDB = 'CCIDB' CEBS = 'CEBS' @@ -62,6 +63,7 @@ SOURCE_DATA_LOADER_CLASS_IMPORTS = { BINDING_DB: ("parsers.BINDING.src.loadBINDINGDB", "BINDINGDBLoader"), + BINDING_DB_CROISSANT: ("parsers.metadata_driven.src.loadMetadataDriven", "BINDINGDBCroissantLoader"), CAM_KP: ("parsers.camkp.src.loadCAMKP", "CAMKPLoader"), CCIDB: ("parsers.CCIDB.src.loadCCIDB", "CCIDBLoader"), CEBS: ("parsers.CEBS.src.loadCEBS", "CEBSLoader"), @@ -80,7 +82,7 @@ GTOPDB: ("parsers.gtopdb.src.loadGtoPdb", "GtoPdbLoader"), GWAS_CATALOG: ("parsers.GWASCatalog.src.loadGWASCatalog", "GWASCatalogLoader"), HETIO: ("parsers.hetio.src.loadHetio", "HetioLoader"), - HGNC: ("parsers.hgnc.src.loadHGNC", "HGNCLoader"), + HGNC: ("parsers.metadata_driven.src.loadMetadataDriven", "HGNCCroissantLoader"), HMDB: ("parsers.hmdb.src.loadHMDB", "HMDBLoader"), HUMAN_GOA: ("parsers.GOA.src.loadGOA", "HumanGOALoader"), HUMAN_STRING: ("parsers.STRING.src.loadSTRINGDB", "HumanSTRINGDBLoader"), @@ -167,4 +169,3 @@ def get_data_loader_class(key): class SourceDataLoaderClassFactory(KeyBasedDefaultDict): def __init__(self): super(KeyBasedDefaultDict, self).__init__(get_data_loader_class) - diff --git a/orion/metadata_driven_loader.py b/orion/metadata_driven_loader.py new file mode 100644 index 00000000..e0a31004 --- /dev/null +++ b/orion/metadata_driven_loader.py @@ -0,0 +1,617 @@ +from __future__ import annotations + +import csv +import fnmatch +import json +import os +from collections 
import OrderedDict +from io import TextIOWrapper +from pathlib import Path +from urllib.parse import urlparse +from zipfile import ZipFile + +from orion.croissant_resolver import CroissantResolver, ResolvedDistribution +from orion.kgxmodel import kgxedge, kgxnode +from orion.loader_interface import SourceDataLoader +from orion.parser_spec import ParserSpec, load_parser_spec +from orion.semantic_table import ( + evaluate_expression, + is_missing, + matches_filters, + normalize_field, + parse_expand_spec, +) +from orion.utils import GetData + + +class MetadataDrivenLoader(SourceDataLoader): + parser_spec_path: str | None = None + + def __init__( + self, + test_mode: bool = False, + source_data_dir: str = None, + parser_spec_path: str | None = None, + ): + super().__init__(test_mode=test_mode, source_data_dir=source_data_dir) + + resolved_spec_path = parser_spec_path or self.parser_spec_path + if not resolved_spec_path: + raise ValueError("MetadataDrivenLoader requires a parser_spec_path.") + + self.parser_spec: ParserSpec = load_parser_spec(resolved_spec_path) + self.croissant_resolver: CroissantResolver = self.parser_spec.get_croissant_resolver() + + self.source_id = self.parser_spec.source_id + self.provenance_id = self.parser_spec.provenance_id + self.parsing_version = self.parser_spec.parsing_version + self.description = self.parser_spec.description + self.preserve_unconnected_nodes = self.parser_spec.preserve_unconnected_nodes + self.has_sequence_variants = self.parser_spec.has_sequence_variants + + self.input_distribution = self.croissant_resolver.get_distribution(self.parser_spec.source.distribution) + self.download_distribution = self._resolve_download_distribution(self.input_distribution) + self.download_file_name = self._get_download_file_name(self.download_distribution) + + self._view_cache: dict[str, list[dict[str, object]]] = {} + self._view_usage_counts = self._calculate_view_usage_counts() + + if self.input_distribution.distribution_type == "FileSet": 
+ self.archive_file = self.download_file_name + else: + self.data_file = self.download_file_name + + def _calculate_view_usage_counts(self) -> dict[str, int]: + usage_counts = {view_name: 0 for view_name in self.parser_spec.views.keys()} + + for view_spec in self.parser_spec.views.values(): + parent_name = view_spec.get("from") + if parent_name in usage_counts: + usage_counts[parent_name] += 1 + + for graph_rule in self.parser_spec.graph.nodes + self.parser_spec.graph.edges: + row_source = graph_rule.get("from") + if row_source in usage_counts: + usage_counts[row_source] += 1 + + return usage_counts + + def _resolve_download_distribution(self, distribution: ResolvedDistribution) -> ResolvedDistribution: + if distribution.distribution_type == "FileObject": + return distribution + + if distribution.distribution_type == "FileSet": + if len(distribution.contained_in) != 1: + raise ValueError( + f"FileSet {distribution.identifier} must have exactly one containedIn reference." + ) + return self.croissant_resolver.get_distribution(distribution.contained_in[0]) + + raise ValueError( + f"Unsupported Croissant distribution type for metadata-driven loading: " + f"{distribution.distribution_type}" + ) + + @staticmethod + def _get_download_file_name(distribution: ResolvedDistribution) -> str: + if not distribution.content_url: + raise ValueError(f"Croissant distribution {distribution.identifier} is missing contentUrl.") + parsed_url = urlparse(distribution.content_url) + file_name = Path(parsed_url.path).name + if not file_name: + raise ValueError(f"Could not derive a local file name from URL: {distribution.content_url}") + return file_name + + def get_latest_source_version(self) -> str: + return self.parser_spec.get_source_version(self.croissant_resolver) + + def get_data(self) -> bool: + data_puller = GetData() + data_puller.pull_via_http( + url=self.download_distribution.content_url, + data_dir=self.data_path, + saved_file_name=self.download_file_name, + ) + return True + + 
    def _resolve_input_stream(self):
+        # Return an open, readable text stream for the configured input:
+        # a plain file for FileObject distributions, or a zip-member wrapper
+        # for FileSet distributions. Callers use it as a context manager.
+        if self.input_distribution.distribution_type == "FileObject":
+            return open(os.path.join(self.data_path, self.download_file_name), "r", newline="")
+
+        if self.input_distribution.distribution_type == "FileSet":
+            archive_path = os.path.join(self.data_path, self.download_file_name)
+            zip_file = ZipFile(archive_path, "r")
+            member_name = self._select_archive_member(zip_file)
+            # _ZipTextStream owns both the ZipFile and the member stream and
+            # closes them on __exit__.
+            return _ZipTextStream(zip_file=zip_file, member_name=member_name)
+
+        raise ValueError(f"Unsupported input distribution type: {self.input_distribution.distribution_type}")
+
+    def _select_archive_member(self, zip_file: ZipFile) -> str:
+        # An explicitly configured member name wins over pattern matching.
+        if self.parser_spec.source.archive_member:
+            return self.parser_spec.source.archive_member
+
+        # Otherwise a glob pattern (from the spec, falling back to the
+        # FileSet's `includes`) must match exactly one archive member.
+        pattern = self.parser_spec.source.member_pattern or self.input_distribution.includes
+        if not pattern:
+            raise ValueError(
+                f"No archive member or member pattern specified for {self.input_distribution.identifier}."
+            )
+        return matching_members[0]
+
+    def _iter_raw_rows(self):
+        # Yield one dict per tabular row, re-keyed from raw column headers to
+        # Croissant field names so downstream specs never see column names.
+        field_column_map = self.croissant_resolver.get_field_column_map(self.parser_spec.source.record_set)
+        with self._resolve_input_stream() as input_stream:
+            reader = csv.DictReader(
+                input_stream,
+                delimiter=self.parser_spec.source.delimiter,
+                quotechar=self.parser_spec.source.quotechar,
+            )
+            for raw_row in reader:
+                yield {
+                    field_name: raw_row.get(column_name)
+                    for field_name, column_name in field_column_map.items()
+                }
+
+    def _build_semantic_row(self, raw_row: dict[str, str]) -> dict[str, object]:
+        # Normalize every declared field (trimming, prefixing, splitting, ...)
+        # in one pass via the field specs from the parser spec.
+        return {
+            field_name: normalize_field(raw_row, field_spec)
+            for field_name, field_spec in self.parser_spec.fields.items()
+        }
+
+    def _iter_source_rows(self):
+        # Yield normalized rows that pass the spec-level `where` filters,
+        # honoring the test-mode row limit.
+        test_mode_limit = self.parser_spec.source.test_mode_limit
+        record_counter = 0
+
+        for raw_row in self._iter_raw_rows():
+            record_counter += 1
+            if self.test_mode and test_mode_limit and record_counter > test_mode_limit:
+                break
+
+            try:
+                semantic_row = self._build_semantic_row(raw_row)
+            except Exception:
+                # NOTE(review): rows that fail normalization are silently
+                # dropped here; _count_source_rows is the pass that records
+                # them as skipped. Broad `except` also hides real bugs.
+                continue
+
+            if not matches_filters(self.parser_spec.where, semantic_row):
+                continue
+
+            yield semantic_row
+
+    def _count_source_rows(self) -> tuple[int, int, list[str]]:
+        # Separate counting pass over the same input, used only to produce
+        # (total rows, skipped rows, normalization error messages) metadata.
+        record_counter = 0
+        skipped_record_counter = 0
+        errors: list[str] = []
+        test_mode_limit = self.parser_spec.source.test_mode_limit
+
+        for raw_row in self._iter_raw_rows():
+            record_counter += 1
+            if self.test_mode and test_mode_limit and record_counter > test_mode_limit:
+                break
+
+            try:
+                semantic_row = self._build_semantic_row(raw_row)
+                if not matches_filters(self.parser_spec.where, semantic_row):
+                    skipped_record_counter += 1
+            except Exception as exc:
+                skipped_record_counter += 1
+                errors.append(str(exc))
+
+        return record_counter, skipped_record_counter, errors
+
+    @staticmethod
+    def _apply_select(select_spec: dict[str, object], row: dict[str, object]) -> dict[str, object]:
+        # Project a row down to the named outputs of a view's `select` block.
+        return {
+            output_name: evaluate_expression(output_expr, row)
+            for
output_name, output_expr in select_spec.items() + } + + @staticmethod + def _clean_properties( + raw_properties: dict[str, object], + preserve_empty_keys: set[str] | None = None, + ) -> dict[str, object]: + preserve_empty_keys = preserve_empty_keys or set() + cleaned: dict[str, object] = {} + for key, value in raw_properties.items(): + if value == "" and key in preserve_empty_keys: + cleaned[key] = value + continue + if is_missing(value): + continue + cleaned[key] = value + return cleaned + + @staticmethod + def _evaluate_graph_properties( + property_specs: dict[str, object], + row: dict[str, object], + ) -> tuple[dict[str, object], set[str]]: + properties: dict[str, object] = {} + preserve_empty_keys: set[str] = set() + for property_name, property_spec in property_specs.items(): + preserve_empty = False + property_expr = property_spec + if isinstance(property_spec, dict): + property_expr = property_spec.get("value") + preserve_empty = bool(property_spec.get("preserve_empty", False)) + + value = evaluate_expression(property_expr, row) + if preserve_empty and value == "": + preserve_empty_keys.add(property_name) + properties[property_name] = value + continue + properties[property_name] = value + return properties, preserve_empty_keys + + @staticmethod + def _apply_aggregate( + state: dict[str, object], + aggregate_name: str, + aggregate_spec: dict[str, object], + row: dict[str, object], + ) -> None: + when_expr = aggregate_spec.get("when") + if when_expr is not None: + when_value = evaluate_expression(when_expr, row) + if is_missing(when_value) or when_value is False: + return + + if "list" in aggregate_spec: + value = evaluate_expression(aggregate_spec["list"], row) + if is_missing(value): + return + state.setdefault(aggregate_name, []).append(value) + return + + if "unique" in aggregate_spec: + value = evaluate_expression(aggregate_spec["unique"], row) + if is_missing(value): + return + existing_values = state.setdefault(aggregate_name, []) + if value not in 
existing_values: + existing_values.append(value) + return + + if "count" in aggregate_spec: + count_expr = aggregate_spec["count"] + if count_expr not in (True, None): + value = evaluate_expression(count_expr, row) + if is_missing(value): + return + state[aggregate_name] = state.get(aggregate_name, 0) + 1 + return + + if "first_non_null" in aggregate_spec: + if aggregate_name in state and not is_missing(state[aggregate_name]): + return + value = evaluate_expression(aggregate_spec["first_non_null"], row) + if not is_missing(value): + state[aggregate_name] = value + return + + if "mean" in aggregate_spec: + value = evaluate_expression(aggregate_spec["mean"], row) + if is_missing(value): + return + summary = state.setdefault(aggregate_name, {"sum": 0.0, "count": 0}) + summary["sum"] += float(value) + summary["count"] += 1 + return + + raise ValueError(f"Unsupported aggregate specification: {aggregate_spec}") + + @staticmethod + def _finalize_aggregate_value(value: object) -> object: + if isinstance(value, dict) and set(value.keys()) == {"sum", "count"}: + if value["count"] == 0: + return None + return value["sum"] / value["count"] + return value + + def _execute_row_view(self, parent_rows, view_spec: dict[str, object]): + expand_spec = None + expand_alias = None + if "unnest" in view_spec: + expand_spec = parse_expand_spec(view_spec["unnest"], "unnest") + elif "unpivot" in view_spec: + expand_spec = parse_expand_spec(view_spec["unpivot"], "unpivot") + + if expand_spec is not None: + expand_field, expand_alias = expand_spec + else: + expand_field = None + + for parent_row in parent_rows: + if not matches_filters(view_spec.get("where", []), parent_row): + continue + + working_rows = [parent_row] + if expand_field is not None: + expanded_rows: list[dict[str, object]] = [] + values = parent_row.get(expand_field, []) + if is_missing(values): + continue + for value in values: + expanded_row = dict(parent_row) + expanded_row[expand_alias] = value + 
expanded_rows.append(expanded_row) + working_rows = expanded_rows + + for working_row in working_rows: + if "select" in view_spec: + yield self._apply_select(view_spec["select"], working_row) + else: + yield working_row + + def _execute_group_view(self, parent_rows, view_spec: dict[str, object]) -> list[dict[str, object]]: + groups: "OrderedDict[tuple[object, ...], dict[str, object]]" = OrderedDict() + group_items = list(view_spec["group_by"].items()) + + for parent_row in parent_rows: + if not matches_filters(view_spec.get("where", []), parent_row): + continue + + group_values = OrderedDict( + (group_name, evaluate_expression(group_expr, parent_row)) + for group_name, group_expr in group_items + ) + group_key = tuple(group_values.values()) + group_state = groups.setdefault( + group_key, + { + "keys": dict(group_values), + "aggregates": {}, + }, + ) + + for aggregate_name, aggregate_spec in view_spec.get("aggregates", {}).items(): + self._apply_aggregate(group_state["aggregates"], aggregate_name, aggregate_spec, parent_row) + + finalized_rows: list[dict[str, object]] = [] + for group_state in groups.values(): + row = dict(group_state["keys"]) + for aggregate_name, aggregate_value in group_state["aggregates"].items(): + row[aggregate_name] = self._finalize_aggregate_value(aggregate_value) + + for let_name, let_expr in view_spec.get("let", {}).items(): + row[let_name] = evaluate_expression(let_expr, row) + + if not matches_filters(view_spec.get("having", []), row): + continue + finalized_rows.append(row) + + return finalized_rows + + def _should_cache_view(self, view_name: str, view_spec: dict[str, object]) -> bool: + if "group_by" in view_spec: + return True + return self._view_usage_counts.get(view_name, 0) > 1 + + def _iter_rows_for(self, row_source: str): + if row_source == "source": + return self._iter_source_rows() + + if row_source in self._view_cache: + return iter(self._view_cache[row_source]) + + view_spec = self.parser_spec.views[row_source] + parent_rows 
= self._iter_rows_for(view_spec["from"]) + + if "group_by" in view_spec: + rows = self._execute_group_view(parent_rows, view_spec) + else: + rows = self._execute_row_view(parent_rows, view_spec) + if self._should_cache_view(row_source, view_spec): + rows = list(rows) + + if isinstance(rows, list): + self._view_cache[row_source] = rows + return iter(rows) + + return rows + + def _emit_nodes(self) -> None: + for node_rule in self.parser_spec.graph.nodes: + for row in self._iter_rows_for(node_rule["from"]): + identifier = evaluate_expression(node_rule["id"], row) + if is_missing(identifier): + continue + + node_name = evaluate_expression(node_rule.get("name"), row) or "" + categories_expr = node_rule.get("categories", node_rule.get("category")) + categories = evaluate_expression(categories_expr, row) if categories_expr is not None else None + if isinstance(categories, str): + categories = [categories] + + properties, preserve_empty_keys = self._evaluate_graph_properties(node_rule.get("props", {}), row) + + self.output_file_writer.write_kgx_node( + kgxnode( + identifier=identifier, + name=node_name, + categories=categories, + nodeprops=self._clean_properties(properties, preserve_empty_keys), + ) + ) + + def _emit_edges(self) -> None: + for edge_rule in self.parser_spec.graph.edges: + for row in self._iter_rows_for(edge_rule["from"]): + subject_id = evaluate_expression(edge_rule["subject"], row) + predicate = evaluate_expression(edge_rule["predicate"], row) + object_id = evaluate_expression(edge_rule["object"], row) + if any(is_missing(value) for value in (subject_id, predicate, object_id)): + continue + + primary_knowledge_source = None + if "primary_knowledge_source" in edge_rule: + primary_knowledge_source = evaluate_expression(edge_rule["primary_knowledge_source"], row) + + properties, preserve_empty_keys = self._evaluate_graph_properties(edge_rule.get("props", {}), row) + + self.output_file_writer.write_kgx_edge( + kgxedge( + subject_id=subject_id, + 
object_id=object_id, + predicate=predicate, + primary_knowledge_source=primary_knowledge_source, + edgeprops=self._clean_properties(properties, preserve_empty_keys), + ) + ) + + def parse_data(self) -> dict: + if self.output_file_writer is None: + raise RuntimeError("MetadataDrivenLoader.parse_data() requires an initialized output_file_writer.") + + record_counter, skipped_record_counter, errors = self._count_source_rows() + self._view_cache.clear() + self._emit_nodes() + self._emit_edges() + + metadata = { + "num_source_lines": record_counter, + "unusable_source_lines": skipped_record_counter, + } + if errors: + metadata["errors"] = errors + return metadata + + def load(self, nodes_output_file_path: str, edges_output_file_path: str): + metadata = super().load(nodes_output_file_path, edges_output_file_path) + self._rewrite_outputs(Path(nodes_output_file_path), Path(edges_output_file_path)) + return self._rewrite_metadata(metadata) + + def _rewrite_outputs(self, nodes_path: Path, edges_path: Path) -> None: + output_spec = self.parser_spec.output + if not output_spec: + return + + node_property_order = output_spec.get("node_property_order", []) + edge_property_order = output_spec.get("edge_property_order", []) + + if edge_property_order: + edge_records = self._read_jsonl(edges_path) + self._write_jsonl( + edges_path, + [ + self._order_record_keys( + record, + base_keys=["subject", "predicate", "object", "primary_knowledge_source", "aggregator_knowledge_sources"], + property_order=edge_property_order, + ) + for record in edge_records + ], + ) + + if output_spec.get("node_order") == "edge_encounter" or node_property_order: + node_records = self._read_jsonl(nodes_path) + node_map = {record["id"]: record for record in node_records} + ordered_ids = self._ordered_node_ids(node_records, edges_path, output_spec.get("node_order")) + ordered_nodes = [ + self._order_record_keys( + node_map[node_id], + base_keys=["id", "name", "category"], + property_order=node_property_order, + 
                )
+                for node_id in ordered_ids
+            ]
+            self._write_jsonl(nodes_path, ordered_nodes)
+
+    @staticmethod
+    def _read_jsonl(path: Path) -> list[dict[str, object]]:
+        # Read a JSON-lines file: one JSON object per line.
+        with path.open("r") as handle:
+            return [json.loads(line) for line in handle]
+
+    @staticmethod
+    def _write_jsonl(path: Path, records: list[dict[str, object]]) -> None:
+        # Rewrite a JSON-lines file in place, one compact object per line.
+        with path.open("w") as handle:
+            for record in records:
+                handle.write(json.dumps(record))
+                handle.write("\n")
+
+    @staticmethod
+    def _order_record_keys(
+        record: dict[str, object],
+        base_keys: list[str],
+        property_order: list[str],
+    ) -> dict[str, object]:
+        # Rebuild a record with deterministic key order: well-known base keys
+        # first, then the configured property order, then any remaining keys
+        # in their original insertion order. Values are never changed.
+        ordered: dict[str, object] = {}
+        for key in base_keys:
+            if key in record:
+                ordered[key] = record[key]
+        for key in property_order:
+            if key in record and key not in ordered:
+                ordered[key] = record[key]
+        for key, value in record.items():
+            if key not in ordered:
+                ordered[key] = value
+        return ordered
+
+    @staticmethod
+    def _ordered_node_ids(
+        node_records: list[dict[str, object]],
+        edges_path: Path,
+        node_order: str | None,
+    ) -> list[str]:
+        # For node_order == "edge_encounter": order node ids by their first
+        # appearance as an edge subject/object, then append nodes never
+        # referenced by any edge, keeping those in original file order.
+        original_ids = [record["id"] for record in node_records]
+        if node_order != "edge_encounter":
+            return original_ids
+
+        seen: set[str] = set()
+        ordered_ids: list[str] = []
+        with edges_path.open("r") as handle:
+            for line in handle:
+                edge = json.loads(line)
+                # NOTE(review): ids collected here are later used to index the
+                # node map; an edge endpoint with no node record would raise
+                # KeyError upstream -- confirm emitters guarantee coverage.
+                for key in ("subject", "object"):
+                    node_id = edge.get(key)
+                    if node_id in seen:
+                        continue
+                    seen.add(node_id)
+                    ordered_ids.append(node_id)
+
+        for node_id in original_ids:
+            if node_id not in seen:
+                ordered_ids.append(node_id)
+        return ordered_ids
+
+    def _rewrite_metadata(self, metadata: dict[str, object]) -> dict[str, object]:
+        # Apply the optional output.metadata spec to the load metadata:
+        # drop listed keys, then set new keys (a string value that names an
+        # existing metadata key copies that key's value instead).
+        output_spec = self.parser_spec.output
+        metadata_spec = output_spec.get("metadata") if output_spec else None
+        if not metadata_spec:
+            return metadata
+
+        rewritten = dict(metadata)
+        for key in metadata_spec.get("drop", []):
+            rewritten.pop(key, None)
+
+        for key, value in metadata_spec.get("set", {}).items():
+            if isinstance(value, str) and
value in metadata: + rewritten[key] = metadata[value] + else: + rewritten[key] = value + return rewritten + + +class _ZipTextStream: + def __init__(self, zip_file: ZipFile, member_name: str): + self._zip_file = zip_file + self._member_name = member_name + self._raw_stream = None + self._text_stream = None + + def __enter__(self): + self._raw_stream = self._zip_file.open(self._member_name, "r") + self._text_stream = TextIOWrapper(self._raw_stream, "utf-8") + return self._text_stream + + def __exit__(self, exc_type, exc_value, traceback): + if self._text_stream is not None: + self._text_stream.close() + if self._raw_stream is not None: + self._raw_stream.close() + self._zip_file.close() diff --git a/orion/parser_spec.py b/orion/parser_spec.py new file mode 100644 index 00000000..1c7a6ff2 --- /dev/null +++ b/orion/parser_spec.py @@ -0,0 +1,377 @@ +from __future__ import annotations + +import json +import os +from dataclasses import dataclass, field +from typing import Any + +import yaml + +from orion.croissant_resolver import CroissantResolver +from orion.semantic_table import ( + SchemaField, + infer_expression_schema, + iter_expression_references, + parse_expand_spec, + resolve_schema_path, + schema_for_field_spec, +) + + +@dataclass(frozen=True) +class SourceBindingSpec: + croissant_path: str | None = None + croissant_url: str | None = None + dataset_id: str | None = None + version_from: str = "dataset.version" + distribution: str = "" + record_set: str = "" + format: str = "tsv" + header: bool = True + delimiter: str = "\t" + quotechar: str = '"' + compression: str = "auto" + archive_member: str | None = None + member_pattern: str | None = None + test_mode_limit: int | None = None + + +@dataclass(frozen=True) +class GraphProjectionSpec: + nodes: list[dict[str, Any]] = field(default_factory=list) + edges: list[dict[str, Any]] = field(default_factory=list) + + +@dataclass(frozen=True) +class ParserSpec: + source_id: str + provenance_id: str + parsing_version: str + 
source: SourceBindingSpec + fields: dict[str, dict[str, Any]] + views: dict[str, dict[str, Any]] + graph: GraphProjectionSpec + output: dict[str, Any] = field(default_factory=dict) + where: list[dict[str, Any]] = field(default_factory=list) + description: str = "" + preserve_unconnected_nodes: bool = False + has_sequence_variants: bool = False + spec_path: str | None = None + + def get_croissant_resolver(self) -> CroissantResolver: + if self.source.croissant_path: + return CroissantResolver.from_path(self.source.croissant_path) + if self.source.croissant_url: + return CroissantResolver.from_url(self.source.croissant_url) + raise ValueError("Parser spec must provide from.croissant or from.croissant_url.") + + def validate_against(self, resolver: CroissantResolver) -> None: + if self.source.dataset_id and resolver.dataset_id != self.source.dataset_id: + raise ValueError( + f"Croissant dataset ID mismatch. Expected {self.source.dataset_id}, " + f"found {resolver.dataset_id}." + ) + + resolver.get_distribution(self.source.distribution) + record_set = resolver.get_record_set(self.source.record_set) + if not record_set.fields_by_name: + raise ValueError(f"Croissant record set {self.source.record_set} does not define any fields.") + + if self.source.format.lower() not in {"csv", "tsv"}: + raise ValueError(f"Unsupported semantic-table input format: {self.source.format}") + + if not self.source.header: + raise ValueError("Semantic-table tabular parsing currently requires header=True.") + + if not self.graph.nodes and not self.graph.edges: + raise ValueError("Parser spec must define at least one node or edge projection.") + + record_set_fields = set(record_set.fields_by_name.keys()) + base_schema = self._validate_fields(record_set_fields) + self._validate_filters(self.where, base_schema) + view_schemas = self._validate_views(base_schema) + self._validate_graph(base_schema, view_schemas) + + def _validate_fields(self, record_set_fields: set[str]) -> dict[str, SchemaField]: + 
base_schema: dict[str, SchemaField] = {} + for field_name, field_spec in self.fields.items(): + kind = field_spec.get("kind", "property") + if kind in {"identifier", "optional_identifier", "label", "property", "list"}: + column = field_spec.get("column") + if not column: + raise ValueError(f"Field '{field_name}' is missing required 'column'.") + if column not in record_set_fields: + raise ValueError( + f"Field '{field_name}' references unknown Croissant column '{column}'." + ) + elif kind == "zipped_list": + columns = field_spec.get("columns", {}) + if not columns: + raise ValueError(f"Field '{field_name}' must declare zipped_list columns.") + for sub_name, sub_spec in columns.items(): + column = sub_spec.get("column") + if not column: + raise ValueError( + f"Field '{field_name}.{sub_name}' is missing required 'column'." + ) + if column not in record_set_fields: + raise ValueError( + f"Field '{field_name}.{sub_name}' references unknown Croissant column '{column}'." + ) + elif kind == "value_columns": + columns = field_spec.get("columns", {}) + if not columns: + raise ValueError(f"Field '{field_name}' must declare value_columns entries.") + for column_name in columns.keys(): + if column_name not in record_set_fields: + raise ValueError( + f"Field '{field_name}' references unknown Croissant column '{column_name}'." 
+ ) + else: + raise ValueError(f"Unsupported field kind '{kind}' for field '{field_name}'.") + + base_schema[field_name] = schema_for_field_spec(field_spec) + return base_schema + + @staticmethod + def _validate_filters(filter_specs: list[dict[str, Any]], schema: dict[str, SchemaField]) -> None: + for filter_spec in filter_specs: + if "exists" in filter_spec: + resolve_schema_path(schema, filter_spec["exists"]) + continue + if "not_exists" in filter_spec: + resolve_schema_path(schema, filter_spec["not_exists"]) + continue + if "non_empty" in filter_spec: + resolve_schema_path(schema, filter_spec["non_empty"]) + continue + if "equals" in filter_spec or "not_equals" in filter_spec: + condition = filter_spec.get("equals") or filter_spec.get("not_equals") + resolve_schema_path(schema, condition["field"]) + for reference in iter_expression_references(condition["value"]): + resolve_schema_path(schema, reference) + continue + raise ValueError(f"Unsupported filter specification: {filter_spec}") + + def _validate_views(self, base_schema: dict[str, SchemaField]) -> dict[str, dict[str, SchemaField]]: + view_schemas: dict[str, dict[str, SchemaField]] = {} + + for view_name, view_spec in self.views.items(): + parent_name = view_spec.get("from") + if not parent_name: + raise ValueError(f"View '{view_name}' is missing required 'from'.") + + if parent_name == "source": + parent_schema = base_schema + elif parent_name in view_schemas: + parent_schema = view_schemas[parent_name] + else: + raise ValueError( + f"View '{view_name}' references unknown or not-yet-defined parent '{parent_name}'." 
+ ) + + self._validate_filters(view_spec.get("where", []), parent_schema) + context_schema = dict(parent_schema) + + expand_count = int("unnest" in view_spec) + int("unpivot" in view_spec) + if expand_count > 1: + raise ValueError(f"View '{view_name}' may define at most one of unnest or unpivot.") + + if "unnest" in view_spec: + field_name, alias = parse_expand_spec(view_spec["unnest"], "unnest") + field_schema = parent_schema.get(field_name) + if field_schema is None or field_schema.kind not in {"list", "record_list"}: + raise ValueError( + f"View '{view_name}' can only unnest a list or record_list field; got '{field_name}'." + ) + context_schema[alias] = ( + SchemaField(kind="scalar") + if field_schema.kind == "list" + else SchemaField(kind="record", children=field_schema.children) + ) + + if "unpivot" in view_spec: + field_name, alias = parse_expand_spec(view_spec["unpivot"], "unpivot") + field_schema = parent_schema.get(field_name) + if field_schema is None or field_schema.kind != "record_list": + raise ValueError( + f"View '{view_name}' can only unpivot a record_list field; got '{field_name}'." 
+ ) + context_schema[alias] = SchemaField(kind="record", children=field_schema.children) + + if "group_by" in view_spec: + output_schema: dict[str, SchemaField] = {} + for group_name, group_expr in view_spec["group_by"].items(): + for reference in iter_expression_references(group_expr): + resolve_schema_path(context_schema, reference) + output_schema[group_name] = infer_expression_schema(group_expr, context_schema) + + for aggregate_name, aggregate_spec in view_spec.get("aggregates", {}).items(): + op_name, aggregate_expr = self._parse_aggregate_spec(view_name, aggregate_name, aggregate_spec) + when_expr = aggregate_spec.get("when") + if when_expr is not None: + for reference in iter_expression_references(when_expr): + resolve_schema_path(context_schema, reference) + if aggregate_expr is not None: + for reference in iter_expression_references(aggregate_expr): + resolve_schema_path(context_schema, reference) + output_schema[aggregate_name] = ( + SchemaField(kind="list") + if op_name in {"list", "unique"} + else SchemaField(kind="scalar") + ) + + for let_name, let_expr in view_spec.get("let", {}).items(): + for reference in iter_expression_references(let_expr): + resolve_schema_path(output_schema, reference) + output_schema[let_name] = infer_expression_schema(let_expr, output_schema) + else: + output_schema = dict(parent_schema) + if "select" in view_spec: + output_schema = {} + for output_name, output_expr in view_spec["select"].items(): + for reference in iter_expression_references(output_expr): + resolve_schema_path(context_schema, reference) + output_schema[output_name] = infer_expression_schema(output_expr, context_schema) + + self._validate_filters(view_spec.get("having", []), output_schema) + view_schemas[view_name] = output_schema + + return view_schemas + + def _validate_graph( + self, + base_schema: dict[str, SchemaField], + view_schemas: dict[str, dict[str, SchemaField]], + ) -> None: + for node_rule in self.graph.nodes: + rule_source = 
node_rule.get("from") + if rule_source == "source": + rule_schema = base_schema + elif rule_source in view_schemas: + rule_schema = view_schemas[rule_source] + else: + raise ValueError(f"Node projection references unknown row source '{rule_source}'.") + + self._validate_graph_rule(node_rule, rule_schema, required_fields=("id",)) + + for edge_rule in self.graph.edges: + rule_source = edge_rule.get("from") + if rule_source == "source": + rule_schema = base_schema + elif rule_source in view_schemas: + rule_schema = view_schemas[rule_source] + else: + raise ValueError(f"Edge projection references unknown row source '{rule_source}'.") + + self._validate_graph_rule(edge_rule, rule_schema, required_fields=("subject", "predicate", "object")) + + def _validate_graph_rule( + self, + rule: dict[str, Any], + schema: dict[str, SchemaField], + required_fields: tuple[str, ...], + ) -> None: + for required_field in required_fields: + if required_field not in rule: + raise ValueError(f"Graph projection is missing required field '{required_field}'.") + + for field_name in required_fields + ("name", "category", "categories", "primary_knowledge_source"): + if field_name in rule: + for reference in iter_expression_references(rule[field_name]): + resolve_schema_path(schema, reference) + + for property_spec in rule.get("props", {}).values(): + property_expr = property_spec.get("value") if isinstance(property_spec, dict) else property_spec + for reference in iter_expression_references(property_expr): + resolve_schema_path(schema, reference) + + @staticmethod + def _parse_aggregate_spec( + view_name: str, + aggregate_name: str, + aggregate_spec: dict[str, Any], + ) -> tuple[str, Any]: + supported_ops = {"list", "unique", "count", "mean", "first_non_null"} + matching_ops = [name for name in supported_ops if name in aggregate_spec] + if len(matching_ops) != 1: + raise ValueError( + f"Aggregate '{aggregate_name}' in view '{view_name}' must define exactly one " + f"supported aggregate 
operator: {sorted(supported_ops)}" + ) + op_name = matching_ops[0] + return op_name, aggregate_spec[op_name] + + def get_source_version(self, resolver: CroissantResolver) -> str: + version_from = self.source.version_from + if version_from == "dataset.version": + if resolver.dataset_version: + return resolver.dataset_version + elif version_from == "dataset.dateModified": + if resolver.dataset_modified: + return resolver.dataset_modified + else: + raise ValueError(f"Unsupported from.version_from value: {version_from}") + + raise ValueError( + f"Could not derive a source version from '{version_from}' for source {self.source_id}." + ) + + +def _load_serialized_document(path: str) -> dict[str, Any]: + with open(path, "r") as handle: + if path.endswith((".yaml", ".yml")): + loaded = yaml.safe_load(handle) + elif path.endswith(".json"): + loaded = json.load(handle) + else: + raise ValueError(f"Unsupported parser spec file extension: {path}") + + if not isinstance(loaded, dict): + raise ValueError(f"Parser spec must deserialize to a mapping: {path}") + return loaded + + +def _resolve_relative_path(base_path: str, relative_or_absolute_path: str) -> str: + if os.path.isabs(relative_or_absolute_path): + return relative_or_absolute_path + return os.path.abspath(os.path.join(os.path.dirname(base_path), relative_or_absolute_path)) + + +def load_parser_spec(path: str) -> ParserSpec: + spec_path = os.path.abspath(path) + document = _load_serialized_document(spec_path) + + for required_key in ("source_id", "provenance_id", "parsing_version", "from", "fields", "graph"): + if required_key not in document: + raise ValueError(f"Parser spec is missing required key '{required_key}': {spec_path}") + + source_doc = dict(document["from"]) + croissant_path = source_doc.pop("croissant", None) + croissant_url = source_doc.pop("croissant_url", None) + if croissant_path: + croissant_path = _resolve_relative_path(spec_path, croissant_path) + + spec = ParserSpec( + source_id=document["source_id"], 
+ provenance_id=document["provenance_id"], + parsing_version=str(document["parsing_version"]), + source=SourceBindingSpec( + croissant_path=croissant_path, + croissant_url=croissant_url, + **source_doc, + ), + fields=dict(document.get("fields", {})), + views=dict(document.get("views", {})), + graph=GraphProjectionSpec(**document["graph"]), + output=dict(document.get("output", {})), + where=list(document.get("where", [])), + description=document.get("description", ""), + preserve_unconnected_nodes=bool(document.get("preserve_unconnected_nodes", False)), + has_sequence_variants=bool(document.get("has_sequence_variants", False)), + spec_path=spec_path, + ) + + resolver = spec.get_croissant_resolver() + spec.validate_against(resolver) + return spec diff --git a/orion/semantic_table.py b/orion/semantic_table.py new file mode 100644 index 00000000..b7673c0b --- /dev/null +++ b/orion/semantic_table.py @@ -0,0 +1,361 @@ +from __future__ import annotations + +import math +import re +from dataclasses import dataclass, field +from typing import Any, Iterable + + +_SINGLE_REFERENCE_PATTERN = re.compile(r"^\$([A-Za-z_][\w.]*)$") +_TEMPLATE_REFERENCE_PATTERN = re.compile(r"\$\{([A-Za-z_][\w.]*)\}") +_EXPAND_PATTERN = re.compile(r"^\s*([A-Za-z_][\w]*)\s+as\s+([A-Za-z_][\w]*)\s*$") + +_SCALAR_FIELD_KINDS = { + "identifier", + "optional_identifier", + "label", + "property", +} + + +@dataclass(frozen=True) +class SchemaField: + kind: str + children: dict[str, "SchemaField"] = field(default_factory=dict) + + +def is_missing(value: Any) -> bool: + if value is None: + return True + if isinstance(value, str) and value == "": + return True + if isinstance(value, (list, tuple, dict, set)) and len(value) == 0: + return True + return False + + +def apply_prefix(value: Any, prefix: str | None) -> Any: + if prefix is None or is_missing(value): + return value + normalized_prefix = str(prefix).rstrip(":") + normalized = str(value).strip() + if normalized.startswith(f"{normalized_prefix}:"): + 
return normalized + return f"{normalized_prefix}:{normalized}" + + +def normalize_atomic_value(raw_value: Any, spec: dict[str, Any]) -> Any: + if raw_value is None: + return None + + trim = spec.get("trim", True) + value = str(raw_value).strip() if trim else str(raw_value) + if value == "": + return "" if spec.get("preserve_empty", False) else None + + kind = spec.get("kind", "property") + if kind in {"identifier", "optional_identifier"}: + return apply_prefix(value, spec.get("prefix")) + return value + + +def _split_parts(raw_value: Any, separator: str, trim: bool = True) -> list[str]: + if is_missing(raw_value): + return [] + parts = [] + for part in str(raw_value).split(separator): + normalized = part.strip() if trim else part + if normalized == "": + continue + parts.append(normalized) + return parts + + +def normalize_field(raw_row: dict[str, Any], field_spec: dict[str, Any]) -> Any: + kind = field_spec.get("kind", "property") + + if kind in _SCALAR_FIELD_KINDS: + return normalize_atomic_value(raw_row.get(field_spec["column"]), field_spec) + + if kind == "list": + parts = _split_parts( + raw_row.get(field_spec["column"]), + field_spec.get("separator", "|"), + trim=field_spec.get("trim", True), + ) + prefix = field_spec.get("prefix") + if prefix is not None: + return [apply_prefix(part, prefix) for part in parts] + return parts + + if kind == "zipped_list": + separator = field_spec.get("separator", "|") + sub_specs = field_spec.get("columns", {}) + split_values: dict[str, list[str]] = {} + for sub_name, sub_spec in sub_specs.items(): + split_values[sub_name] = _split_parts( + raw_row.get(sub_spec["column"]), + separator, + trim=sub_spec.get("trim", True), + ) + + lengths = {len(values) for values in split_values.values()} + if not lengths or lengths == {0}: + return [] + if len(lengths) != 1: + raise ValueError( + f"zipped_list field has mismatched lengths for columns " + f"{list(sub_specs.keys())}: {sorted(lengths)}" + ) + + row_count = next(iter(lengths)) + 
records: list[dict[str, Any]] = [] + for index in range(row_count): + record: dict[str, Any] = {} + for sub_name, sub_spec in sub_specs.items(): + record[sub_name] = normalize_atomic_value(split_values[sub_name][index], sub_spec) + records.append(record) + return records + + if kind == "value_columns": + values: list[dict[str, Any]] = [] + for column_name, column_spec in field_spec.get("columns", {}).items(): + raw_value = raw_row.get(column_name) + if is_missing(raw_value): + continue + normalized_value = str(raw_value).strip() + if normalized_value == "": + continue + + item = { + "value": normalized_value, + "column": column_name, + } + for key, value in column_spec.items(): + item[key] = value + values.append(item) + return values + + raise ValueError(f"Unsupported semantic field kind: {kind}") + + +def resolve_reference(data: dict[str, Any], path: str) -> Any: + current: Any = data + for part in path.split("."): + if isinstance(current, dict): + if part not in current: + return None + current = current[part] + continue + if isinstance(current, (list, tuple)) and part.isdigit(): + current = current[int(part)] + continue + return None + return current + + +def render_template(template: str, row: dict[str, Any]) -> str: + def replace(match: re.Match[str]) -> str: + value = resolve_reference(row, match.group(1)) + return "" if value is None else str(value) + + return _TEMPLATE_REFERENCE_PATTERN.sub(replace, template) + + +def _evaluate_parse_qualified_float(spec: dict[str, Any], row: dict[str, Any]) -> float | None: + value = evaluate_expression(spec["value"], row) + if is_missing(value): + return None + + if isinstance(value, (int, float)): + parsed_value = float(value) + else: + normalized = str(value).strip().replace(",", "") + for operator in spec.get("reject_operators", []): + if normalized.startswith(operator): + return None + for operator in spec.get("strip_operators", ["<"]): + if normalized.startswith(operator): + normalized = normalized[len(operator):] + 
parsed_value = float(normalized) + + minimum_exclusive = spec.get("minimum_exclusive") + if minimum_exclusive is not None and parsed_value <= float(minimum_exclusive): + return None + + minimum_inclusive = spec.get("minimum_inclusive") + if minimum_inclusive is not None and parsed_value < float(minimum_inclusive): + return None + + return parsed_value + + +def _evaluate_neglog10_nm(spec: dict[str, Any], row: dict[str, Any]) -> float | None: + value = evaluate_expression(spec["value"], row) + if is_missing(value): + return None + + numeric_value = float(value) + if numeric_value <= 0: + return None + + transformed = -(math.log10(numeric_value * (10 ** -9))) + if "precision" in spec: + return round(transformed, int(spec["precision"])) + return transformed + + +def evaluate_expression(spec: Any, row: dict[str, Any]) -> Any: + if spec is None: + return None + + if isinstance(spec, list): + return [evaluate_expression(item, row) for item in spec] + + if isinstance(spec, str): + reference_match = _SINGLE_REFERENCE_PATTERN.match(spec) + if reference_match: + return resolve_reference(row, reference_match.group(1)) + if "${" in spec: + return render_template(spec, row) + return spec + + if isinstance(spec, dict): + if "parse_qualified_float" in spec: + return _evaluate_parse_qualified_float(spec["parse_qualified_float"], row) + if "neglog10_nm" in spec: + return _evaluate_neglog10_nm(spec["neglog10_nm"], row) + return { + key: evaluate_expression(value, row) + for key, value in spec.items() + } + + return spec + + +def matches_filter(filter_spec: dict[str, Any], row: dict[str, Any]) -> bool: + if "exists" in filter_spec: + return not is_missing(resolve_reference(row, filter_spec["exists"])) + + if "not_exists" in filter_spec: + return is_missing(resolve_reference(row, filter_spec["not_exists"])) + + if "equals" in filter_spec: + condition = filter_spec["equals"] + return evaluate_expression(condition["value"], row) == resolve_reference(row, condition["field"]) + + if 
"not_equals" in filter_spec: + condition = filter_spec["not_equals"] + return evaluate_expression(condition["value"], row) != resolve_reference(row, condition["field"]) + + if "non_empty" in filter_spec: + value = resolve_reference(row, filter_spec["non_empty"]) + return not is_missing(value) + + raise ValueError(f"Unsupported semantic-table filter: {filter_spec}") + + +def matches_filters(filter_specs: Iterable[dict[str, Any]], row: dict[str, Any]) -> bool: + return all(matches_filter(filter_spec, row) for filter_spec in filter_specs) + + +def parse_expand_spec(spec: str | dict[str, Any] | None, operation_name: str) -> tuple[str, str] | None: + if spec is None: + return None + + if isinstance(spec, str): + match = _EXPAND_PATTERN.match(spec) + if not match: + raise ValueError( + f"{operation_name} must use ' as ' syntax or a mapping with field/as keys." + ) + return match.group(1), match.group(2) + + if isinstance(spec, dict): + return spec["field"], spec["as"] + + raise ValueError(f"Unsupported {operation_name} specification: {spec!r}") + + +def iter_expression_references(spec: Any) -> Iterable[str]: + if spec is None: + return + + if isinstance(spec, str): + reference_match = _SINGLE_REFERENCE_PATTERN.match(spec) + if reference_match: + yield reference_match.group(1) + for match in _TEMPLATE_REFERENCE_PATTERN.finditer(spec): + yield match.group(1) + return + + if isinstance(spec, list): + for item in spec: + yield from iter_expression_references(item) + return + + if isinstance(spec, dict): + for value in spec.values(): + yield from iter_expression_references(value) + + +def schema_for_field_spec(field_spec: dict[str, Any]) -> SchemaField: + kind = field_spec.get("kind", "property") + if kind in _SCALAR_FIELD_KINDS: + return SchemaField(kind="scalar") + + if kind == "list": + return SchemaField(kind="list") + + if kind == "zipped_list": + return SchemaField( + kind="record_list", + children={ + name: SchemaField(kind="scalar") + for name in 
field_spec.get("columns", {}).keys() + }, + ) + + if kind == "value_columns": + child_names = {"value", "column"} + for column_spec in field_spec.get("columns", {}).values(): + child_names.update(column_spec.keys()) + return SchemaField( + kind="record_list", + children={name: SchemaField(kind="scalar") for name in sorted(child_names)}, + ) + + raise ValueError(f"Unsupported semantic field kind: {kind}") + + +def resolve_schema_path(schema: dict[str, SchemaField], path: str) -> SchemaField: + current = schema.get(path.split(".")[0]) + if current is None: + raise KeyError(f"Unknown reference '{path}'") + + for part in path.split(".")[1:]: + if part not in current.children: + raise KeyError(f"Unknown reference '{path}'") + current = current.children[part] + + return current + + +def infer_expression_schema(spec: Any, schema: dict[str, SchemaField]) -> SchemaField: + if isinstance(spec, str): + reference_match = _SINGLE_REFERENCE_PATTERN.match(spec) + if reference_match: + return resolve_schema_path(schema, reference_match.group(1)) + if "${" in spec: + return SchemaField(kind="scalar") + return SchemaField(kind="scalar") + + if isinstance(spec, list): + return SchemaField(kind="list") + + if isinstance(spec, dict): + if "parse_qualified_float" in spec or "neglog10_nm" in spec: + return SchemaField(kind="scalar") + return SchemaField(kind="scalar") + + return SchemaField(kind="scalar") diff --git a/parser_specs/BINDING-DB/bindingdb_croissant.json b/parser_specs/BINDING-DB/bindingdb_croissant.json new file mode 100644 index 00000000..b9431093 --- /dev/null +++ b/parser_specs/BINDING-DB/bindingdb_croissant.json @@ -0,0 +1,2255 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "sc": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "dct": "http://purl.org/dc/terms/", + "annotation": "cr:annotation", + "arrayShape": "cr:arrayShape", + "citeAs": "cr:citeAs", + "column": 
"cr:column", + "conformsTo": "dct:conformsTo", + "containedIn": "cr:containedIn", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "equivalentProperty": "cr:equivalentProperty", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "excludes": "cr:excludes", + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isArray": "cr:isArray", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "readLines": "cr:readLines", + "sdVersion": "cr:sdVersion", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "unArchive": "cr:unArchive", + "value": "cr:value" + }, + "@type": "https://schema.org/Dataset", + "@id": "bindingdb", + "conformsTo": "http://mlcommons.org/croissant/1.1", + "name": "BindingDB", + "description": "BindingDB is a public, web-accessible database of measured binding affinities, focusing chiefly on the interactions of proteins considered to be candidate drug-targets with ligands that are small, drug-like molecules. BindingDB supports medicinal chemistry and drug discovery via literature awareness and development of structure-activity relations (SAR and QSAR); validation of computational chemistry and molecular modeling approaches such as docking, scoring and free energy methods; chemical biology and chemical genomics; and basic studies of the physical chemistry of molecular recognition. 
BindingDB also includes a small collection of host-guest binding data of interest to chemists studying supramolecular systems.", + "url": "https://www.bindingdb.org/rwd/bind/index.jsp", + "identifier": "https://fairsharing.org/FAIRsharing.3b36hk", + "version": "202603", + "license": "https://creativecommons.org/licenses/by/3.0/us/", + "conditionsOfAccess": "All data curated by BindingDB staff are provided under the Creative Commons Attribution 3.0 License. Data imported from ChEMBL are provided under their Creative Commons Attribution-Share Alike 3.0 Unported License.", + "isAccessibleForFree": true, + "isLiveDataset": true, + "keywords": [ + "protein-ligand binding", + "binding affinity", + "drug discovery", + "pharmacology", + "medicinal chemistry", + "small molecules", + "enzyme inhibition", + "isothermal titration calorimetry", + "molecular recognition", + "structure-activity relations", + "drug targets", + "cheminformatics" + ], + "inLanguage": "en", + "dateModified": "2026-03-01", + "citeAs": "Liu,T., Hwang, L., Burley,S.K., Nitsche,C.I., Southan,C., Walters,W.P., and Gilson,M.K. BindingDB in 2024: a FAIR knowledgebase of protein-small molecule binding data Nucleic Acids Research 53:D1633-D1644 (2025).", + "creator": { + "@type": "https://schema.org/Organization", + "name": "Skaggs School of Pharmacy and Pharmaceutical Sciences", + "url": "https://pharmacy.ucsd.edu/", + "parentOrganization": { + "@type": "https://schema.org/Organization", + "name": "University of California San Diego", + "url": "https://ucsd.edu/" + } + }, + "funder": { + "@type": "https://schema.org/Organization", + "name": "National Institute of General Medical Sciences, National Institutes of Health" + }, + "maintainer": { + "@type": "https://schema.org/Person", + "name": "Michael K. 
Gilson", + "email": "mgilson@health.ucsd.edu" + }, + "relatedLink": "bindingdb_apis.json", + "distribution": [ + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/all_tsv_zip", + "name": "all_tsv_zip", + "description": "All binding data in BindingDB as a tab-separated values file in a ZIP archive (3,205,717 measurements, 1,409,251 compounds, 11,414 targets; 525.54 MB, updated 2026-03-01).", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_All_202603_tsv.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "1204a116ba13487e3882b7d066b7362e" + }, + { + "@type": "http://mlcommons.org/croissant/FileSet", + "@id": "bindingdb/all_tsv_fileset", + "name": "all_tsv_fileset", + "description": "TSV file extracted from BindingDB_All_202603_tsv.zip.", + "containedIn": [ + { + "@id": "bindingdb/all_tsv_zip" + } + ], + "encodingFormat": "text/tab-separated-values", + "includes": "*.tsv" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/articles_tsv_zip", + "name": "articles_tsv_zip", + "description": "Only data curated from articles by BindingDB (17.82 MB, updated 2026-03-01). Same column schema as all_tsv_zip.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_BindingDB_Articles_202603_tsv.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "fad6e9fa50958f366c196e7322ed9ebc" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/chembl_tsv_zip", + "name": "chembl_tsv_zip", + "description": "Only data in BindingDB drawn from ChEMBL (326.48 MB, updated 2026-03-01). Licensed under CC BY-SA 3.0. 
Same column schema as all_tsv_zip.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_ChEMBL_202603_tsv.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "451449286a3f037044e023bdad401f70" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/patents_tsv_zip", + "name": "patents_tsv_zip", + "description": "Only data in BindingDB drawn from patents (156.43 MB, updated 2026-03-01). Same column schema as all_tsv_zip.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_Patents_202603_tsv.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "64cf0045b4b4882a31de5bb46bdeb4de" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/pubchem_tsv_zip", + "name": "pubchem_tsv_zip", + "description": "Only data in BindingDB drawn from PubChem (25.40 MB, updated 2026-03-01). Same column schema as all_tsv_zip.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_PubChem_202603_tsv.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "ecd82425844a5dc0bccc57d3443849c0" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/pdspki_tsv_zip", + "name": "pdspki_tsv_zip", + "description": "Only data in BindingDB drawn from PDSP Ki (5.32 MB, updated 2026-03-01). Same column schema as all_tsv_zip.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_PDSPKi_202603_tsv.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "6e12b9285b91dd708acbebe6ca922991" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/covid19_tsv_zip", + "name": "covid19_tsv_zip", + "description": "Covid-19 data in BindingDB (9.30 MB, updated 2026-03-01). 
Same column schema as all_tsv_zip.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_Covid-19_202603_tsv.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "8f7633f377545d54d1d1f49730f47600" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/csar_tsv_zip", + "name": "csar_tsv_zip", + "description": "Only CSAR data in BindingDB (130.62 KB, updated 2026-03-01). Same column schema as all_tsv_zip.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_CSAR_202603_tsv.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "6804e39d49c5d5d55c8ecb650c723996" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/itc_tsv_zip", + "name": "itc_tsv_zip", + "description": "Only isothermal titration calorimetry (ITC) data in BindingDB, including host-guest systems (141.62 KB, updated 2026-03-01). Same column schema as all_tsv_zip.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_ITC_202603_tsv.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "5fff9d78db212aead3681793a7cd5220" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/ptaylorlab_tsv_zip", + "name": "ptaylorlab_tsv_zip", + "description": "Only data directly provided by Prof. Palmer Taylor, UCSD (5.68 KB, updated 2026-03-01). Same column schema as all_tsv_zip.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_PTaylorLab_UCSD_202603_tsv.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "59c45a10da114c9ea51fbbb56de6488c" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/all_2d_sdf_zip", + "name": "all_2d_sdf_zip", + "description": "All binding data with 2D compound structures (1.39 GB, updated 2026-03-01). 
SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_All_2D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "a5cb47aec03207b616826acad295267b" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/all_3d_sdf_zip", + "name": "all_3d_sdf_zip", + "description": "All binding data with 3D compound structures computed with Vconf (2.81 GB, updated 2026-03-01). SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_All_3D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "8554a03d828da3f7b51a6b210f533f91" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/articles_2d_sdf_zip", + "name": "articles_2d_sdf_zip", + "description": "BindingDB-curated articles data with 2D structures (35.17 MB, updated 2026-03-01). SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_BindingDB_Articles_2D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "5cbae16480c7426577d6936b215d9634" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/articles_3d_sdf_zip", + "name": "articles_3d_sdf_zip", + "description": "BindingDB-curated articles data with 3D structures (87.61 MB, updated 2026-03-01). 
SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_BindingDB_Articles_3D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "ffd132feef8be47cbebb178046659c31" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/chembl_2d_sdf_zip", + "name": "chembl_2d_sdf_zip", + "description": "ChEMBL-sourced data with 2D structures (885.12 MB, updated 2026-03-01). Licensed CC BY-SA 3.0. SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_ChEMBL_2D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "99f0b35a890e72eb981baafc4f4c89db" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/chembl_3d_sdf_zip", + "name": "chembl_3d_sdf_zip", + "description": "ChEMBL-sourced data with 3D structures (1.62 GB, updated 2026-03-01). Licensed CC BY-SA 3.0. SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_ChEMBL_3D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "39377d58af5ecb4b7f7f022e181acd10" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/patents_2d_sdf_zip", + "name": "patents_2d_sdf_zip", + "description": "Patent-sourced data with 2D structures (447.06 MB, updated 2026-03-01). 
SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_Patents_2D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "5530b733a2b776fbab0c7f3ff082055f" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/patents_3d_sdf_zip", + "name": "patents_3d_sdf_zip", + "description": "Patent-sourced data with 3D structures (1.03 GB, updated 2026-03-01). SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_Patents_3D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "0161869bfc3751fcfbb340fdf1ebe027" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/pubchem_2d_sdf_zip", + "name": "pubchem_2d_sdf_zip", + "description": "PubChem-sourced data with 2D structures (62.53 MB, updated 2026-03-01). SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_PubChem_2D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "1eafc863eb86ed8d15c17162809efb70" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/pubchem_3d_sdf_zip", + "name": "pubchem_3d_sdf_zip", + "description": "PubChem-sourced data with 3D structures (107.41 MB, updated 2026-03-01). 
SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_PubChem_3D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "e1a072abed07dfdd720a664e86c723ba" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/pdspki_2d_sdf_zip", + "name": "pdspki_2d_sdf_zip", + "description": "PDSP Ki data with 2D structures (15.40 MB, updated 2026-03-01). SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_PDSPKi_2D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "18a17efd66e58b757724cac50caea9c8" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/pdspki_3d_sdf_zip", + "name": "pdspki_3d_sdf_zip", + "description": "PDSP Ki data with 3D structures (31.70 MB, updated 2026-03-01). SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_PDSPKi_3D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "9c65d8d57c9bfa7bded5835fe9f0e685" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/covid19_2d_sdf_zip", + "name": "covid19_2d_sdf_zip", + "description": "Covid-19 data with 2D structures (17.95 MB, updated 2026-03-01). SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_Covid-19_2D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "8d54d897e1a5c1cf65fe874d400b0d5d" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/covid19_3d_sdf_zip", + "name": "covid19_3d_sdf_zip", + "description": "Covid-19 data with 3D structures (29.35 MB, updated 2026-03-01). 
SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_Covid-19_3D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "d365641daaedba61f2eab7939cba59aa" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/csar_2d_sdf_zip", + "name": "csar_2d_sdf_zip", + "description": "CSAR data with 2D structures (342.90 KB, updated 2026-03-01). SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_CSAR_2D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "1c0c81f04ac14d2db6fbb0018513007e" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/csar_3d_sdf_zip", + "name": "csar_3d_sdf_zip", + "description": "CSAR data with 3D structures (724.57 KB, updated 2026-03-01). SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_CSAR_3D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "eaa190e810835bc5f04ba84efa04746f" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/itc_2d_sdf_zip", + "name": "itc_2d_sdf_zip", + "description": "ITC data with 2D structures (286.85 KB, updated 2026-03-01). SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_ITC_2D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "862d9c948a6da516d7d75e5aec1d1266" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/itc_3d_sdf_zip", + "name": "itc_3d_sdf_zip", + "description": "ITC data with 3D structures (577.68 KB, updated 2026-03-01). 
SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_ITC_3D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "fd31c72567f84ba3890ab4028d7f45f8" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/hostguest_2d_sdf_zip", + "name": "hostguest_2d_sdf_zip", + "description": "Host-guest ITC data with 2D structures (161.46 KB, updated 2026-03-01). SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_HostGuest_2D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "e5d891ee7d761222b31f8d29142815d9" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/hostguest_3d_sdf_zip", + "name": "hostguest_3d_sdf_zip", + "description": "Host-guest ITC data with 3D structures (375.72 KB, updated 2026-03-01). SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_HostGuest_3D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "fee776f72066d40cc7aba22e6d4fdbd7" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/ptaylorlab_2d_sdf_zip", + "name": "ptaylorlab_2d_sdf_zip", + "description": "PTaylorLab UCSD data with 2D structures (7.59 KB, updated 2026-03-01). 
SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_PTaylorLab_UCSD_2D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "71ba5b34453ecf9113ceb73cd06feed2" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/ptaylorlab_3d_sdf_zip", + "name": "ptaylorlab_3d_sdf_zip", + "description": "PTaylorLab UCSD data with 3D structures (12.76 KB, updated 2026-03-01). SDF property schema documented in BindingDB-SDfile-Specification.pdf.", + "contentUrl": "https://www.bindingdb.org/rwd/bind/downloads/BindingDB_PTaylorLab_UCSD_3D_202603_sdf.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "5c93ef51419b21d669705fc00764367d" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/target_fasta", + "name": "target_fasta", + "description": "FASTA format protein sequences for all protein targets in BindingDB (7.23 MB, updated 2026-02-02).", + "contentUrl": "https://www.bindingdb.org/rwd/bind/BindingDBTargetSequences.fasta", + "encodingFormat": "text/x-fasta", + "md5": "af58431ae66d4925ee9a29b23bf77ee7" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/mysql_dump_zip", + "name": "mysql_dump_zip", + "description": "Full BindingDB database as a MySQL dump in a ZIP archive (276.16 MB, updated 2026-02-28).", + "contentUrl": "https://www.bindingdb.org/rwd/bind/BDB-mySQL_All_202603_dmp.zip", + "encodingFormat": "application/zip", + "version": "202603", + "md5": "ff6a5d861ff6ef6af954b5e26b2a1bf2" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/oracle_dump_zip", + "name": "oracle_dump_zip", + "description": "Full BindingDB database as an Oracle dump in a ZIP archive. 
Discontinued as of 2025 (837.50 MB, updated 2025-03-26).", + "contentUrl": "https://www.bindingdb.org/rwd/bind/BDB-Oracle_All_202502_dmp.zip", + "encodingFormat": "application/zip", + "version": "202502", + "md5": "fa9512f1d7da04f3c1fce2644974e5ed" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/cid_txt", + "name": "cid_txt", + "description": "Mapping of BindingDB monomer (compound) IDs to PubChem CIDs. Two tab-separated columns, no header row (22.87 MB, updated 2026-03-05).", + "contentUrl": "https://www.bindingdb.org/rwd/bind/BindingDB_CID.txt", + "encodingFormat": "text/tab-separated-values", + "md5": "fc6c56f4d4751b2eab11ef1c03c1f834" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/sid_txt", + "name": "sid_txt", + "description": "Mapping of BindingDB monomer (compound) IDs to PubChem SIDs. Two tab-separated columns, no header row (23.66 MB, updated 2026-03-05).", + "contentUrl": "https://www.bindingdb.org/rwd/bind/BindingDB_SID.txt", + "encodingFormat": "text/tab-separated-values", + "md5": "969015fb752c48a4232333fed1f2ea91" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/uniprot_txt", + "name": "uniprot_txt", + "description": "Mapping of BindingDB polymer (single protein) IDs to UniProt IDs. Tab-separated with header row (568.40 KB, updated 2026-03-05).", + "contentUrl": "https://www.bindingdb.org/rwd/bind/BindingDB_UniProt.txt", + "encodingFormat": "text/tab-separated-values", + "md5": "ed1acaeef8a126fc19573e0b4940b933" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/drugbank_txt", + "name": "drugbank_txt", + "description": "Mapping of BindingDB monomer (compound) IDs to DrugBank IDs. 
Two tab-separated columns, no header row (41.02 KB, updated 2026-03-05).", + "contentUrl": "https://www.bindingdb.org/rwd/bind/BindingDB_DrugBankID.txt", + "encodingFormat": "text/tab-separated-values", + "md5": "d212bc3dbc7c2350cf6d87245b9673c5" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "bindingdb/pubmed_txt", + "name": "pubmed_txt", + "description": "Collection of PubMed IDs in BindingDB. Single column, no header row (387.05 KB, updated 2026-03-08).", + "contentUrl": "https://www.bindingdb.org/rwd/bind/BindingDB_PubMed.txt", + "encodingFormat": "text/plain", + "md5": "95e6a7e9d57c087e3ca71e548df0c6a3" + } + ], + "recordSet": [ + { + "@type": "http://mlcommons.org/croissant/RecordSet", + "@id": "bindingdb/binding_data", + "name": "binding_data", + "description": "Binding affinity measurements from BindingDB. Each row represents one binding measurement with ligand, target, affinity value, and provenance. Schema sourced from header of BindingDB_All_202603_tsv.zip. 
All subset TSV files share this same column structure.", + "field": [ + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/reactant_set_id", + "name": "reactant_set_id", + "description": "Unique identifier for the binding measurement (Reactant Set ID).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "BindingDB Reactant_set_id" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/ligand_smiles", + "name": "ligand_smiles", + "description": "SMILES string representation of the ligand.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Ligand SMILES" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/ligand_inchi", + "name": "ligand_inchi", + "description": "InChI string of the ligand.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Ligand InChI" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/ligand_inchi_key", + "name": "ligand_inchi_key", + "description": "InChI Key of the ligand.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Ligand InChI Key" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/monomer_id", + "name": "monomer_id", + "description": "BindingDB internal monomer (compound) identifier.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "BindingDB MonomerID" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/ligand_name", + "name": "ligand_name", + "description": "Name of the ligand as recorded 
in BindingDB.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "BindingDB Ligand Name" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/target_name", + "name": "target_name", + "description": "Name of the protein target.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Target Name" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/target_source_organism", + "name": "target_source_organism", + "description": "Source organism of the target protein.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Target Source Organism According to Curator or DataSource" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/ki_nm", + "name": "ki_nm", + "description": "Inhibition constant Ki in nanomolar units.", + "dataType": "sc:Float", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Ki (nM)" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/ic50_nm", + "name": "ic50_nm", + "description": "Half-maximal inhibitory concentration IC50 in nanomolar units.", + "dataType": "sc:Float", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "IC50 (nM)" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/kd_nm", + "name": "kd_nm", + "description": "Dissociation constant Kd in nanomolar units.", + "dataType": "sc:Float", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Kd (nM)" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": 
"bindingdb/binding_data/ec50_nm", + "name": "ec50_nm", + "description": "Half-maximal effective concentration EC50 in nanomolar units.", + "dataType": "sc:Float", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "EC50 (nM)" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/kon", + "name": "kon", + "description": "Association rate constant kon in M-1 s-1.", + "dataType": "sc:Float", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "kon (M-1-s-1)" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/koff", + "name": "koff", + "description": "Dissociation rate constant koff in s-1.", + "dataType": "sc:Float", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "koff (s-1)" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/ph", + "name": "ph", + "description": "pH at which the measurement was made.", + "dataType": "sc:Float", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "pH" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/temp_c", + "name": "temp_c", + "description": "Temperature in Celsius at which the measurement was made.", + "dataType": "sc:Float", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Temp (C)" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/curation_datasource", + "name": "curation_datasource", + "description": "Source of the data: BindingDB curated, ChEMBL, PubChem, Patent, etc.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Curation/DataSource" + } + } + }, + { + "@type": 
"http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/article_doi", + "name": "article_doi", + "description": "DOI of the source article.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Article DOI" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/entry_doi", + "name": "entry_doi", + "description": "DOI assigned to this BindingDB entry.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "BindingDB Entry DOI" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/pmid", + "name": "pmid", + "description": "PubMed identifier of the source article.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "PMID" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/pubchem_aid", + "name": "pubchem_aid", + "description": "PubChem BioAssay identifier (AID).", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "PubChem AID" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/patent_number", + "name": "patent_number", + "description": "Patent number if the data source is a patent.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Patent Number" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/authors", + "name": "authors", + "description": "Authors of the source publication.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Authors" + } + } + }, + { + "@type": 
"http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/date_of_publication", + "name": "date_of_publication", + "description": "Publication date of the source article or patent.", + "dataType": "sc:Date", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Date of publication" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/date_in_bindingdb", + "name": "date_in_bindingdb", + "description": "Date this measurement was curated into BindingDB.", + "dataType": "sc:Date", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Date in BindingDB" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/institution", + "name": "institution", + "description": "Institution associated with the source publication.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Institution" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/link_to_ligand", + "name": "link_to_ligand", + "description": "URL to the ligand record in BindingDB.", + "dataType": "sc:URL", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Link to Ligand in BindingDB" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/link_to_target", + "name": "link_to_target", + "description": "URL to the target record in BindingDB.", + "dataType": "sc:URL", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Link to Target in BindingDB" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/link_to_pair", + "name": "link_to_pair", + "description": "URL to the ligand-target pair page in BindingDB.", + "dataType": 
"sc:URL", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Link to Ligand-Target Pair in BindingDB" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/ligand_het_id_pdb", + "name": "ligand_het_id_pdb", + "description": "PDB HET group identifier for the ligand.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Ligand HET ID in PDB" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/pdb_ids_complex", + "name": "pdb_ids_complex", + "description": "PDB structure IDs for the ligand-target complex (pipe-delimited if multiple).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "PDB ID(s) for Ligand-Target Complex" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/pubchem_cid", + "name": "pubchem_cid", + "description": "PubChem Compound identifier (CID) of the ligand.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "PubChem CID" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/pubchem_sid", + "name": "pubchem_sid", + "description": "PubChem Substance identifier (SID) of the ligand.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "PubChem SID" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chebi_id", + "name": "chebi_id", + "description": "ChEBI identifier of the ligand.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "ChEBI ID of Ligand" + } + } + }, + { + "@type": 
"http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chembl_id", + "name": "chembl_id", + "description": "ChEMBL identifier of the ligand.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "ChEMBL ID of Ligand" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/drugbank_id", + "name": "drugbank_id", + "description": "DrugBank identifier of the ligand.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "DrugBank ID of Ligand" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/iuphar_grac_id", + "name": "iuphar_grac_id", + "description": "IUPHAR/BPS Guide to Pharmacology identifier.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "IUPHAR_GRAC ID of Ligand" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/kegg_id", + "name": "kegg_id", + "description": "KEGG compound identifier of the ligand.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "KEGG ID of Ligand" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/zinc_id", + "name": "zinc_id", + "description": "ZINC compound identifier of the ligand.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "ZINC ID of Ligand" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/num_protein_chains", + "name": "num_protein_chains", + "description": "Number of protein chains in the target.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + 
"extract": { + "column": "Number of Protein Chains in Target (>1 implies a multichain complex)" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain1_sequence", + "name": "chain1_sequence", + "description": "BindingDB Target Chain Sequence 1 (chain 1 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "BindingDB Target Chain Sequence 1" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain1_pdb_ids", + "name": "chain1_pdb_ids", + "description": "PDB ID(s) of Target Chain 1 (chain 1 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "PDB ID(s) of Target Chain 1" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain1_swissprot_name", + "name": "chain1_swissprot_name", + "description": "UniProt (SwissProt) Recommended Name of Target Chain 1 (chain 1 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Recommended Name of Target Chain 1" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain1_swissprot_entry", + "name": "chain1_swissprot_entry", + "description": "UniProt (SwissProt) Entry Name of Target Chain 1 (chain 1 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Entry Name of Target Chain 1" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain1_swissprot_primary_id", + "name": "chain1_swissprot_primary_id", + "description": "UniProt (SwissProt) Primary ID of Target Chain 1 (chain 1 of up to 6).", + "dataType": "sc:Text", 
+ "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Primary ID of Target Chain 1" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain1_swissprot_sec_ids", + "name": "chain1_swissprot_sec_ids", + "description": "UniProt (SwissProt) Secondary ID(s) of Target Chain 1 (chain 1 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Secondary ID(s) of Target Chain 1" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain1_swissprot_alt_ids", + "name": "chain1_swissprot_alt_ids", + "description": "UniProt (SwissProt) Alternative ID(s) of Target Chain 1 (chain 1 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Alternative ID(s) of Target Chain 1" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain1_trembl_submitted", + "name": "chain1_trembl_submitted", + "description": "UniProt (TrEMBL) Submitted Name of Target Chain 1 (chain 1 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Submitted Name of Target Chain 1" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain1_trembl_entry", + "name": "chain1_trembl_entry", + "description": "UniProt (TrEMBL) Entry Name of Target Chain 1 (chain 1 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Entry Name of Target Chain 1" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": 
"bindingdb/binding_data/chain1_trembl_primary_id", + "name": "chain1_trembl_primary_id", + "description": "UniProt (TrEMBL) Primary ID of Target Chain 1 (chain 1 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Primary ID of Target Chain 1" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain1_trembl_sec_ids", + "name": "chain1_trembl_sec_ids", + "description": "UniProt (TrEMBL) Secondary ID(s) of Target Chain 1 (chain 1 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Secondary ID(s) of Target Chain 1" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain1_trembl_alt_ids", + "name": "chain1_trembl_alt_ids", + "description": "UniProt (TrEMBL) Alternative ID(s) of Target Chain 1 (chain 1 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Alternative ID(s) of Target Chain 1" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain2_sequence", + "name": "chain2_sequence", + "description": "BindingDB Target Chain Sequence 2 (chain 2 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "BindingDB Target Chain Sequence 2" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain2_pdb_ids", + "name": "chain2_pdb_ids", + "description": "PDB ID(s) of Target Chain 2 (chain 2 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "PDB ID(s) of Target Chain 2" + } + } + }, + { + "@type": 
"http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain2_swissprot_name", + "name": "chain2_swissprot_name", + "description": "UniProt (SwissProt) Recommended Name of Target Chain 2 (chain 2 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Recommended Name of Target Chain 2" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain2_swissprot_entry", + "name": "chain2_swissprot_entry", + "description": "UniProt (SwissProt) Entry Name of Target Chain 2 (chain 2 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Entry Name of Target Chain 2" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain2_swissprot_primary_id", + "name": "chain2_swissprot_primary_id", + "description": "UniProt (SwissProt) Primary ID of Target Chain 2 (chain 2 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Primary ID of Target Chain 2" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain2_swissprot_sec_ids", + "name": "chain2_swissprot_sec_ids", + "description": "UniProt (SwissProt) Secondary ID(s) of Target Chain 2 (chain 2 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Secondary ID(s) of Target Chain 2" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain2_swissprot_alt_ids", + "name": "chain2_swissprot_alt_ids", + "description": "UniProt (SwissProt) Alternative ID(s) of Target Chain 2 (chain 2 of up to 6).", + "dataType": 
"sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Alternative ID(s) of Target Chain 2" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain2_trembl_submitted", + "name": "chain2_trembl_submitted", + "description": "UniProt (TrEMBL) Submitted Name of Target Chain 2 (chain 2 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Submitted Name of Target Chain 2" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain2_trembl_entry", + "name": "chain2_trembl_entry", + "description": "UniProt (TrEMBL) Entry Name of Target Chain 2 (chain 2 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Entry Name of Target Chain 2" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain2_trembl_primary_id", + "name": "chain2_trembl_primary_id", + "description": "UniProt (TrEMBL) Primary ID of Target Chain 2 (chain 2 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Primary ID of Target Chain 2" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain2_trembl_sec_ids", + "name": "chain2_trembl_sec_ids", + "description": "UniProt (TrEMBL) Secondary ID(s) of Target Chain 2 (chain 2 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Secondary ID(s) of Target Chain 2" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain2_trembl_alt_ids", + 
"name": "chain2_trembl_alt_ids", + "description": "UniProt (TrEMBL) Alternative ID(s) of Target Chain 2 (chain 2 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Alternative ID(s) of Target Chain 2" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain3_sequence", + "name": "chain3_sequence", + "description": "BindingDB Target Chain Sequence 3 (chain 3 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "BindingDB Target Chain Sequence 3" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain3_pdb_ids", + "name": "chain3_pdb_ids", + "description": "PDB ID(s) of Target Chain 3 (chain 3 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "PDB ID(s) of Target Chain 3" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain3_swissprot_name", + "name": "chain3_swissprot_name", + "description": "UniProt (SwissProt) Recommended Name of Target Chain 3 (chain 3 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Recommended Name of Target Chain 3" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain3_swissprot_entry", + "name": "chain3_swissprot_entry", + "description": "UniProt (SwissProt) Entry Name of Target Chain 3 (chain 3 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Entry Name of Target Chain 3" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": 
"bindingdb/binding_data/chain3_swissprot_primary_id", + "name": "chain3_swissprot_primary_id", + "description": "UniProt (SwissProt) Primary ID of Target Chain 3 (chain 3 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Primary ID of Target Chain 3" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain3_swissprot_sec_ids", + "name": "chain3_swissprot_sec_ids", + "description": "UniProt (SwissProt) Secondary ID(s) of Target Chain 3 (chain 3 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Secondary ID(s) of Target Chain 3" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain3_swissprot_alt_ids", + "name": "chain3_swissprot_alt_ids", + "description": "UniProt (SwissProt) Alternative ID(s) of Target Chain 3 (chain 3 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Alternative ID(s) of Target Chain 3" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain3_trembl_submitted", + "name": "chain3_trembl_submitted", + "description": "UniProt (TrEMBL) Submitted Name of Target Chain 3 (chain 3 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Submitted Name of Target Chain 3" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain3_trembl_entry", + "name": "chain3_trembl_entry", + "description": "UniProt (TrEMBL) Entry Name of Target Chain 3 (chain 3 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": 
"bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Entry Name of Target Chain 3" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain3_trembl_primary_id", + "name": "chain3_trembl_primary_id", + "description": "UniProt (TrEMBL) Primary ID of Target Chain 3 (chain 3 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Primary ID of Target Chain 3" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain3_trembl_sec_ids", + "name": "chain3_trembl_sec_ids", + "description": "UniProt (TrEMBL) Secondary ID(s) of Target Chain 3 (chain 3 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Secondary ID(s) of Target Chain 3" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain3_trembl_alt_ids", + "name": "chain3_trembl_alt_ids", + "description": "UniProt (TrEMBL) Alternative ID(s) of Target Chain 3 (chain 3 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Alternative ID(s) of Target Chain 3" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain4_sequence", + "name": "chain4_sequence", + "description": "BindingDB Target Chain Sequence 4 (chain 4 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "BindingDB Target Chain Sequence 4" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain4_pdb_ids", + "name": "chain4_pdb_ids", + "description": "PDB ID(s) of Target Chain 4 (chain 4 of up to 6).", + 
"dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "PDB ID(s) of Target Chain 4" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain4_swissprot_name", + "name": "chain4_swissprot_name", + "description": "UniProt (SwissProt) Recommended Name of Target Chain 4 (chain 4 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Recommended Name of Target Chain 4" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain4_swissprot_entry", + "name": "chain4_swissprot_entry", + "description": "UniProt (SwissProt) Entry Name of Target Chain 4 (chain 4 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Entry Name of Target Chain 4" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain4_swissprot_primary_id", + "name": "chain4_swissprot_primary_id", + "description": "UniProt (SwissProt) Primary ID of Target Chain 4 (chain 4 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Primary ID of Target Chain 4" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain4_swissprot_sec_ids", + "name": "chain4_swissprot_sec_ids", + "description": "UniProt (SwissProt) Secondary ID(s) of Target Chain 4 (chain 4 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Secondary ID(s) of Target Chain 4" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": 
"bindingdb/binding_data/chain4_swissprot_alt_ids", + "name": "chain4_swissprot_alt_ids", + "description": "UniProt (SwissProt) Alternative ID(s) of Target Chain 4 (chain 4 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Alternative ID(s) of Target Chain 4" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain4_trembl_submitted", + "name": "chain4_trembl_submitted", + "description": "UniProt (TrEMBL) Submitted Name of Target Chain 4 (chain 4 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Submitted Name of Target Chain 4" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain4_trembl_entry", + "name": "chain4_trembl_entry", + "description": "UniProt (TrEMBL) Entry Name of Target Chain 4 (chain 4 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Entry Name of Target Chain 4" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain4_trembl_primary_id", + "name": "chain4_trembl_primary_id", + "description": "UniProt (TrEMBL) Primary ID of Target Chain 4 (chain 4 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Primary ID of Target Chain 4" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain4_trembl_sec_ids", + "name": "chain4_trembl_sec_ids", + "description": "UniProt (TrEMBL) Secondary ID(s) of Target Chain 4 (chain 4 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + 
"extract": { + "column": "UniProt (TrEMBL) Secondary ID(s) of Target Chain 4" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain4_trembl_alt_ids", + "name": "chain4_trembl_alt_ids", + "description": "UniProt (TrEMBL) Alternative ID(s) of Target Chain 4 (chain 4 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Alternative ID(s) of Target Chain 4" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain5_sequence", + "name": "chain5_sequence", + "description": "BindingDB Target Chain Sequence 5 (chain 5 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "BindingDB Target Chain Sequence 5" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain5_pdb_ids", + "name": "chain5_pdb_ids", + "description": "PDB ID(s) of Target Chain 5 (chain 5 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "PDB ID(s) of Target Chain 5" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain5_swissprot_name", + "name": "chain5_swissprot_name", + "description": "UniProt (SwissProt) Recommended Name of Target Chain 5 (chain 5 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Recommended Name of Target Chain 5" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain5_swissprot_entry", + "name": "chain5_swissprot_entry", + "description": "UniProt (SwissProt) Entry Name of Target Chain 5 (chain 5 of up to 6).", + "dataType": "sc:Text", + "source": { + 
"fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Entry Name of Target Chain 5" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain5_swissprot_primary_id", + "name": "chain5_swissprot_primary_id", + "description": "UniProt (SwissProt) Primary ID of Target Chain 5 (chain 5 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Primary ID of Target Chain 5" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain5_swissprot_sec_ids", + "name": "chain5_swissprot_sec_ids", + "description": "UniProt (SwissProt) Secondary ID(s) of Target Chain 5 (chain 5 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Secondary ID(s) of Target Chain 5" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain5_swissprot_alt_ids", + "name": "chain5_swissprot_alt_ids", + "description": "UniProt (SwissProt) Alternative ID(s) of Target Chain 5 (chain 5 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Alternative ID(s) of Target Chain 5" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain5_trembl_submitted", + "name": "chain5_trembl_submitted", + "description": "UniProt (TrEMBL) Submitted Name of Target Chain 5 (chain 5 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Submitted Name of Target Chain 5" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": 
"bindingdb/binding_data/chain5_trembl_entry", + "name": "chain5_trembl_entry", + "description": "UniProt (TrEMBL) Entry Name of Target Chain 5 (chain 5 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Entry Name of Target Chain 5" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain5_trembl_primary_id", + "name": "chain5_trembl_primary_id", + "description": "UniProt (TrEMBL) Primary ID of Target Chain 5 (chain 5 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Primary ID of Target Chain 5" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain5_trembl_sec_ids", + "name": "chain5_trembl_sec_ids", + "description": "UniProt (TrEMBL) Secondary ID(s) of Target Chain 5 (chain 5 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Secondary ID(s) of Target Chain 5" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain5_trembl_alt_ids", + "name": "chain5_trembl_alt_ids", + "description": "UniProt (TrEMBL) Alternative ID(s) of Target Chain 5 (chain 5 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Alternative ID(s) of Target Chain 5" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain6_sequence", + "name": "chain6_sequence", + "description": "BindingDB Target Chain Sequence 6 (chain 6 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "BindingDB Target Chain 
Sequence 6" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain6_pdb_ids", + "name": "chain6_pdb_ids", + "description": "PDB ID(s) of Target Chain 6 (chain 6 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "PDB ID(s) of Target Chain 6" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain6_swissprot_name", + "name": "chain6_swissprot_name", + "description": "UniProt (SwissProt) Recommended Name of Target Chain 6 (chain 6 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Recommended Name of Target Chain 6" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain6_swissprot_entry", + "name": "chain6_swissprot_entry", + "description": "UniProt (SwissProt) Entry Name of Target Chain 6 (chain 6 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Entry Name of Target Chain 6" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain6_swissprot_primary_id", + "name": "chain6_swissprot_primary_id", + "description": "UniProt (SwissProt) Primary ID of Target Chain 6 (chain 6 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Primary ID of Target Chain 6" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain6_swissprot_sec_ids", + "name": "chain6_swissprot_sec_ids", + "description": "UniProt (SwissProt) Secondary ID(s) of Target Chain 6 (chain 6 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + 
"@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Secondary ID(s) of Target Chain 6" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain6_swissprot_alt_ids", + "name": "chain6_swissprot_alt_ids", + "description": "UniProt (SwissProt) Alternative ID(s) of Target Chain 6 (chain 6 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Alternative ID(s) of Target Chain 6" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain6_trembl_submitted", + "name": "chain6_trembl_submitted", + "description": "UniProt (TrEMBL) Submitted Name of Target Chain 6 (chain 6 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Submitted Name of Target Chain 6" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain6_trembl_entry", + "name": "chain6_trembl_entry", + "description": "UniProt (TrEMBL) Entry Name of Target Chain 6 (chain 6 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Entry Name of Target Chain 6" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain6_trembl_primary_id", + "name": "chain6_trembl_primary_id", + "description": "UniProt (TrEMBL) Primary ID of Target Chain 6 (chain 6 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Primary ID of Target Chain 6" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain6_trembl_sec_ids", + "name": "chain6_trembl_sec_ids", 
+ "description": "UniProt (TrEMBL) Secondary ID(s) of Target Chain 6 (chain 6 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Secondary ID(s) of Target Chain 6" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain6_trembl_alt_ids", + "name": "chain6_trembl_alt_ids", + "description": "UniProt (TrEMBL) Alternative ID(s) of Target Chain 6 (chain 6 of up to 6).", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (TrEMBL) Alternative ID(s) of Target Chain 6" + } + } + } + ] + }, + { + "@type": "http://mlcommons.org/croissant/RecordSet", + "@id": "bindingdb/uniprot_mapping", + "name": "uniprot_mapping", + "description": "Mapping of BindingDB polymer (protein chain) IDs to UniProt accessions and names.", + "field": [ + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/uniprot_mapping/polymerid", + "name": "polymerid", + "description": "BindingDB internal polymer (protein chain) identifier.", + "dataType": "sc:Integer", + "source": { + "fileObject": { + "@id": "bindingdb/uniprot_txt" + }, + "extract": { + "column": "polymerid" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/uniprot_mapping/uniprot_id", + "name": "uniprot_id", + "description": "UniProt accession identifier.", + "dataType": "sc:Text", + "source": { + "fileObject": { + "@id": "bindingdb/uniprot_txt" + }, + "extract": { + "column": "UniProt" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/uniprot_mapping/bindingdb_name", + "name": "bindingdb_name", + "description": "BindingDB name for the protein chain.", + "dataType": "sc:Text", + "source": { + "fileObject": { + "@id": "bindingdb/uniprot_txt" + }, + "extract": { + "column": "BindingDB Name" + } + } + } + ] + } + ] +} \ 
No newline at end of file diff --git a/parser_specs/BINDING-DB/parser.yaml b/parser_specs/BINDING-DB/parser.yaml new file mode 100644 index 00000000..458e2398 --- /dev/null +++ b/parser_specs/BINDING-DB/parser.yaml @@ -0,0 +1,159 @@ +source_id: BINDING-DB-Croissant +provenance_id: infores:bindingdb +parsing_version: "3.0" + +from: + croissant: bindingdb_croissant.json + dataset_id: bindingdb + version_from: dataset.version + distribution: bindingdb/all_tsv_fileset + record_set: bindingdb/binding_data + format: tsv + delimiter: "\t" + archive_member: BindingDB_All.tsv + test_mode_limit: 10000 + +fields: + ligand_id: + column: pubchem_cid + kind: identifier + prefix: PUBCHEM.COMPOUND + protein_id: + column: chain1_swissprot_primary_id + kind: identifier + prefix: UniProtKB + publication: + column: pmid + kind: optional_identifier + prefix: "PMID:" + pubchem_assay_id: + column: pubchem_aid + kind: optional_identifier + prefix: "PUBCHEM.AID:" + patent_id: + column: patent_number + kind: optional_identifier + prefix: "PATENT:" + measurements: + kind: value_columns + unit: nM + columns: + ki_nm: + parameter: pKi + predicate: "{DGIDB}:inhibitor" + ic50_nm: + parameter: pIC50 + predicate: CTD:decreases_activity_of + kd_nm: + parameter: pKd + predicate: RO:0002436 + ec50_nm: + parameter: pEC50 + predicate: CTD:increases_activity_of + +where: + - exists: ligand_id + - exists: protein_id + +views: + measurement_rows: + from: source + unpivot: + field: measurements + as: measurement + select: + ligand_id: $ligand_id + protein_id: $protein_id + parameter: $measurement.parameter + predicate: $measurement.predicate + affinity_nm: + parse_qualified_float: + value: $measurement.value + reject_operators: + - ">" + strip_operators: + - "<" + minimum_exclusive: 0 + publication: $publication + pubchem_assay_id: $pubchem_assay_id + patent_id: $patent_id + + aggregated_measurements: + from: measurement_rows + group_by: + ligand_id: $ligand_id + protein_id: $protein_id + parameter: 
$parameter + predicate: $predicate + aggregates: + supporting_affinities: + when: $affinity_nm + list: + neglog10_nm: + value: $affinity_nm + precision: 2 + publications: + when: $affinity_nm + unique: $publication + pubchem_assay_ids: + when: $affinity_nm + unique: $pubchem_assay_id + patent_ids: + when: $affinity_nm + unique: $patent_id + average_affinity_nm: + when: $affinity_nm + mean: $affinity_nm + let: + affinity: + neglog10_nm: + value: $average_affinity_nm + precision: 2 + having: + - exists: average_affinity_nm + +graph: + nodes: + - from: aggregated_measurements + id: $ligand_id + - from: aggregated_measurements + id: $protein_id + + edges: + - from: aggregated_measurements + subject: $ligand_id + predicate: $predicate + object: $protein_id + props: + ligand: $ligand_id + protein: $protein_id + affinity_parameter: $parameter + supporting_affinities: $supporting_affinities + publications: $publications + pubchem_assay_ids: $pubchem_assay_ids + patent_ids: $patent_ids + knowledge_level: knowledge_assertion + agent_type: manual_agent + affinity: $affinity + +output: + node_order: edge_encounter + edge_property_order: + - ligand + - protein + - affinity_parameter + - supporting_affinities + - publications + - pubchem_assay_ids + - patent_ids + - knowledge_level + - agent_type + - affinity + metadata: + drop: + - num_source_lines + - unusable_source_lines + set: + record_counter: source_edges + skipped_record_counter: 0 + errors: [] diff --git a/parser_specs/HGNC/hgnc_croissant.json b/parser_specs/HGNC/hgnc_croissant.json new file mode 100644 index 00000000..ff1c5ea9 --- /dev/null +++ b/parser_specs/HGNC/hgnc_croissant.json @@ -0,0 +1,741 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "sc": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "dct": "http://purl.org/dc/terms/", + "annotation": "cr:annotation", + "arrayShape": "cr:arrayShape", + "citeAs": 
"cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "containedIn": "cr:containedIn", + "data": {"@id": "cr:data", "@type": "@json"}, + "dataType": {"@id": "cr:dataType", "@type": "@vocab"}, + "equivalentProperty": "cr:equivalentProperty", + "examples": {"@id": "cr:examples", "@type": "@json"}, + "excludes": "cr:excludes", + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isArray": "cr:isArray", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "readLines": "cr:readLines", + "sdVersion": "cr:sdVersion", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "unArchive": "cr:unArchive", + "value": "cr:value" + }, + "@type": "https://schema.org/Dataset", + "@id": "hgnc", + "conformsTo": "http://mlcommons.org/croissant/1.1", + "name": "HGNC", + "description": "The HGNC is a resource for approved human gene nomenclature containing ~42000 gene symbols and names and 1300+ gene families and sets", + "url": "https://www.genenames.org", + "identifier": "RRID:SCR_002827", + "version": "2026-03-06", + "dateModified": "2026-03-06", + "license": "https://creativecommons.org/publicdomain/zero/1.0/", + "conditionsOfAccess": "No restrictions are imposed on access to, or use of, the data provided by the HGNC, which are provided to enhance knowledge and encourage progress in the scientific community. 
The HGNC provides these data in good faith, but make no warranty, express or implied, nor assume any legal liability or responsibility for any purpose for which they are used.", + "isAccessibleForFree": true, + "inLanguage": "en", + "keywords": [ + "gene nomenclature", + "human genes", + "gene symbols", + "HGNC", + "bioinformatics", + "genomics" + ], + "citeAs": "Seal RL, Braschi B, Gray K, McClay J, Tweedie S, Bruford EA. Genenames.org: the HGNC and PGNC resources in 2026. Nucleic Acids Res. 2025 Nov 25:gkaf1229. DOI: 10.1093/nar/gkaf1229. PMID: 41287213", + "citation": "Seal RL, Braschi B, Gray K, McClay J, Tweedie S, Bruford EA. Genenames.org: the HGNC and PGNC resources in 2026. Nucleic Acids Res. 2025 Nov 25:gkaf1229. DOI: 10.1093/nar/gkaf1229. PMID: 41287213", + "creator": { + "@type": "https://schema.org/Organization", + "name": "HUGO Gene Nomenclature Committee (HGNC)", + "url": "https://www.genenames.org", + "parentOrganization": { + "@type": "https://schema.org/Organization", + "name": "University of Cambridge" + } + }, + "funder": { + "@type": "https://schema.org/Organization", + "name": "US National Human Genome Research Institute (NHGRI)" + }, + "relatedLink": "hgnc_apis.json", + "distribution": [ + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "hgnc/hgnc_complete_set_tsv", + "name": "hgnc_complete_set_tsv", + "description": "Complete HGNC approved dataset in tab-separated values (TSV) format. Multiple-valued fields are double-quoted and pipe-delimited within the quotes.", + "contentUrl": "https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt", + "encodingFormat": "text/tab-separated-values", + "md5": "16ae644b4598083157299c44a6eb418f" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "hgnc/hgnc_complete_set_json", + "name": "hgnc_complete_set_json", + "description": "Complete HGNC approved dataset in JSON format (no indentation or whitespace). 
Intended for loading into a JSON parser within a script or program.", + "contentUrl": "https://storage.googleapis.com/public-download-files/hgnc/json/json/hgnc_complete_set.json", + "encodingFormat": "application/json", + "md5": "39d7981d6408775e85ce9d2a552c4ef4" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "hgnc/withdrawn_tsv", + "name": "withdrawn_tsv", + "description": "Withdrawn HGNC symbol reports in tab-separated values (TSV) format.", + "contentUrl": "https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/withdrawn.txt", + "encodingFormat": "text/tab-separated-values", + "md5": "9aca8b1084453db66ad44f36e4ab9dcf" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "hgnc/withdrawn_json", + "name": "withdrawn_json", + "description": "Withdrawn HGNC symbol reports in JSON format.", + "contentUrl": "https://storage.googleapis.com/public-download-files/hgnc/json/json/withdrawn.json", + "encodingFormat": "application/json", + "md5": "9517b7ff47ec62ca04a7513b0269fcb1" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "hgnc/hgnc_owl", + "name": "hgnc_owl", + "description": "HGNC OWL file created by SciBite. Contains all genes in HGNC organised in a shallow hierarchy, classified by locus type and gene group. 
Includes approved gene symbol, approved gene name, previous names and symbols, and mappings to external databases.", + "contentUrl": "https://storage.googleapis.com/public-download-files/hgnc/owl/owl/hgnc.owl", + "encodingFormat": "application/rdf+xml", + "md5": "8db14e98e8121034754ee8ae6f696b66" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "hgnc/gene_groups_tsv", + "name": "gene_groups_tsv", + "description": "Complete HGNC gene groups dataset in tab-separated values (TSV) format, containing all genes assigned to gene groups with their group associations.", + "contentUrl": "https://genenames.org/cgi-bin/genegroup/download-all", + "encodingFormat": "text/tab-separated-values", + "md5": "7b351f57ba56565f12c23e01745e452f" + }, + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "hgnc/gene_groups_json", + "name": "gene_groups_json", + "description": "Complete HGNC gene groups dataset in JSON format.", + "contentUrl": "https://genenames.org/cgi-bin/genegroup/download-all?format=json", + "encodingFormat": "application/json", + "md5": "503aa01299efcf8522feff3eb8440eac" + } + ], + "recordSet": [ + { + "@type": "http://mlcommons.org/croissant/RecordSet", + "@id": "hgnc/hgnc_complete_set", + "name": "hgnc_complete_set", + "description": "Complete HGNC approved gene symbol dataset. Each record represents one approved human gene locus with its symbol, name, locus information, cross-references to external databases, and nomenclature history.", + "field": [ + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/hgnc_id", + "name": "hgnc_id", + "description": "HGNC ID. 
A unique ID created by the HGNC for every approved symbol.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "hgnc_id"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/symbol", + "name": "symbol", + "description": "The HGNC approved gene symbol. Equates to the Approved symbol field within the gene symbol report.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "symbol"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/name", + "name": "name", + "description": "HGNC approved name for the gene. Equates to the Approved name field within the gene symbol report.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "name"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/locus_group", + "name": "locus_group", + "description": "A group name for a set of related locus types as defined by the HGNC (e.g. 
non-coding RNA).", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "locus_group"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/locus_type", + "name": "locus_type", + "description": "The locus type as set by the HGNC.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "locus_type"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/status", + "name": "status", + "description": "Status of the symbol report, which can be either Approved or Entry Withdrawn.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "status"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/location", + "name": "location", + "description": "Cytogenetic location of the gene (e.g. 2q34).", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "location"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/location_sortable", + "name": "location_sortable", + "description": "Same as location but single digit chromosomes are prefixed with a 0 enabling them to be sorted in correct numerical order (e.g. 02q34).", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "location_sortable"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/alias_symbol", + "name": "alias_symbol", + "description": "Other symbols used to refer to this gene as seen in the Alias symbols field in the gene symbol report. 
Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "alias_symbol"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/alias_name", + "name": "alias_name", + "description": "Other names used to refer to this gene as seen in the Alias names field in the gene symbol report. Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "alias_name"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/prev_symbol", + "name": "prev_symbol", + "description": "Gene symbols previously approved by the HGNC for this gene. Equates to the Previous symbols field within the gene symbol report. Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "prev_symbol"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/prev_name", + "name": "prev_name", + "description": "Gene names previously approved by the HGNC for this gene. Equates to the Previous names field within the gene symbol report. Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "prev_name"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/gene_group", + "name": "gene_group", + "description": "The gene group name as set by the HGNC and seen at the top of the gene group reports. 
Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "gene_group"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/gene_group_id", + "name": "gene_group_id", + "description": "ID used to designate a gene group the gene has been assigned to. Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "gene_group_id"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/date_approved_reserved", + "name": "date_approved_reserved", + "description": "The date the entry was first approved.", + "dataType": "sc:Date", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "date_approved_reserved"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/date_symbol_changed", + "name": "date_symbol_changed", + "description": "The date the approved symbol was last changed.", + "dataType": "sc:Date", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "date_symbol_changed"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/date_name_changed", + "name": "date_name_changed", + "description": "The date the approved name was last changed.", + "dataType": "sc:Date", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "date_name_changed"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/date_modified", + "name": "date_modified", + "description": "Date the entry was last modified.", + "dataType": "sc:Date", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "date_modified"}} + }, + { + "@type": 
"http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/entrez_id", + "name": "entrez_id", + "description": "NCBI gene ID. Found within the Gene resources section of the gene symbol report.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "entrez_id"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/ensembl_gene_id", + "name": "ensembl_gene_id", + "description": "Ensembl gene ID. Found within the Gene resources section of the gene symbol report.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "ensembl_gene_id"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/vega_id", + "name": "vega_id", + "description": "Vega gene ID. Found within the Gene resources section of the gene symbol report.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "vega_id"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/ucsc_id", + "name": "ucsc_id", + "description": "UCSC gene ID. Found within the Gene resources section of the gene symbol report.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "ucsc_id"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/ena", + "name": "ena", + "description": "International Nucleotide Sequence Database Collaboration (GenBank, ENA and DDBJ) accession number(s). Found within the Nucleotide resources section of the gene symbol report. 
Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "ena"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/refseq_accession", + "name": "refseq_accession", + "description": "RefSeq nucleotide accession(s). Found within the Nucleotide resources section of the gene symbol report. Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "refseq_accession"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/ccds_id", + "name": "ccds_id", + "description": "Consensus CDS ID. Found within the Nucleotide resources section of the gene symbol report. Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "ccds_id"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/uniprot_ids", + "name": "uniprot_ids", + "description": "UniProt protein accession. Found within the Protein resource section of the gene symbol report. Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "uniprot_ids"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/pubmed_id", + "name": "pubmed_id", + "description": "Pubmed and Europe Pubmed Central PMID(s). 
Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "pubmed_id"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/mgd_id", + "name": "mgd_id", + "description": "Mouse genome informatics database ID. Found within the Homologs section of the gene symbol report. Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "mgd_id"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/rgd_id", + "name": "rgd_id", + "description": "Rat genome database gene ID. Found within the Homologs section of the gene symbol report. Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "rgd_id"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/lsdb", + "name": "lsdb", + "description": "The name of the Locus Specific Mutation Database and URL for the gene separated by a | character, e.g. Mutations of the ATP-binding Cassette Transporter Retina|http://www.retina-international.org/files/sci-news/abcrmut.htm. Multiple entries pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "lsdb"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/cosmic", + "name": "cosmic", + "description": "Symbol used within the Catalogue of somatic mutations in cancer for the gene. 
(No longer updated!).", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "cosmic"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/omim_id", + "name": "omim_id", + "description": "Online Mendelian Inheritance in Man (OMIM) ID. Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "omim_id"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/mirbase", + "name": "mirbase", + "description": "miRBase ID.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "mirbase"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/homeodb", + "name": "homeodb", + "description": "Homeobox Database ID.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "homeodb"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/snornabase", + "name": "snornabase", + "description": "snoRNABase ID.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "snornabase"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/bioparadigms_slc", + "name": "bioparadigms_slc", + "description": "Symbol used to link to the SLC tables database at bioparadigms.org for the gene.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "bioparadigms_slc"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/orphanet", + "name": "orphanet", + "description": "Orphanet ID.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": 
"hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "orphanet"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/pseudogene_org", + "name": "pseudogene_org", + "description": "Pseudogene.org ID.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "pseudogene.org"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/horde_id", + "name": "horde_id", + "description": "Symbol used within HORDE for the gene.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "horde_id"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/merops", + "name": "merops", + "description": "ID used to link to the MEROPS peptidase database.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "merops"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/imgt", + "name": "imgt", + "description": "Symbol used within international ImMunoGeneTics information system. 
Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "imgt"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/iuphar", + "name": "iuphar", + "description": "The objectId used to link to the IUPHAR/BPS Guide to PHARMACOLOGY database.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "iuphar"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/kznf_gene_catalog", + "name": "kznf_gene_catalog", + "description": "Column present in the TSV file header; no description found on the data dictionary help page.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "kznf_gene_catalog"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/mamit_trnadb", + "name": "mamit_trnadb", + "description": "ID to link to the Mamit-tRNA database.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "mamit-trnadb"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/cd", + "name": "cd", + "description": "Symbol used within the Human Cell Differentiation Molecule database for the gene.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "cd"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/lncrnadb", + "name": "lncrnadb", + "description": "lncRNA Database ID - Resource is now defunct.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "lncrnadb"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/enzyme_id", + 
"name": "enzyme_id", + "description": "ENZYME EC accession number. Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "enzyme_id"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/intermediate_filament_db", + "name": "intermediate_filament_db", + "description": "ID used to link to the Human Intermediate Filament Database.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "intermediate_filament_db"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/rna_central_id", + "name": "rna_central_id", + "description": "RNAcentral ID. Multiple values pipe-delimited within double quotes.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "rna_central_id"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/lncipedia", + "name": "lncipedia", + "description": "The gene symbol used for a gene report within LNCipedia - A comprehensive compendium of human long non-coding RNAs.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "lncipedia"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/gtrnadb", + "name": "gtrnadb", + "description": "GtRNAdb (Genomic tRNA Database) ID.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "gtrnadb"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/agr", + "name": "agr", + "description": "The HGNC ID that the Alliance of Genome Resources (AGR) have linked to their record of the gene.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": 
"hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "agr"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/mane_select", + "name": "mane_select", + "description": "NCBI and Ensembl transcript IDs/accessions including the version number for one high-quality representative transcript per protein-coding gene that is well-supported by experimental data and represents the biology of the gene. The IDs are delimited by |.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "mane_select"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/gencc", + "name": "gencc", + "description": "The HGNC ID used within the GenCC database as the unique identifier of their gene reports.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/hgnc_complete_set_tsv"}, "extract": {"column": "gencc"}} + } + ] + }, + { + "@type": "http://mlcommons.org/croissant/RecordSet", + "@id": "hgnc/withdrawn", + "name": "withdrawn", + "description": "Withdrawn HGNC symbol reports. Each record represents a symbol that has been withdrawn, with its withdrawal status and any symbol it was merged into.", + "field": [ + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/withdrawn/HGNC_ID", + "name": "HGNC_ID", + "description": "HGNC ID. 
A unique ID created by the HGNC for every approved symbol.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/withdrawn_tsv"}, "extract": {"column": "HGNC_ID"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/withdrawn/STATUS", + "name": "STATUS", + "description": "Status of the withdrawn symbol report.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/withdrawn_tsv"}, "extract": {"column": "STATUS"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/withdrawn/WITHDRAWN_SYMBOL", + "name": "WITHDRAWN_SYMBOL", + "description": "The gene symbol that was withdrawn.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/withdrawn_tsv"}, "extract": {"column": "WITHDRAWN_SYMBOL"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/withdrawn/merged_into_reports", + "name": "merged_into_reports", + "description": "The HGNC ID, symbol, and status of the report(s) this withdrawn entry was merged into, formatted as HGNC_ID|SYMBOL|STATUS. Multiple entries pipe-delimited.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/withdrawn_tsv"}, "extract": {"column": "MERGED_INTO_REPORT(S) (i.e HGNC_ID|SYMBOL|STATUS)"}} + } + ] + }, + { + "@type": "http://mlcommons.org/croissant/RecordSet", + "@id": "hgnc/gene_groups", + "name": "gene_groups", + "description": "HGNC gene groups dataset. Each record represents one gene assigned to a gene group, with its nomenclature and group association details.", + "field": [ + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/gene_groups/HGNC_ID", + "name": "HGNC_ID", + "description": "HGNC ID. 
A unique ID created by the HGNC for every approved symbol.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/gene_groups_tsv"}, "extract": {"column": "HGNC ID"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/gene_groups/Approved_symbol", + "name": "Approved_symbol", + "description": "The HGNC approved gene symbol.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/gene_groups_tsv"}, "extract": {"column": "Approved symbol"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/gene_groups/Approved_name", + "name": "Approved_name", + "description": "HGNC approved name for the gene.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/gene_groups_tsv"}, "extract": {"column": "Approved name"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/gene_groups/Status", + "name": "Status", + "description": "Status of the symbol report.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/gene_groups_tsv"}, "extract": {"column": "Status"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/gene_groups/Locus_type", + "name": "Locus_type", + "description": "The locus type as set by the HGNC.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/gene_groups_tsv"}, "extract": {"column": "Locus type"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/gene_groups/Previous_symbols", + "name": "Previous_symbols", + "description": "Gene symbols previously approved by the HGNC for this gene.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/gene_groups_tsv"}, "extract": {"column": "Previous symbols"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/gene_groups/Alias_symbols", + "name": "Alias_symbols", + "description": "Other symbols used to refer to this gene.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": 
"hgnc/gene_groups_tsv"}, "extract": {"column": "Alias symbols"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/gene_groups/Chromosome", + "name": "Chromosome", + "description": "Chromosome on which the gene is located.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/gene_groups_tsv"}, "extract": {"column": "Chromosome"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/gene_groups/NCBI_Gene_ID", + "name": "NCBI_Gene_ID", + "description": "NCBI gene ID.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/gene_groups_tsv"}, "extract": {"column": "NCBI Gene ID"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/gene_groups/Ensembl_gene_ID", + "name": "Ensembl_gene_ID", + "description": "Ensembl gene ID.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/gene_groups_tsv"}, "extract": {"column": "Ensembl gene ID"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/gene_groups/Vega_gene_ID", + "name": "Vega_gene_ID", + "description": "Vega gene ID.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/gene_groups_tsv"}, "extract": {"column": "Vega gene ID"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/gene_groups/Group_ID", + "name": "Group_ID", + "description": "ID used to designate the gene group.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/gene_groups_tsv"}, "extract": {"column": "Group ID"}} + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/gene_groups/Group_name", + "name": "Group_name", + "description": "The gene group name as set by the HGNC.", + "dataType": "sc:Text", + "source": {"fileObject": {"@id": "hgnc/gene_groups_tsv"}, "extract": {"column": "Group name"}} + } + ] + } + ] +} diff --git a/parser_specs/HGNC/parser.yaml b/parser_specs/HGNC/parser.yaml new file mode 100644 index 00000000..b44eca36 --- /dev/null 
+++ b/parser_specs/HGNC/parser.yaml @@ -0,0 +1,104 @@ +source_id: HGNC +provenance_id: infores:hgnc +parsing_version: "3.0" + +from: + croissant: hgnc_croissant.json + dataset_id: hgnc + version_from: dataset.version + distribution: hgnc/hgnc_complete_set_tsv + record_set: hgnc/hgnc_complete_set + format: tsv + delimiter: "\t" + test_mode_limit: 5000 + +fields: + gene_id: + column: hgnc_id + kind: identifier + gene_name: + column: name + kind: label + trim: false + symbol: + column: symbol + kind: property + locus_group: + column: locus_group + kind: property + location: + column: location + kind: property + preserve_empty: true + families: + kind: zipped_list + separator: "|" + columns: + id: + column: gene_group_id + kind: identifier + prefix: HGNC.FAMILY + name: + column: gene_group + kind: label + trim: false + publications: + column: pubmed_id + kind: list + separator: "|" + prefix: "PMID:" + +where: + - exists: families + +views: + gene_family_memberships: + from: source + unnest: + field: families + as: family + select: + gene_id: $gene_id + family_id: $family.id + family_name: $family.name + publications: $publications + +graph: + nodes: + - from: source + id: $gene_id + name: $gene_name + props: + locus_group: $locus_group + symbol: $symbol + location: + value: $location + preserve_empty: true + - from: gene_family_memberships + id: $family_id + name: $family_name + + edges: + - from: gene_family_memberships + subject: $gene_id + predicate: RO:0002350 + object: $family_id + primary_knowledge_source: infores:hgnc + props: + knowledge_level: knowledge_assertion + agent_type: manual_agent + publications: $publications + +output: + node_order: edge_encounter + node_property_order: + - locus_group + - symbol + - location + edge_property_order: + - knowledge_level + - agent_type + - publications + metadata: + set: + num_source_lines: source_edges diff --git a/parsers/BINDING/src/loadBINDINGDB.py b/parsers/BINDING/src/loadBINDINGDB.py index 8fa85629..f3d39d3c 
100644 --- a/parsers/BINDING/src/loadBINDINGDB.py +++ b/parsers/BINDING/src/loadBINDINGDB.py @@ -1,9 +1,11 @@ +import csv import os import enum import math import json import requests +from io import TextIOWrapper from zipfile import ZipFile from requests.adapters import HTTPAdapter, Retry @@ -32,11 +34,14 @@ class BD_EDGEUMAN(enum.IntEnum): def negative_log(concentration_nm): ### This function converts nanomolar concentrations into log-scale units (pKi/pKd/pIC50/pEC50). ### return -(math.log10(concentration_nm*(10**-9))) -def generate_zipfile_rows(zip_file_path, file_inside_zip, delimiter='\\t'): +def generate_zipfile_rows(zip_file_path, file_inside_zip, delimiter='\t'): with ZipFile(zip_file_path, 'r') as zip_file: - with zip_file.open(file_inside_zip, 'r') as file: - for line in file: - yield str(line).split(delimiter) + with zip_file.open(file_inside_zip, 'r') as raw_file: + text_file = TextIOWrapper(raw_file, encoding='utf-8', newline='') + reader = csv.reader(text_file, delimiter=delimiter) + for row in reader: + if row: + yield row ############## @@ -138,6 +143,8 @@ def parse_data(self) -> dict: break if n%100000 == 0: self.logger.debug(f'processed {n} rows so far...') + if len(row) <= BD_EDGEUMAN.UNIPROT_TARGET_CHAIN.value: + continue ligand = row[BD_EDGEUMAN.PUBCHEM_CID.value] protein = row[BD_EDGEUMAN.UNIPROT_TARGET_CHAIN.value] if (ligand == '') or (protein == ''): # Check if Pubchem or UniProt ID is missing. 
diff --git a/parsers/metadata_driven/src/BINDING-DB-Croissant.source.json b/parsers/metadata_driven/src/BINDING-DB-Croissant.source.json new file mode 100644 index 00000000..8ec1f2bf --- /dev/null +++ b/parsers/metadata_driven/src/BINDING-DB-Croissant.source.json @@ -0,0 +1,15 @@ +{ + "@context": "https://schema.org", + "@type": "Dataset", + "identifier": "infores:bindingdb", + "name": "BindingDB", + "description": "A public, web-accessible database of measured binding affinities, focusing chiefly on the interactions of proteins considered to be candidate drug-targets with ligands that are small, drug-like molecules", + "url": "https://www.bindingdb.org/", + "attribution": "https://www.bindingdb.org/rwd/bind/info.jsp", + "citation": [ + "https://doi.org/10.1093/nar/gkae1075", + "Liu T, Hwang L, Burley SK, Nitsche CI, Southan C, Walters WP, Gilson MK. BindingDB in 2024: a FAIR knowledgebase of protein-small molecule binding data. Nucleic Acids Res. 2025 Jan 6;53(D1):D1633-D1644. doi: 10.1093/nar/gkae1075. PMID: 39574417; PMCID: PMC11701568." 
+ ], + "license": "Creative Commons BY 3.0", + "contentUrl": "https://www.bindingdb.org/rwd/bind/chemsearch/marvin/SDFdownload.jsp?all_download=yes" +} diff --git a/parsers/metadata_driven/src/HGNC.source.json b/parsers/metadata_driven/src/HGNC.source.json new file mode 100644 index 00000000..d9c6527a --- /dev/null +++ b/parsers/metadata_driven/src/HGNC.source.json @@ -0,0 +1,15 @@ +{ + "@context": "https://schema.org", + "@type": "Dataset", + "identifier": "infores:hgnc", + "name": "HUGO Gene Nomenclature Committee (HGNC)", + "description": "The HUGO Gene Nomenclature Committee (HGNC) database provides open access to HGNC-approved unique symbols and names for human genes, gene groups, and associated resources, including links to genomic, proteomic and phenotypic information.", + "url": "https://www.genenames.org/", + "attribution": "https://www.genenames.org/", + "citation": [ + "https://doi.org/10.1093/nar/gkac888", + "Seal RL, Braschi B, Gray K, Jones TEM, Tweedie S, Haim-Vilmovsky L, Bruford EA. Genenames.org: the HGNC resources in 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D1003-D1009. doi: 10.1093/nar/gkac888. PMID: 36243972; PMCID: PMC9825485." 
+ ], + "license": "CC0", + "contentUrl": "https://www.genenames.org/download/archive/" +} diff --git a/parsers/metadata_driven/src/loadMetadataDriven.py b/parsers/metadata_driven/src/loadMetadataDriven.py new file mode 100644 index 00000000..b564f482 --- /dev/null +++ b/parsers/metadata_driven/src/loadMetadataDriven.py @@ -0,0 +1,14 @@ +from pathlib import Path + +from orion.metadata_driven_loader import MetadataDrivenLoader + + +REPO_ROOT = Path(__file__).resolve().parents[3] + + +class HGNCCroissantLoader(MetadataDrivenLoader): + parser_spec_path = str(REPO_ROOT / "parser_specs" / "HGNC" / "parser.yaml") + + +class BINDINGDBCroissantLoader(MetadataDrivenLoader): + parser_spec_path = str(REPO_ROOT / "parser_specs" / "BINDING-DB" / "parser.yaml") diff --git a/tests/resources/metadata_parser/bindingdb/BindingDB_All.tsv b/tests/resources/metadata_parser/bindingdb/BindingDB_All.tsv new file mode 100644 index 00000000..92dbfb0f --- /dev/null +++ b/tests/resources/metadata_parser/bindingdb/BindingDB_All.tsv @@ -0,0 +1,6 @@ +PubChem CID UniProt (SwissProt) Primary ID of Target Chain 1 Ki (nM) IC50 (nM) Kd (nM) EC50 (nM) PMID PubChem AID Patent Number +111 P11111 100 12345 7001 PAT-1 +111 P11111 10 23456 7002 PAT-1 +111 P11111 200 12345 7001 +222 P22222 50 34567 8001 PAT-2 + P99999 25 99999 9999 PAT-X diff --git a/tests/resources/metadata_parser/bindingdb/bindingdb_croissant.json b/tests/resources/metadata_parser/bindingdb/bindingdb_croissant.json new file mode 100644 index 00000000..22e602c4 --- /dev/null +++ b/tests/resources/metadata_parser/bindingdb/bindingdb_croissant.json @@ -0,0 +1,161 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "sc": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/" + }, + "@type": "https://schema.org/Dataset", + "@id": "bindingdb", + "name": "BindingDB", + "version": "202603", + "dateModified": "2026-03-01", + "distribution": [ + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": 
"bindingdb/all_tsv_zip", + "name": "all_tsv_zip", + "contentUrl": "https://example.org/BindingDB_All_202603_tsv.zip", + "encodingFormat": "application/zip", + "version": "202603" + }, + { + "@type": "http://mlcommons.org/croissant/FileSet", + "@id": "bindingdb/all_tsv_fileset", + "name": "all_tsv_fileset", + "containedIn": [ + { + "@id": "bindingdb/all_tsv_zip" + } + ], + "encodingFormat": "text/tab-separated-values", + "includes": "*.tsv" + } + ], + "recordSet": [ + { + "@type": "http://mlcommons.org/croissant/RecordSet", + "@id": "bindingdb/binding_data", + "name": "binding_data", + "field": [ + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/pubchem_cid", + "name": "pubchem_cid", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "PubChem CID" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/chain1_swissprot_primary_id", + "name": "chain1_swissprot_primary_id", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "UniProt (SwissProt) Primary ID of Target Chain 1" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/ki_nm", + "name": "ki_nm", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Ki (nM)" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/ic50_nm", + "name": "ic50_nm", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "IC50 (nM)" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/kd_nm", + "name": "kd_nm", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Kd (nM)" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/ec50_nm", + "name": 
"ec50_nm", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "EC50 (nM)" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/pmid", + "name": "pmid", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "PMID" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/pubchem_aid", + "name": "pubchem_aid", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "PubChem AID" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "bindingdb/binding_data/patent_number", + "name": "patent_number", + "source": { + "fileSet": { + "@id": "bindingdb/all_tsv_fileset" + }, + "extract": { + "column": "Patent Number" + } + } + } + ] + } + ] +} diff --git a/tests/resources/metadata_parser/bindingdb/parser.yaml b/tests/resources/metadata_parser/bindingdb/parser.yaml new file mode 100644 index 00000000..6080373b --- /dev/null +++ b/tests/resources/metadata_parser/bindingdb/parser.yaml @@ -0,0 +1,128 @@ +source_id: BINDING-DB +provenance_id: infores:bindingdb +parsing_version: "3.0" + +from: + croissant: bindingdb_croissant.json + dataset_id: bindingdb + version_from: dataset.version + distribution: bindingdb/all_tsv_fileset + record_set: bindingdb/binding_data + format: tsv + delimiter: "\t" + archive_member: BindingDB_All.tsv + test_mode_limit: 100 + +fields: + ligand_id: + column: pubchem_cid + kind: identifier + prefix: PUBCHEM.COMPOUND + protein_id: + column: chain1_swissprot_primary_id + kind: identifier + prefix: UniProtKB + publication: + column: pmid + kind: optional_identifier + prefix: "PMID:" + pubchem_assay_id: + column: pubchem_aid + kind: optional_identifier + prefix: "PUBCHEM.AID:" + patent_id: + column: patent_number + kind: optional_identifier + prefix: "PATENT:" + measurements: + kind: value_columns + unit: nM + 
columns: + ki_nm: + parameter: pKi + predicate: biolink:inhibits + ic50_nm: + parameter: pIC50 + predicate: CTD:decreases_activity_of + kd_nm: + parameter: pKd + predicate: RO:0002436 + ec50_nm: + parameter: pEC50 + predicate: CTD:increases_activity_of + +where: + - exists: ligand_id + - exists: protein_id + +views: + measurement_rows: + from: source + unpivot: + field: measurements + as: measurement + select: + ligand_id: $ligand_id + protein_id: $protein_id + parameter: $measurement.parameter + predicate: $measurement.predicate + affinity_nm: + parse_qualified_float: + value: $measurement.value + minimum_exclusive: 0 + publication: $publication + pubchem_assay_id: $pubchem_assay_id + patent_id: $patent_id + + valid_measurement_rows: + from: measurement_rows + where: + - exists: affinity_nm + + aggregated_measurements: + from: valid_measurement_rows + group_by: + ligand_id: $ligand_id + protein_id: $protein_id + parameter: $parameter + predicate: $predicate + aggregates: + supporting_affinities: + list: $affinity_nm + publications: + unique: $publication + pubchem_assay_ids: + unique: $pubchem_assay_id + patent_ids: + unique: $patent_id + average_affinity_nm: + mean: $affinity_nm + let: + affinity: + neglog10_nm: + value: $average_affinity_nm + precision: 2 + +graph: + nodes: + - from: aggregated_measurements + id: $ligand_id + category: biolink:SmallMolecule + - from: aggregated_measurements + id: $protein_id + category: biolink:Protein + + edges: + - from: aggregated_measurements + subject: $ligand_id + predicate: $predicate + object: $protein_id + primary_knowledge_source: infores:bindingdb + props: + affinity_parameter: $parameter + affinity: $affinity + average_affinity_nm: $average_affinity_nm + supporting_affinities: $supporting_affinities + publications: $publications + pubchem_assay_ids: $pubchem_assay_ids + patent_ids: $patent_ids diff --git a/tests/resources/metadata_parser/hgnc/hgnc_complete_set.txt 
b/tests/resources/metadata_parser/hgnc/hgnc_complete_set.txt new file mode 100644 index 00000000..571ea2c0 --- /dev/null +++ b/tests/resources/metadata_parser/hgnc/hgnc_complete_set.txt @@ -0,0 +1,4 @@ +hgnc_id symbol name locus_group location gene_group_id gene_group pubmed_id +HGNC:1 A1BG alpha-1-B glycoprotein protein-coding gene 19q13.43 5|7 Signal family|Carrier family 12345|23456 +HGNC:2 A2M alpha-2-macroglobulin protein-coding gene 12p13.31 +HGNC:3 NAT1 N-acetyltransferase 1 protein-coding gene 8p22 9 Transferase family diff --git a/tests/resources/metadata_parser/hgnc/hgnc_croissant.json b/tests/resources/metadata_parser/hgnc/hgnc_croissant.json new file mode 100644 index 00000000..519f2284 --- /dev/null +++ b/tests/resources/metadata_parser/hgnc/hgnc_croissant.json @@ -0,0 +1,135 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "sc": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/" + }, + "@type": "https://schema.org/Dataset", + "@id": "hgnc", + "name": "HGNC", + "version": "2026-03-06", + "dateModified": "2026-03-06", + "distribution": [ + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "hgnc/hgnc_complete_set_tsv", + "name": "hgnc_complete_set_tsv", + "contentUrl": "https://example.org/hgnc_complete_set.txt", + "encodingFormat": "text/tab-separated-values" + } + ], + "recordSet": [ + { + "@type": "http://mlcommons.org/croissant/RecordSet", + "@id": "hgnc/hgnc_complete_set", + "name": "hgnc_complete_set", + "field": [ + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/hgnc_id", + "name": "hgnc_id", + "source": { + "fileObject": { + "@id": "hgnc/hgnc_complete_set_tsv" + }, + "extract": { + "column": "hgnc_id" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/symbol", + "name": "symbol", + "source": { + "fileObject": { + "@id": "hgnc/hgnc_complete_set_tsv" + }, + "extract": { + "column": "symbol" + } + } + 
}, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/name", + "name": "name", + "source": { + "fileObject": { + "@id": "hgnc/hgnc_complete_set_tsv" + }, + "extract": { + "column": "name" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/locus_group", + "name": "locus_group", + "source": { + "fileObject": { + "@id": "hgnc/hgnc_complete_set_tsv" + }, + "extract": { + "column": "locus_group" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/location", + "name": "location", + "source": { + "fileObject": { + "@id": "hgnc/hgnc_complete_set_tsv" + }, + "extract": { + "column": "location" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/gene_group_id", + "name": "gene_group_id", + "source": { + "fileObject": { + "@id": "hgnc/hgnc_complete_set_tsv" + }, + "extract": { + "column": "gene_group_id" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/gene_group", + "name": "gene_group", + "source": { + "fileObject": { + "@id": "hgnc/hgnc_complete_set_tsv" + }, + "extract": { + "column": "gene_group" + } + } + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "hgnc/hgnc_complete_set/pubmed_id", + "name": "pubmed_id", + "source": { + "fileObject": { + "@id": "hgnc/hgnc_complete_set_tsv" + }, + "extract": { + "column": "pubmed_id" + } + } + } + ] + } + ] +} diff --git a/tests/resources/metadata_parser/hgnc/parser.yaml b/tests/resources/metadata_parser/hgnc/parser.yaml new file mode 100644 index 00000000..ff2a9236 --- /dev/null +++ b/tests/resources/metadata_parser/hgnc/parser.yaml @@ -0,0 +1,87 @@ +source_id: HGNC +provenance_id: infores:hgnc +parsing_version: "3.0" + +from: + croissant: hgnc_croissant.json + dataset_id: hgnc + version_from: dataset.version + distribution: hgnc/hgnc_complete_set_tsv + record_set: 
hgnc/hgnc_complete_set + format: tsv + delimiter: "\t" + test_mode_limit: 50 + +fields: + gene_id: + column: hgnc_id + kind: identifier + gene_name: + column: name + kind: label + symbol: + column: symbol + kind: property + locus_group: + column: locus_group + kind: property + location: + column: location + kind: property + families: + kind: zipped_list + separator: "|" + columns: + id: + column: gene_group_id + kind: identifier + prefix: HGNC.FAMILY + name: + column: gene_group + kind: label + publications: + column: pubmed_id + kind: list + separator: "|" + prefix: "PMID:" + +where: + - exists: families + +views: + gene_family_memberships: + from: source + unnest: + field: families + as: family + select: + gene_id: $gene_id + family_id: $family.id + family_name: $family.name + publications: $publications + +graph: + nodes: + - from: source + id: $gene_id + name: $gene_name + category: biolink:Gene + props: + symbol: $symbol + locus_group: $locus_group + location: $location + - from: gene_family_memberships + id: $family_id + name: $family_name + category: biolink:GeneFamily + + edges: + - from: gene_family_memberships + subject: $gene_id + predicate: RO:0002350 + object: $family_id + primary_knowledge_source: infores:hgnc + props: + publications: $publications + knowledge_level: knowledge_assertion + agent_type: manual_agent diff --git a/tests/test_metadata_driven_parser.py b/tests/test_metadata_driven_parser.py new file mode 100644 index 00000000..7273feb5 --- /dev/null +++ b/tests/test_metadata_driven_parser.py @@ -0,0 +1,519 @@ +import json +import shutil +from pathlib import Path +from zipfile import ZipFile + +import yaml + +from orion.croissant_resolver import CroissantResolver +from orion.metadata_driven_loader import MetadataDrivenLoader +from parsers.BINDING.src.loadBINDINGDB import BINDINGDBLoader +from parsers.hgnc.src.loadHGNC import HGNCLoader +from parsers.metadata_driven.src.loadMetadataDriven import BINDINGDBCroissantLoader, HGNCCroissantLoader 
+from orion.parser_spec import load_parser_spec + + +TEST_RESOURCE_DIR = Path(__file__).parent / "resources" / "metadata_parser" / "hgnc" +BINDINGDB_RESOURCE_DIR = Path(__file__).parent / "resources" / "metadata_parser" / "bindingdb" + + +class HGNCTestMetadataLoader(MetadataDrivenLoader): + pass + + +class BindingDBTestMetadataLoader(MetadataDrivenLoader): + pass + + +def _read_jsonl(path: Path) -> list[dict]: + with path.open("r") as handle: + return [json.loads(line) for line in handle] + + +def _sorted_records(path: Path) -> list[dict]: + return sorted(_read_jsonl(path), key=lambda record: json.dumps(record, sort_keys=True)) + + +def _bindingdb_legacy_archive(path: Path) -> None: + header = [f"col{i}" for i in range(46)] + header[8] = "Ki (nM)" + header[9] = "IC50 (nM)" + header[10] = "Kd (nM)" + header[11] = "EC50 (nM)" + header[19] = "PMID" + header[20] = "PubChem AID" + header[21] = "Patent Number" + header[31] = "PubChem CID" + header[44] = "UniProt (SwissProt) Primary ID of Target Chain 1" + + def row(pubchem_cid, protein, ki="", ic50="", kd="", ec50="", pmid="", aid="", patent=""): + values = ["" for _ in range(46)] + values[8] = ki + values[9] = ic50 + values[10] = kd + values[11] = ec50 + values[19] = pmid + values[20] = aid + values[21] = patent + values[31] = pubchem_cid + values[44] = protein + return values + + rows = [ + header, + row("111", "P11111", ki="100", pmid="12345", aid="7001", patent="PAT-1"), + row("111", "P11111", ki="10", pmid="23456", aid="7002", patent="PAT-1"), + row("111", "P11111", ki="0", pmid="34567", aid="7003"), + row("111", "P11111", ic50="200", pmid="12345", aid="7001"), + row("222", "P22222", ec50="50", pmid="34567", aid="8001", patent="PAT-2"), + row("", "P99999", ki="25", pmid="99999", aid="9999", patent="PAT-X"), + ["malformed", "row"], + ] + + tsv_content = "\n".join("\t".join(row_values) for row_values in rows) + "\n" + with ZipFile(path, "w") as zip_file: + zip_file.writestr("BindingDB_All.tsv", tsv_content) + + +def 
test_croissant_resolver_hgnc_fixture(): + resolver = CroissantResolver.from_path(str(TEST_RESOURCE_DIR / "hgnc_croissant.json")) + assert resolver.dataset_id == "hgnc" + assert resolver.dataset_version == "2026-03-06" + + distribution = resolver.get_distribution("hgnc/hgnc_complete_set_tsv") + assert distribution.content_url == "https://example.org/hgnc_complete_set.txt" + + column_map = resolver.get_field_column_map("hgnc/hgnc_complete_set") + assert column_map["hgnc_id"] == "hgnc_id" + assert column_map["gene_group_id"] == "gene_group_id" + + +def test_load_parser_spec_hgnc_fixture(): + spec = load_parser_spec(str(TEST_RESOURCE_DIR / "parser.yaml")) + assert spec.source_id == "HGNC" + assert spec.provenance_id == "infores:hgnc" + assert spec.source.record_set == "hgnc/hgnc_complete_set" + assert spec.source.croissant_path == str(TEST_RESOURCE_DIR / "hgnc_croissant.json") + + +def test_metadata_driven_loader_emits_expected_hgnc_graph(tmp_path): + source_root = tmp_path / "source_root" + source_dir = source_root / "source" + source_dir.mkdir(parents=True) + shutil.copyfile(TEST_RESOURCE_DIR / "hgnc_complete_set.txt", source_dir / "hgnc_complete_set.txt") + + loader = HGNCTestMetadataLoader( + parser_spec_path=str(TEST_RESOURCE_DIR / "parser.yaml"), + source_data_dir=str(source_root), + ) + + nodes_path = tmp_path / "nodes.jsonl" + edges_path = tmp_path / "edges.jsonl" + metadata = loader.load(str(nodes_path), str(edges_path)) + + assert metadata["num_source_lines"] == 3 + assert metadata["unusable_source_lines"] == 1 + assert metadata["source_nodes"] == 5 + assert metadata["source_edges"] == 3 + + nodes = _read_jsonl(nodes_path) + edges = _read_jsonl(edges_path) + + node_ids = {node["id"] for node in nodes} + assert node_ids == {"HGNC:1", "HGNC:3", "HGNC.FAMILY:5", "HGNC.FAMILY:7", "HGNC.FAMILY:9"} + + gene_node = next(node for node in nodes if node["id"] == "HGNC:1") + assert gene_node["name"] == "alpha-1-B glycoprotein" + assert gene_node["category"] == 
["biolink:Gene"] + assert gene_node["symbol"] == "A1BG" + + family_node = next(node for node in nodes if node["id"] == "HGNC.FAMILY:5") + assert family_node["name"] == "Signal family" + assert family_node["category"] == ["biolink:GeneFamily"] + + assert {edge["predicate"] for edge in edges} == {"RO:0002350"} + assert {edge["subject"] for edge in edges} == {"HGNC:1", "HGNC:3"} + assert {edge["object"] for edge in edges} == {"HGNC.FAMILY:5", "HGNC.FAMILY:7", "HGNC.FAMILY:9"} + + publication_edge = next(edge for edge in edges if edge["object"] == "HGNC.FAMILY:5") + assert publication_edge["primary_knowledge_source"] == "infores:hgnc" + assert publication_edge["publications"] == ["PMID:12345", "PMID:23456"] + + +def test_metadata_driven_loader_supports_fileset_in_zip(tmp_path): + zipped_source_root = tmp_path / "zipped_source_root" + source_dir = zipped_source_root / "source" + source_dir.mkdir(parents=True) + + archive_path = source_dir / "hgnc_bundle.zip" + with ZipFile(archive_path, "w") as zip_file: + zip_file.write(TEST_RESOURCE_DIR / "hgnc_complete_set.txt", arcname="exports/hgnc_complete_set.txt") + + croissant_path = tmp_path / "hgnc_zip_croissant.json" + croissant_path.write_text( + json.dumps( + { + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "sc": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + }, + "@type": "https://schema.org/Dataset", + "@id": "hgnc", + "version": "2026-03-06", + "distribution": [ + { + "@type": "http://mlcommons.org/croissant/FileObject", + "@id": "hgnc/hgnc_zip", + "contentUrl": "https://example.org/hgnc_bundle.zip", + "encodingFormat": "application/zip", + }, + { + "@type": "http://mlcommons.org/croissant/FileSet", + "@id": "hgnc/hgnc_complete_set_fileset", + "containedIn": [{"@id": "hgnc/hgnc_zip"}], + "includes": "*hgnc_complete_set.txt", + "encodingFormat": "text/tab-separated-values", + }, + ], + "recordSet": [ + { + "@type": "http://mlcommons.org/croissant/RecordSet", + "@id": 
"hgnc/hgnc_complete_set", + "name": "hgnc_complete_set", + "field": [ + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "field/hgnc_id", + "name": "hgnc_id", + "source": { + "fileSet": {"@id": "hgnc/hgnc_complete_set_fileset"}, + "extract": {"column": "hgnc_id"}, + }, + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "field/name", + "name": "name", + "source": { + "fileSet": {"@id": "hgnc/hgnc_complete_set_fileset"}, + "extract": {"column": "name"}, + }, + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "field/symbol", + "name": "symbol", + "source": { + "fileSet": {"@id": "hgnc/hgnc_complete_set_fileset"}, + "extract": {"column": "symbol"}, + }, + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "field/locus_group", + "name": "locus_group", + "source": { + "fileSet": {"@id": "hgnc/hgnc_complete_set_fileset"}, + "extract": {"column": "locus_group"}, + }, + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "field/location", + "name": "location", + "source": { + "fileSet": {"@id": "hgnc/hgnc_complete_set_fileset"}, + "extract": {"column": "location"}, + }, + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "field/gene_group_id", + "name": "gene_group_id", + "source": { + "fileSet": {"@id": "hgnc/hgnc_complete_set_fileset"}, + "extract": {"column": "gene_group_id"}, + }, + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "field/gene_group", + "name": "gene_group", + "source": { + "fileSet": {"@id": "hgnc/hgnc_complete_set_fileset"}, + "extract": {"column": "gene_group"}, + }, + }, + { + "@type": "http://mlcommons.org/croissant/Field", + "@id": "field/pubmed_id", + "name": "pubmed_id", + "source": { + "fileSet": {"@id": "hgnc/hgnc_complete_set_fileset"}, + "extract": {"column": "pubmed_id"}, + }, + }, + ], + } + ], + } + ) + ) + + parser_spec_path = tmp_path / "parser.yaml" + parser_spec_path.write_text( + yaml.safe_dump( + { + "source_id": "HGNC", 
+ "provenance_id": "infores:hgnc", + "parsing_version": "3.0", + "from": { + "croissant": str(croissant_path), + "dataset_id": "hgnc", + "version_from": "dataset.version", + "distribution": "hgnc/hgnc_complete_set_fileset", + "record_set": "hgnc/hgnc_complete_set", + "format": "tsv", + "delimiter": "\t", + "member_pattern": "*hgnc_complete_set.txt", + }, + "fields": { + "gene_id": {"column": "hgnc_id", "kind": "identifier"}, + "gene_name": {"column": "name", "kind": "label"}, + "families": { + "kind": "zipped_list", + "separator": "|", + "columns": { + "id": {"column": "gene_group_id", "kind": "identifier", "prefix": "HGNC.FAMILY"}, + "name": {"column": "gene_group", "kind": "label"}, + }, + }, + }, + "where": [{"exists": "families"}], + "views": { + "gene_family_memberships": { + "from": "source", + "unnest": {"field": "families", "as": "family"}, + "select": { + "gene_id": "$gene_id", + "family_id": "$family.id", + }, + } + }, + "graph": { + "nodes": [ + { + "from": "source", + "id": "$gene_id", + "name": "$gene_name", + "category": "biolink:Gene", + } + ], + "edges": [ + { + "from": "gene_family_memberships", + "subject": "$gene_id", + "predicate": "RO:0002350", + "object": "$family_id", + } + ], + }, + } + ) + ) + + loader = HGNCTestMetadataLoader( + parser_spec_path=str(parser_spec_path), + source_data_dir=str(zipped_source_root), + ) + + nodes_path = tmp_path / "zip_nodes.jsonl" + edges_path = tmp_path / "zip_edges.jsonl" + metadata = loader.load(str(nodes_path), str(edges_path)) + + assert metadata["num_source_lines"] == 3 + assert metadata["unusable_source_lines"] == 1 + assert metadata["source_nodes"] == 2 + assert metadata["source_edges"] == 3 + + +def test_load_parser_spec_bindingdb_fixture(): + spec = load_parser_spec(str(BINDINGDB_RESOURCE_DIR / "parser.yaml")) + assert spec.source_id == "BINDING-DB" + assert "aggregated_measurements" in spec.views + assert spec.source.distribution == "bindingdb/all_tsv_fileset" + + +def 
test_metadata_driven_loader_bindingdb_aggregation(tmp_path): + source_root = tmp_path / "bindingdb_source_root" + source_dir = source_root / "source" + source_dir.mkdir(parents=True) + + archive_path = source_dir / "BindingDB_All_202603_tsv.zip" + with ZipFile(archive_path, "w") as zip_file: + zip_file.write(BINDINGDB_RESOURCE_DIR / "BindingDB_All.tsv", arcname="BindingDB_All.tsv") + + loader = BindingDBTestMetadataLoader( + parser_spec_path=str(BINDINGDB_RESOURCE_DIR / "parser.yaml"), + source_data_dir=str(source_root), + ) + + nodes_path = tmp_path / "bindingdb_nodes.jsonl" + edges_path = tmp_path / "bindingdb_edges.jsonl" + metadata = loader.load(str(nodes_path), str(edges_path)) + + assert metadata["num_source_lines"] == 5 + assert metadata["unusable_source_lines"] == 1 + assert metadata["source_nodes"] == 4 + assert metadata["source_edges"] == 3 + + nodes = _read_jsonl(nodes_path) + edges = _read_jsonl(edges_path) + + assert {node["id"] for node in nodes} == { + "PUBCHEM.COMPOUND:111", + "UniProtKB:P11111", + "PUBCHEM.COMPOUND:222", + "UniProtKB:P22222", + } + + edge_lookup = { + (edge["subject"], edge["predicate"], edge["object"]): edge + for edge in edges + } + + pki_edge = edge_lookup[("PUBCHEM.COMPOUND:111", "biolink:inhibits", "UniProtKB:P11111")] + assert pki_edge["affinity_parameter"] == "pKi" + assert pki_edge["average_affinity_nm"] == 55.0 + assert pki_edge["affinity"] == 7.26 + assert pki_edge["supporting_affinities"] == [100.0, 10.0] + assert pki_edge["publications"] == ["PMID:12345", "PMID:23456"] + assert pki_edge["pubchem_assay_ids"] == ["PUBCHEM.AID:7001", "PUBCHEM.AID:7002"] + assert pki_edge["patent_ids"] == ["PATENT:PAT-1"] + assert pki_edge["primary_knowledge_source"] == "infores:bindingdb" + + pic50_edge = edge_lookup[("PUBCHEM.COMPOUND:111", "CTD:decreases_activity_of", "UniProtKB:P11111")] + assert pic50_edge["affinity_parameter"] == "pIC50" + assert pic50_edge["average_affinity_nm"] == 200.0 + assert pic50_edge["affinity"] == 6.7 + + 
pec50_edge = edge_lookup[("PUBCHEM.COMPOUND:222", "CTD:increases_activity_of", "UniProtKB:P22222")] + assert pec50_edge["affinity_parameter"] == "pEC50" + assert pec50_edge["average_affinity_nm"] == 50.0 + assert pec50_edge["affinity"] == 7.3 + + +def test_hgnc_croissant_loader_matches_legacy_loader(tmp_path): + source_root = tmp_path / "hgnc_parity_source" + source_dir = source_root / "source" + source_dir.mkdir(parents=True) + shutil.copyfile(TEST_RESOURCE_DIR / "hgnc_complete_set.txt", source_dir / "hgnc_complete_set.txt") + + legacy_nodes = tmp_path / "legacy_hgnc_nodes.jsonl" + legacy_edges = tmp_path / "legacy_hgnc_edges.jsonl" + legacy_loader = HGNCLoader(source_data_dir=str(source_root)) + legacy_metadata = legacy_loader.load(str(legacy_nodes), str(legacy_edges)) + + croissant_nodes = tmp_path / "croissant_hgnc_nodes.jsonl" + croissant_edges = tmp_path / "croissant_hgnc_edges.jsonl" + croissant_loader = HGNCCroissantLoader(source_data_dir=str(source_root)) + croissant_metadata = croissant_loader.load(str(croissant_nodes), str(croissant_edges)) + + assert legacy_metadata == croissant_metadata + assert legacy_nodes.read_text() == croissant_nodes.read_text() + assert legacy_edges.read_text() == croissant_edges.read_text() + + +def test_bindingdb_croissant_loader_matches_legacy_loader_on_compatible_fixture(tmp_path): + source_root = tmp_path / "bindingdb_parity_source" + source_dir = source_root / "source" + source_dir.mkdir(parents=True) + archive_path = source_dir / "BindingDB_All_202603_tsv.zip" + _bindingdb_legacy_archive(archive_path) + + original_get_latest = BINDINGDBLoader.get_latest_source_version + BINDINGDBLoader.get_latest_source_version = lambda self: "202603" + try: + legacy_nodes = tmp_path / "legacy_binding_nodes.jsonl" + legacy_edges = tmp_path / "legacy_binding_edges.jsonl" + legacy_loader = BINDINGDBLoader(source_data_dir=str(source_root)) + legacy_metadata = legacy_loader.load(str(legacy_nodes), str(legacy_edges)) + finally: + 
BINDINGDBLoader.get_latest_source_version = original_get_latest + + croissant_nodes = tmp_path / "croissant_binding_nodes.jsonl" + croissant_edges = tmp_path / "croissant_binding_edges.jsonl" + croissant_loader = BINDINGDBCroissantLoader(source_data_dir=str(source_root)) + croissant_metadata = croissant_loader.load(str(croissant_nodes), str(croissant_edges)) + + assert legacy_metadata == croissant_metadata + assert legacy_nodes.read_text() == croissant_nodes.read_text() + assert legacy_edges.read_text() == croissant_edges.read_text() + + +def test_metadata_driven_loader_can_preserve_empty_string_properties(tmp_path): + source_root = tmp_path / "hgnc_preserve_empty" + source_dir = source_root / "source" + source_dir.mkdir(parents=True) + (source_dir / "hgnc_complete_set.txt").write_text( + "hgnc_id\tsymbol\tname\tlocus_group\tlocation\tgene_group_id\tgene_group\tpubmed_id\n" + "HGNC:1\tGENE1\tGene 1\tprotein-coding gene\t\t5\tFamily 5\t\n" + ) + + parser_spec_path = tmp_path / "parser.yaml" + parser_spec_path.write_text( + yaml.safe_dump( + { + "source_id": "HGNC", + "provenance_id": "infores:hgnc", + "parsing_version": "1.0", + "from": { + "croissant": str(TEST_RESOURCE_DIR / "hgnc_croissant.json"), + "dataset_id": "hgnc", + "distribution": "hgnc/hgnc_complete_set_tsv", + "record_set": "hgnc/hgnc_complete_set", + "format": "tsv", + "delimiter": "\t", + }, + "fields": { + "gene_id": {"column": "hgnc_id", "kind": "identifier"}, + "gene_name": {"column": "name", "kind": "label"}, + "location": {"column": "location", "kind": "property", "preserve_empty": True}, + }, + "graph": { + "nodes": [ + { + "from": "source", + "id": "$gene_id", + "name": "$gene_name", + "props": { + "location": {"value": "$location", "preserve_empty": True}, + }, + } + ], + "edges": [], + }, + }, + sort_keys=False, + ) + ) + + loader = HGNCTestMetadataLoader( + source_data_dir=str(source_root), + parser_spec_path=str(parser_spec_path), + ) + nodes_path = tmp_path / "nodes.jsonl" + edges_path = 
tmp_path / "edges.jsonl" + loader.load(str(nodes_path), str(edges_path)) + + nodes = _sorted_records(nodes_path) + assert nodes == [ + { + "category": ["biolink:NamedThing"], + "id": "HGNC:1", + "location": "", + "name": "Gene 1", + } + ]