Skip to content

Commit 6186a49

Browse files
authored
cadsr importer (#132)
* Adding caDSR CDE ingester * docs * Adding models * pytestified test_rdfs_importer. Fixed rdfs import bug
1 parent e9bbd52 commit 6186a49

16 files changed

+13302
-286
lines changed

poetry.lock

Lines changed: 334 additions & 275 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ packages = [
1212

1313
[tool.poetry.dependencies]
1414
python = "^3.9"
15-
linkml = ">=1.6.7"
15+
linkml = "^1.7.4"
1616
mkdocs = ">=1.2.3"
1717
pandas = ">=1.3.5"
1818
python-dateutil = ">=2.8.2"
@@ -30,6 +30,7 @@ inflect = ">=6.0.0"
3030
schemasheets = ">=0.1.24"
3131
xmltodict = "^0.13.0"
3232
click-default-group = "^1.2.4"
33+
linkml-runtime = "^1.7.2"
3334

3435

3536
[tool.poetry.dev-dependencies]

schema_automator/cli.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
"""
66
import logging
77
import os
8+
from pathlib import Path
9+
810
import click
911

1012

@@ -20,6 +22,7 @@
2022
from schema_automator.generalizers.csv_data_generalizer import CsvDataGeneralizer
2123
from schema_automator.generalizers.generalizer import DEFAULT_CLASS_NAME, DEFAULT_SCHEMA_NAME
2224
from schema_automator.generalizers.pandas_generalizer import PandasDataGeneralizer
25+
from schema_automator.importers.cadsr_import_engine import CADSRImportEngine
2326
from schema_automator.importers.dosdp_import_engine import DOSDPImportEngine
2427
from schema_automator.generalizers.json_instance_generalizer import JsonDataGeneralizer
2528
from schema_automator.importers.jsonschema_import_engine import JsonSchemaImportEngine
@@ -387,6 +390,27 @@ def import_frictionless(input, output, schema_name, schema_id, **kwargs):
387390
write_schema(schema, output)
388391

389392

393+
@main.command()
@output_option
@schema_name_option
@schema_id_option
@click.argument('input')
def import_cadsr(input, output, schema_name, schema_id, **kwargs):
    """
    Imports from CADSR CDE JSON API output to LinkML

    See :ref:`importers` for more on the importer framework

    INPUT is a glob pattern (relative or absolute) selecting the CDE JSON
    files to import. Quote it so that the shell does not expand it.

    Example:

        schemauto import-cadsr "cdes/*.json"
    """
    ie = CADSRImportEngine()
    # Path.glob() only accepts relative patterns, so an absolute pattern is
    # globbed relative to its own anchor (filesystem root or drive).
    pattern = Path(input)
    if pattern.is_absolute():
        base = Path(pattern.anchor)
        relative_pattern = str(pattern.relative_to(pattern.anchor))
    else:
        base = Path()
        relative_pattern = input
    # Sorted so the generated schema does not depend on directory listing order.
    paths = sorted(
        str(gf.absolute()) for gf in base.glob(relative_pattern) if gf.is_file()
    )
    if not paths:
        logging.warning(f"No files match {input}; the generated schema will be empty")
    schema = ie.convert(paths, name=schema_name, id=schema_id)
    write_schema(schema, output)
412+
413+
390414
@main.command()
391415
@click.argument('owlfile')
392416
@output_option
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
"""
2+
CADSR CDE Import Engine
3+
4+
This ingests the output of the caDSR API https://cadsrapi.cancer.gov/rad/NCIAPI/1.0/api
5+
"""
6+
import logging
7+
import urllib
8+
from typing import Union, Dict, Tuple, List, Any, Optional, Iterable
9+
10+
from dataclasses import dataclass
11+
12+
from linkml.utils.schema_builder import SchemaBuilder
13+
from linkml_runtime.linkml_model import Annotation
14+
from linkml_runtime.linkml_model.meta import SchemaDefinition, SlotDefinition, EnumDefinition, \
15+
PermissibleValue, UniqueKey, ClassDefinition
16+
from linkml_runtime.loaders import json_loader
17+
from linkml_runtime.utils.formatutils import camelcase, underscore
18+
19+
from schema_automator.importers.import_engine import ImportEngine
20+
import schema_automator.metamodels.cadsr as cadsr
21+
22+
23+
# Maps caDSR value-domain ``dataType`` strings to LinkML type names.
# Entries are grouped by the LinkML type they map to.
TMAP = {
    # dates and times
    "DATE": "date",
    "java.util.Date": "date",
    "DATE/TIME": "datetime",
    "TIME": "time",
    # numbers
    "NUMBER": "float",
    "java.lang.Double": "float",
    "Floating-point": "float",
    "Numeric Alpha DVG": "float",
    "Integer": "integer",
    "java.lang.Integer": "integer",
    # text
    "ALPHANUMERIC": "string",
    "CHARACTER": "string",
    "HL7EDv3": "string",
    "HL7CDv3": "string",
    "SAS Date": "string",
}
40+
41+
@dataclass
class CADSRImportEngine(ImportEngine):
    """
    An ImportEngine that imports NCI CADSR CDEs

    Ingests the output of `caDSR API <https://cadsrapi.cancer.gov/rad/NCIAPI/1.0/api>`_.

    - Each CDE becomes a unique slot
    - the CDE is added as a slot of a context-specific class
    - the context-specific class is a subclass of the CDE's DataElementConcept

    Note that this creates a lot of 1-1 classes, as in many cases there is no
    attempt to group concepts. However, this is not always the case.

    E.g. the concept with publicId 2012668 (Access Route) is used in 5 contexts
    (AHRQ, CCR, ...)

    Each context-specific concept has its own set of CDEs

    See also https://github.com/monarch-initiative/cde-harmonization
    """

    def convert(
        self,
        paths: Iterable[str],
        id: Optional[str] = None,
        name: Optional[str] = None,
        **kwargs,
    ) -> SchemaDefinition:
        """
        Converts one or more CDE JSON files into LinkML

        :param paths: paths of the JSON files to load; each is parsed as a
            single ``cadsr.DataElementContainer``
        :param id: schema id; when omitted the SchemaBuilder's own id is kept
        :param name: schema name; when omitted the SchemaBuilder's own name is kept
        :param kwargs: accepted for interface compatibility; not used
        :return: schema holding the generated classes, slots and enums
        """
        # Imported here because a bare ``import urllib`` does not guarantee
        # that the ``urllib.parse`` submodule has been loaded.
        from urllib.parse import quote

        sb = SchemaBuilder()
        schema = sb.schema
        if id:
            schema.id = id
        if name:
            schema.name = name
        classes = {}
        slots = {}
        enums = {}
        for path in paths:
            logging.info(f"Loading {path}")
            with open(path, encoding="utf-8") as file:
                container: cadsr.DataElementContainer
                container = json_loader.load(file, target_class=cadsr.DataElementContainer)
            cde = container.DataElement
            ctxt = cde.context

            # One slot per CDE; the name is prefixed with the context.
            slot = SlotDefinition(
                name=quote(underscore(f"{ctxt} {cde.preferredName}")),
                slot_uri=f"cadsr:{cde.publicId}",
                title=cde.preferredName,
                description=cde.preferredDefinition,
                aliases=[cde.longName],
                source=f"cadsr:{quote(ctxt)}",
            )
            slots[slot.name] = slot

            # The context-free concept class is the parent of the
            # context-specific class that carries the slot.
            concept = cde.DataElementConcept
            concept_name = quote(camelcase(f"{ctxt} {concept.preferredName}"))
            parent_concept_name = quote(camelcase(concept.longName))
            if parent_concept_name not in classes:
                classes[parent_concept_name] = ClassDefinition(
                    name=parent_concept_name,
                    title=concept.preferredName,
                    description=concept.preferredDefinition,
                    class_uri=f"cadsr:{concept.publicId}",
                )
            if concept_name not in classes:
                classes[concept_name] = ClassDefinition(
                    name=concept_name,
                    title=f"{concept.preferredName} ({ctxt})",
                    description=concept.preferredDefinition,
                    aliases=[concept.longName],
                    class_uri=f"cadsr:{concept.publicId}",
                    is_a=parent_concept_name,
                )
            classes[concept_name].slots.append(slot.name)
            # TODO: make use of concept.ObjectClass

            # The slot range is an enum when the value domain lists
            # permissible values, otherwise a type looked up from dataType.
            value_domain = cde.ValueDomain
            pvs = value_domain.PermissibleValues
            if pvs:
                enum_name = quote(camelcase(value_domain.preferredName))
                enum = EnumDefinition(
                    name=enum_name,
                    title=value_domain.preferredName,
                    description=value_domain.preferredDefinition,
                    aliases=[value_domain.longName],
                )
                enums[enum_name] = enum
                rng = enum_name
                for pv in pvs:
                    # url encode the value to escape symbols like <, >, etc.
                    tgt_pv = PermissibleValue(
                        text=quote(pv.value),
                        title=pv.value,
                        description=pv.valueDescription,
                    )
                    enum.permissible_values[tgt_pv.text] = tgt_pv
                    vm = pv.ValueMeaning
                    tgt_pv.title = vm.preferredName
                    if not tgt_pv.description:
                        tgt_pv.description = vm.preferredDefinition
                    # With several concepts, the last one determines the meaning.
                    for c in vm.Concepts:
                        code = c.conceptCode.strip()
                        tgt_pv.meaning = f"NCIT:{code}"
            else:
                rng = TMAP.get(value_domain.dataType, "string")
            slot.range = rng

            # Reference documents become slot annotations keyed by their type.
            for rd in cde.ReferenceDocuments:
                ann = Annotation(
                    tag=quote(underscore(rd.type)),
                    value=rd.description,
                )
                slot.annotations[ann.tag] = ann

        sb.add_prefix("NCIT", "http://purl.obolibrary.org/obo/NCIT_")
        sb.add_prefix("cadsr", "http://example.org/cadsr/")
        sb.add_defaults()
        schema.classes = classes
        schema.slots = slots
        schema.enums = enums
        return schema
181+
182+
183+
184+

schema_automator/importers/rdfs_import_engine.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from collections import defaultdict
66

77
from linkml.utils.schema_builder import SchemaBuilder
8+
from linkml_runtime import SchemaView
89
from linkml_runtime.linkml_model import (
910
SchemaDefinition,
1011
SlotDefinition,
@@ -50,9 +51,12 @@ class RdfsImportEngine(ImportEngine):
5051
reverse_metamodel_mappings: Dict[URIRef, List[str]] = None
5152
include_unmapped_annotations = False
5253
metamodel = None
54+
metamodel_schemaview: SchemaView = None
55+
classdef_slots: List[str] = None
5356

5457
def __post_init__(self):
5558
sv = package_schemaview("linkml_runtime.linkml_model.meta")
59+
self.metamodel_schemaview = sv
5660
self.metamodel = sv
5761
self.metamodel_mappings = defaultdict(list)
5862
self.reverse_metamodel_mappings = defaultdict(list)
@@ -73,6 +77,7 @@ def __post_init__(self):
7377
mappings.append(uri)
7478
self.reverse_metamodel_mappings[uri].append(e.name)
7579
self.metamodel_mappings[e.name] = mappings
80+
self.defclass_slots = [s.name for s in sv.class_induced_slots(ClassDefinition.class_name)]
7681

7782
def convert(
7883
self,
@@ -180,6 +185,8 @@ def _dict_for_subject(self, g: Graph, s: URIRef) -> Dict[str, Any]:
180185
if pp == RDF.type:
181186
continue
182187
metaslot_name = self._element_from_iri(pp)
188+
if metaslot_name not in self.defclass_slots:
189+
continue
183190
if metaslot_name is None:
184191
logging.warning(f"Not mapping {pp}")
185192
continue

0 commit comments

Comments
 (0)