|
| 1 | +""" |
| 2 | +CADSR CDE Import Engine |
| 3 | +
|
| 4 | +This ingests the output of the caDSR API https://cadsrapi.cancer.gov/rad/NCIAPI/1.0/api |
| 5 | +""" |
| 6 | +import logging |
| 7 | +import urllib |
| 8 | +from typing import Union, Dict, Tuple, List, Any, Optional, Iterable |
| 9 | + |
| 10 | +from dataclasses import dataclass |
| 11 | + |
| 12 | +from linkml.utils.schema_builder import SchemaBuilder |
| 13 | +from linkml_runtime.linkml_model import Annotation |
| 14 | +from linkml_runtime.linkml_model.meta import SchemaDefinition, SlotDefinition, EnumDefinition, \ |
| 15 | + PermissibleValue, UniqueKey, ClassDefinition |
| 16 | +from linkml_runtime.loaders import json_loader |
| 17 | +from linkml_runtime.utils.formatutils import camelcase, underscore |
| 18 | + |
| 19 | +from schema_automator.importers.import_engine import ImportEngine |
| 20 | +import schema_automator.metamodels.cadsr as cadsr |
| 21 | + |
| 22 | + |
# Lookup table from caDSR value-domain datatype names to LinkML type names.
# Entries are grouped by the LinkML type they map to.
TMAP = {
    # --- temporal types ---
    "DATE": "date",
    "java.util.Date": "date",
    "DATE/TIME": "datetime",
    "TIME": "time",
    # --- numeric types ---
    "NUMBER": "float",
    "java.lang.Double": "float",
    "Floating-point": "float",
    "Numeric Alpha DVG": "float",
    "Integer": "integer",
    "java.lang.Integer": "integer",
    # --- everything represented as plain text ---
    "ALPHANUMERIC": "string",
    "CHARACTER": "string",
    "HL7EDv3": "string",
    "HL7CDv3": "string",
    "SAS Date": "string",
}
| 40 | + |
@dataclass
class CADSRImportEngine(ImportEngine):
    """
    An ImportEngine that imports NCI CADSR CDEs

    Ingests the output of `caDSR API <https://cadsrapi.cancer.gov/rad/NCIAPI/1.0/api>`_.

    - Each CDE becomes a unique slot
    - the CDE is added as a slot of a context-specific class
    - the context-specific class is a subclass of the CDE's DataElementConcept

    Note that this creates a lot of 1-1 classes, as in many cases there is no
    attempt to group concepts. However, this is not always the case.

    E.g. the concept with publicId 2012668 (Access Route) is used in 5 contexts
    (AHRQ, CCR, ...)

    Each context-specific concept has its own set of CDEs

    See also https://github.com/monarch-initiative/cde-harmonization
    """

    def convert(self, paths: Iterable[str], id: Optional[str] = None, name: Optional[str] = None,
                **kwargs) -> SchemaDefinition:
        """
        Converts one or more CDE JSON files into LinkML

        For every file, the contained DataElement is turned into a slot, a
        context-independent parent class and a context-specific child class
        are created for its DataElementConcept (if not already present), and
        the slot is attached to the child class. When the value domain has
        permissible values, an enum is created and used as the slot range;
        otherwise the range is looked up in ``TMAP`` (falling back to
        ``string``). Reference documents are stored as slot annotations.

        :param paths: paths of caDSR DataElement JSON files; a single path
            (``str`` or ``os.PathLike``) is also accepted
        :param id: schema id; when omitted the SchemaBuilder's own id is kept
        :param name: schema name; when omitted the SchemaBuilder's own name is kept
        :param kwargs: accepted for interface compatibility; not used here
        :return: schema holding the generated classes, slots and enums
        """
        # Function-scope imports: the module only does `import urllib`, which
        # does not guarantee that the `urllib.parse` submodule is loaded.
        import os
        from urllib.parse import quote

        # A lone path would otherwise be iterated character by character.
        if isinstance(paths, (str, os.PathLike)):
            paths = [paths]

        sb = SchemaBuilder()
        schema = sb.schema
        if id:
            schema.id = id
        # The original fallback `name = package.name` referenced an undefined
        # name and raised NameError whenever `name` was omitted.
        if name:
            schema.name = name

        classes = {}
        slots = {}
        enums = {}
        for path in paths:
            logging.info("Loading %s", path)
            with open(path, encoding="utf-8") as file:
                container: cadsr.DataElementContainer
                container = json_loader.load(file, target_class=cadsr.DataElementContainer)
            cde = container.DataElement
            ctxt = cde.context

            # --- slot for the CDE itself ---
            slot = SlotDefinition(
                name=quote(underscore(f"{ctxt} {cde.preferredName}")),
                slot_uri=f"cadsr:{cde.publicId}",
                title=cde.preferredName,
                description=cde.preferredDefinition,
                aliases=[cde.longName],
                source=f"cadsr:{quote(ctxt)}",
            )
            slots[slot.name] = slot

            # --- classes for the DataElementConcept ---
            concept = cde.DataElementConcept
            concept_name = quote(camelcase(f"{ctxt} {concept.preferredName}"))
            parent_concept_name = quote(camelcase(concept.longName))
            if parent_concept_name not in classes:
                classes[parent_concept_name] = ClassDefinition(
                    name=parent_concept_name,
                    title=concept.preferredName,
                    description=concept.preferredDefinition,
                    class_uri=f"cadsr:{concept.publicId}",
                )
            if concept_name not in classes:
                classes[concept_name] = ClassDefinition(
                    name=concept_name,
                    title=f"{concept.preferredName} ({ctxt})",
                    description=concept.preferredDefinition,
                    aliases=[concept.longName],
                    class_uri=f"cadsr:{concept.publicId}",
                    is_a=parent_concept_name,
                )
            classes[concept_name].slots.append(slot.name)
            # TODO: make use of concept.ObjectClass and ValueDomain.ConceptualDomain

            # --- range: enum when permissible values exist, else a mapped type ---
            value_domain = cde.ValueDomain
            pvs = value_domain.PermissibleValues
            if pvs:
                enum_name = quote(camelcase(value_domain.preferredName))
                enum_def = EnumDefinition(
                    name=enum_name,
                    title=value_domain.preferredName,
                    description=value_domain.preferredDefinition,
                    aliases=[value_domain.longName],
                )
                enums[enum_name] = enum_def
                rng = enum_name
                for pv in pvs:
                    # url encode the value to escape symbols like <, >, etc.
                    tgt_pv = PermissibleValue(
                        text=quote(pv.value),
                        title=pv.value,
                        description=pv.valueDescription,
                    )
                    enum_def.permissible_values[tgt_pv.text] = tgt_pv
                    vm = pv.ValueMeaning
                    # The value meaning's preferred name replaces the raw value as title.
                    tgt_pv.title = vm.preferredName
                    if not tgt_pv.description:
                        tgt_pv.description = vm.preferredDefinition
                    # With several concepts, the last concept code wins.
                    for c in vm.Concepts:
                        code = c.conceptCode.strip()
                        tgt_pv.meaning = f"NCIT:{code}"
            else:
                rng = TMAP.get(value_domain.dataType, "string")
            slot.range = rng

            # --- reference documents become annotations keyed by document type ---
            for rd in cde.ReferenceDocuments:
                ann = Annotation(
                    tag=quote(underscore(rd.type)),
                    value=rd.description,
                )
                slot.annotations[ann.tag] = ann

        sb.add_prefix("NCIT", "http://purl.obolibrary.org/obo/NCIT_")
        sb.add_prefix("cadsr", "http://example.org/cadsr/")
        sb.add_defaults()
        # NOTE(review): this runs before the collected classes are attached
        # below, so it only touches classes already present on the builder's
        # schema - confirm whether from_schema was meant for the imported classes.
        for c in schema.classes.values():
            c.from_schema = 'http://example.org/'
        schema = sb.schema
        schema.classes = classes
        schema.slots = slots
        schema.enums = enums
        return schema
| 181 | + |
| 182 | + |
| 183 | + |
| 184 | + |
0 commit comments