Include our modification of scispacy in ext (#9)
* Integrate BRONCO and MUGIT

* Remake mugit.py

* Remove nb and imports

* Include our scispacy fork in ext and refactor

* Update dependencies

* Revert changes from wrong branch

* Include download links for bronco/mugit files

* Import from ext.scispacy only in umls.py

* Remove bigbio dependency in dataloaders.py

* Fix custom scispacy imports

---------

Co-authored-by: illorca <[email protected]>
3 people authored May 16, 2023
1 parent ff2520a commit 0af4222
Showing 6 changed files with 202 additions and 10 deletions.
3 changes: 3 additions & 0 deletions conf/bronco.yaml
@@ -3,16 +3,19 @@ name : bronco
dict:
icd10gm:
custom:
# Get from: https://www.bfarm.de/DE/Kodiersysteme/Services/Downloads/_node.html
icd10gm_path: ${oc.env:HOME}/temp/icd10gm2017syst_kodes.txt
lang:
- de
ops:
custom:
# Get from: https://www.bfarm.de/DE/Kodiersysteme/Services/Downloads/_node.html
ops_path: ${oc.env:HOME}/temp/ops2017syst_kodes.txt
lang:
- de
atc:
custom:
# Get from: https://www.wido.de/publikationen-produkte/arzneimittel-klassifikation/
atc_path: ${oc.env:HOME}/temp/ATC GKV-AI_2022.xlsm
lang:
- de
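
For reference, here is a minimal sketch (not part of this commit) of how a config such as conf/bronco.yaml could be loaded. The ${oc.env:HOME} syntax is OmegaConf's env resolver, so the sketch assumes the project reads these files with OmegaConf (or Hydra).

# Minimal sketch, assuming the config is consumed via OmegaConf.
from omegaconf import OmegaConf

cfg = OmegaConf.load("conf/bronco.yaml")

# Dictionary-style access; the ${oc.env:HOME} interpolation is resolved when the value is read.
icd10gm_path = cfg["dict"]["icd10gm"]["custom"]["icd10gm_path"]
print(icd10gm_path)  # e.g. /home/<user>/temp/icd10gm2017syst_kodes.txt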
1 change: 1 addition & 0 deletions conf/mugit.yaml
@@ -5,4 +5,5 @@ dict:
lang:
- de
- en
# Get from: https://user.medunigraz.at/stefan.schulz/mugit/
mugit_path: ${oc.env:HOME}/temp/SCT-GIT_de_large.dat
3 changes: 1 addition & 2 deletions xmen/data/dataloaders.py
@@ -1,6 +1,5 @@
from typing import List, Union
from pathlib import Path
from scispacy import umls_utils
import datasets


@@ -145,4 +144,4 @@ def _load_bigbio_dataset(
output = datasets.dataset_dict.DatasetDict()
for s in splits:
output[s] = datasets.concatenate_datasets([d[s] for d in ds])
return output
return output
190 changes: 190 additions & 0 deletions xmen/ext/scispacy/umls_utils.py
@@ -0,0 +1,190 @@
"""
This script is a modified version of a script originally licensed under the Apache 2.0 license.
Original script: https://github.com/allenai/scispacy/blob/main/scispacy/umls_utils.py
Original authors: Neumann, Mark and King, Daniel and Beltagy, Iz and Ammar, Waleed
Modifications: added two new arguments, `lang` and `non_suppressed` (each with a default value),
to `read_umls_concepts`; they let the function filter terms by language and decide whether to
keep only non-suppressed concepts.
"""

from typing import Optional, List, Dict

# TODO(Mark): Remove in scispacy v1.0, for backward compatibility only.
from scispacy.linking_utils import Entity as UmlsEntity, UmlsKnowledgeBase # noqa

# preferred definition sources (from S2)
DEF_SOURCES_PREFERRED = {"NCI_BRIDG", "NCI_NCI-GLOSS", "NCI", "GO", "MSH", "NCI_FDA"}


def read_umls_file_headers(meta_path: str, filename: str) -> List[str]:
"""
Read the file descriptor MRFILES.RRF from a UMLS release and get the column headers (names)
for the given file.
MRFILES.RRF file format: pipe-separated values.
Useful columns:
column 0: name of one of the files in the META directory
column 2: column names of that file
Args:
meta_path: path to the META directory of an UMLS release
filename: name of the file whose column headers should be returned
Returns:
a list of column names
"""
file_descriptors = f"{meta_path}/MRFILES.RRF" # to get column names
with open(file_descriptors, encoding="utf-8") as fin:
for line in fin:
splits = line.split("|")
found_filename = splits[0]
column_names = (splits[2] + ",").split(
","
) # ugly hack because all files end with an empty column
if found_filename in filename:
return column_names
assert False, f"Couldn't find column names for file {filename}"
return None


def read_umls_concepts(
meta_path: str,
concept_details: Dict,
source: Optional[str] = None,
lang: str = "ENG",
non_suppressed: bool = True,
):
"""
Read the concepts file MRCONSO.RRF from a UMLS release and store it in
the concept_details dictionary. Each concept is represented with
- concept_id
- canonical_name
- aliases
- types
- definition
This function fills the first three. If a canonical name is not found, it is left empty.
MRCONSO.RRF file format: pipe-separated values.
Useful columns: CUI, LAT, SUPPRESS, STR, ISPREF, TS, STT
Args:
meta_path: path to the META directory of an UMLS release
concept_details: a dictionary to be filled with concept information
source: An optional source identifier, used as a filter to extract only a
specific source from UMLS.
lang: a UMLS language code (LAT column, e.g. "ENG" or "GER") used to filter terms by language;
pass None to keep all languages
non_suppressed: flag indicating whether only non-suppressed concepts should be kept
"""
concepts_filename = "MRCONSO.RRF"
headers = read_umls_file_headers(meta_path, concepts_filename)
with open(f"{meta_path}/{concepts_filename}", encoding="utf-8") as fin:
for line in fin:
splits = line.strip().split("|")
assert len(headers) == len(splits), (headers, splits)
concept = dict(zip(headers, splits))
if (lang is not None and concept["LAT"] != lang) or (
non_suppressed and concept["SUPPRESS"] != "N"
):
continue # Keep non-suppressed concepts in target language only

if source is not None:
if concept["SAB"] != source:
continue

concept_id = concept["CUI"]
if concept_id not in concept_details: # a new concept
# add it to the dictionary with an empty list of aliases and types
concept_details[concept_id] = {
"concept_id": concept_id,
"aliases": [],
"types": [],
}

concept_name = concept["STR"]
# this condition is copied from S2. It checks if the concept name is canonical or not
is_canonical = (
concept["ISPREF"] == "Y"
and concept["TS"] == "P"
and concept["STT"] == "PF"
)

if not is_canonical or "canonical_name" in concept_details[concept_id]:
# not a canonical name or a canonical name already found
concept_details[concept_id]["aliases"].append(
concept_name
) # add it as an alias
else:
concept_details[concept_id][
"canonical_name"
] = concept_name # set as canonical name


def read_umls_types(meta_path: str, concept_details: Dict):
"""
Read the types file MRSTY.RRF from a UMLS release and store it in
the concept_details dictionary. This function adds the `types` field
to the information of each concept.
MRSTY.RRF file format: pipe-separated values.
Useful columns: CUI, TUI
Args:
meta_path: path to the META directory of an UMLS release
concept_details: a dictionary to be filled with concept information
"""
types_filename = "MRSTY.RRF"
headers = read_umls_file_headers(meta_path, types_filename)
with open(f"{meta_path}/{types_filename}", encoding="utf-8") as fin:
for line in fin:
splits = line.strip().split("|")
assert len(headers) == len(splits)
concept_type = dict(zip(headers, splits))

concept = concept_details.get(concept_type["CUI"])
if (
concept is not None
): # a small number of types are for concepts that don't exist
concept["types"].append(concept_type["TUI"])


def read_umls_definitions(meta_path: str, concept_details: Dict):
"""
Read the definitions file MRDEF.RRF from a UMLS release and store it in
the concept_details dictionary. This function adds the `definition` field
to the information of each concept.
MRDEF.RRF file format: pipe-separated values.
Useful columns: CUI, SAB, SUPPRESS, DEF
Args:
meta_path: path to the META directory of an UMLS release
concept_details: a dictionary to be filled with concept information
"""
definitions_filename = "MRDEF.RRF"
headers = read_umls_file_headers(meta_path, definitions_filename)
with open(f"{meta_path}/{definitions_filename}", encoding="utf-8") as fin:
for line in fin:
splits = line.strip().split("|")
assert len(headers) == len(splits)
definition = dict(zip(headers, splits))

if definition["SUPPRESS"] != "N":
continue
is_from_preferred_source = definition["SAB"] in DEF_SOURCES_PREFERRED
concept = concept_details.get(definition["CUI"])
if (
concept is None
): # a small number of definitions are for concepts that don't exist
continue

if (
"definition" not in concept
or is_from_preferred_source
and concept["is_from_preferred_source"] == "N"
):
concept["definition"] = definition["DEF"]
concept["is_from_preferred_source"] = (
"Y" if is_from_preferred_source else "N"
)
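
For reference, a short usage sketch (not part of the diff) of the forked helpers; the META path is a placeholder and "GER" is the UMLS LAT code for German.

# Usage sketch of the forked module; the META path below is hypothetical.
from xmen.ext.scispacy.umls_utils import read_umls_concepts, read_umls_types, read_umls_definitions

concept_details = {}
meta_path = "/path/to/UMLS/META"  # placeholder for a local UMLS release

# The two new arguments: keep only German ("GER"), non-suppressed concepts.
read_umls_concepts(meta_path, concept_details, lang="GER", non_suppressed=True)
read_umls_types(meta_path, concept_details)
read_umls_definitions(meta_path, concept_details)

print(f"Loaded {len(concept_details)} concepts")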
3 changes: 1 addition & 2 deletions xmen/preprocessing/retired_cuis.py
@@ -1,5 +1,4 @@
from scispacy import umls_utils

from xmen.ext.scispacy.umls_utils import read_umls_file_headers

class CUIReplacer:
"""
12 changes: 6 additions & 6 deletions xmen/umls.py
@@ -1,4 +1,4 @@
from scispacy import umls_utils
from xmen.ext.scispacy.umls_utils import read_umls_file_headers, read_umls_concepts, read_umls_types, read_umls_definitions
from scispacy import umls_semantic_type_tree
from scispacy.linking_utils import DEFAULT_UMLS_TYPES_PATH
from langcodes import Language
@@ -25,7 +25,7 @@ def read_umls_sabs(meta_path):
"""
res = []
sab_filename = "MRSAB.RRF"
headers = umls_utils.read_umls_file_headers(meta_path, sab_filename)
headers = read_umls_file_headers(meta_path, sab_filename)
with open(f"{meta_path}/{sab_filename}") as fin:
for line in fin:
splits = line.strip().split("|")
@@ -119,7 +119,7 @@ def get_alias_count(concept_details):
return sum([len(c["aliases"]) + 1 for c in concept_details.values()])


def _expand_tuis(tuis, sem_type_tree):
def expand_tuis(tuis, sem_type_tree):
"""
Recursively expands a list of UMLS semantic type abbreviations to include their child semantic types,
using the specified semantic type tree.
@@ -218,12 +218,12 @@ def get_umls_concepts(
f'>> Reading concepts from {"all sources" if not source else source} and {"all languages" if not langs else f"languages: {langs}"}'
)
for lang in langs if langs else [None]:
umls_utils.read_umls_concepts(
read_umls_concepts(
meta_path, concept_details, source=source, lang=lang, non_suppressed=non_suppressed_only
)

logger.info(">> Reading types ... ")
umls_utils.read_umls_types(meta_path, concept_details)
read_umls_types(meta_path, concept_details)

if semantic_groups:
logger.info(f"> Number of concepts before semantic group filtering: {len(concept_details)}")
@@ -234,7 +234,7 @@ def get_umls_concepts(
concept_details = filter_semantic_types(semantic_types, expand_semantic_types, concept_details)

logger.info(">> Reading definitions ... ")
umls_utils.read_umls_definitions(meta_path, concept_details)
read_umls_definitions(meta_path, concept_details)

logger.info(f"> Number of concepts before de-duplication: {len(concept_details)}")
logger.info(f"> Number of aliases before de-duplication: {get_alias_count(concept_details)}")
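
As context for the _expand_tuis to expand_tuis rename, a hedged sketch (not part of the commit) of how the now-public helper might be exercised; the construct_umls_tree_from_tsv call and the example type T047 (Disease or Syndrome) are assumptions, not taken from the diff.

# Hypothetical driver for the renamed helper; assumes scispacy's semantic type tree utilities.
from scispacy import umls_semantic_type_tree
from scispacy.linking_utils import DEFAULT_UMLS_TYPES_PATH
from xmen.umls import expand_tuis

sem_type_tree = umls_semantic_type_tree.construct_umls_tree_from_tsv(DEFAULT_UMLS_TYPES_PATH)

# Expand T047 ("Disease or Syndrome") with its child semantic types.
expanded = expand_tuis(["T047"], sem_type_tree)
print(expanded)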
