Include our modification of scispacy in ext #9

Merged: 11 commits, May 16, 2023
conf/bronco.yaml: 3 changes (3 additions & 0 deletions)
@@ -3,16 +3,19 @@ name : bronco
dict:
icd10gm:
custom:
+ # Get from: https://www.bfarm.de/DE/Kodiersysteme/Services/Downloads/_node.html
icd10gm_path: ${oc.env:HOME}/temp/icd10gm2017syst_kodes.txt
lang:
- de
ops:
custom:
+ # Get from: https://www.bfarm.de/DE/Kodiersysteme/Services/Downloads/_node.html
ops_path: ${oc.env:HOME}/temp/ops2017syst_kodes.txt
lang:
- de
atc:
custom:
+ # Get from: https://www.wido.de/publikationen-produkte/arzneimittel-klassifikation/
atc_path: ${oc.env:HOME}/temp/ATC GKV-AI_2022.xlsm
lang:
- de
conf/mugit.yaml: 1 change (1 addition & 0 deletions)
@@ -5,4 +5,5 @@ dict:
lang:
- de
- en
+ # Get from: https://user.medunigraz.at/stefan.schulz/mugit/
mugit_path: ${oc.env:HOME}/temp/SCT-GIT_de_large.dat
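
The `${oc.env:HOME}` placeholders in these configs use OmegaConf's built-in `oc.env` resolver, so the dictionary paths resolve against the current user's home directory when the config is loaded. A minimal sketch of how such a value resolves (illustrative only, not part of this PR; assumes `omegaconf` is installed):

from omegaconf import OmegaConf

# Hypothetical snippet mirroring the icd10gm_path entry above.
cfg = OmegaConf.create({"icd10gm_path": "${oc.env:HOME}/temp/icd10gm2017syst_kodes.txt"})

# The oc.env resolver is evaluated when the config is resolved.
print(OmegaConf.to_container(cfg, resolve=True)["icd10gm_path"])
# e.g. /home/alice/temp/icd10gm2017syst_kodes.txt
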
xmen/data/dataloaders.py: 3 changes (1 addition & 2 deletions)
@@ -1,6 +1,5 @@
from typing import List, Union
from pathlib import Path
- from scispacy import umls_utils
import datasets


@@ -145,4 +144,4 @@ def _load_bigbio_dataset(
output = datasets.dataset_dict.DatasetDict()
for s in splits:
output[s] = datasets.concatenate_datasets([d[s] for d in ds])
- return output
+ return output
xmen/ext/scispacy/umls_utils.py: 190 changes (190 additions & 0 deletions)
@@ -0,0 +1,190 @@
"""
This script is a modified version of a script originally licensed under the Apache 2.0 license.

Original script: https://github.com/allenai/scispacy/blob/main/scispacy/umls_utils.py
Original authors: Neumann, Mark and King, Daniel and Beltagy, Iz and Ammar, Waleed

Modifications: added two new arguments `lang` and `non_suppressed` to `read_umls_concepts`
(with their respective default values) which allow the function to filter the terms by language
and decide whether to keep only non-suppressed concepts or not.
"""

from typing import Optional, List, Dict

# TODO(Mark): Remove in scispacy v1.0, for backward compatability only.
from scispacy.linking_utils import Entity as UmlsEntity, UmlsKnowledgeBase # noqa

# preferred definition sources (from S2)
DEF_SOURCES_PREFERRED = {"NCI_BRIDG", "NCI_NCI-GLOSS", "NCI", "GO", "MSH", "NCI_FDA"}


def read_umls_file_headers(meta_path: str, filename: str) -> List[str]:
"""
Read the file descriptor MRFILES.RRF from a UMLS release and get column headers (names)
for the given file

MRFILES.RRF file format: a pipe-separated values
Useful columns:
column 0: name of one of the files in the META directory
column 2: column names of that file

Args:
meta_path: path to the META directory of an UMLS release
filename: name of the file to get its column headers
Returns:
a list of column names
"""
file_descriptors = f"{meta_path}/MRFILES.RRF" # to get column names
with open(file_descriptors, encoding="utf-8") as fin:
for line in fin:
splits = line.split("|")
found_filename = splits[0]
column_names = (splits[2] + ",").split(
","
) # ugly hack because all files end with an empty column
if found_filename in filename:
return column_names
assert False, f"Couldn't find column names for file {filename}"
return None


def read_umls_concepts(
meta_path: str,
concept_details: Dict,
source: Optional[str] = None,
lang: str = "ENG",
non_suppressed: bool = True,
):
"""
Read the concepts file MRCONSO.RRF from a UMLS release and store it in
concept_details dictionary. Each concept is represented with
- concept_id
- canonical_name
- aliases
- types
- definition
This function fills the first three. If a canonical name is not found, it is left empty.

MRFILES.RRF file format: a pipe-separated values
Useful columns: CUI, LAT, SUPPRESS, STR, ISPREF, TS, STT

Args:
meta_path: path to the META directory of an UMLS release
concept_details: a dictionary to be filled with concept informations
source: An optional source identifier, used as a filter to extract only a
specific source from UMLS.
lang: An optional language identifier, used to filter terms by language
non_suppressed: flag to indicate whether only non-suppressed concepts should be kept
"""
concepts_filename = "MRCONSO.RRF"
headers = read_umls_file_headers(meta_path, concepts_filename)
with open(f"{meta_path}/{concepts_filename}", encoding="utf-8") as fin:
for line in fin:
splits = line.strip().split("|")
assert len(headers) == len(splits), (headers, splits)
concept = dict(zip(headers, splits))
if (lang is not None and concept["LAT"] != lang) or (
non_suppressed and concept["SUPPRESS"] != "N"
):
continue # Keep non-suppressed concepts in target language only

if source is not None:
if concept["SAB"] != source:
continue

concept_id = concept["CUI"]
if concept_id not in concept_details: # a new concept
# add it to the dictionary with an empty list of aliases and types
concept_details[concept_id] = {
"concept_id": concept_id,
"aliases": [],
"types": [],
}

concept_name = concept["STR"]
# this condition is copied from S2. It checks if the concept name is canonical or not
is_canonical = (
concept["ISPREF"] == "Y"
and concept["TS"] == "P"
and concept["STT"] == "PF"
)

if not is_canonical or "canonical_name" in concept_details[concept_id]:
# not a canonical name or a canonical name already found
concept_details[concept_id]["aliases"].append(
concept_name
) # add it as an alias
else:
concept_details[concept_id][
"canonical_name"
] = concept_name # set as canonical name


def read_umls_types(meta_path: str, concept_details: Dict):
"""
Read the types file MRSTY.RRF from a UMLS release and store it in
concept_details dictionary. This function adds the `types` field
to the information of each concept

MRSTY.RRF file format: a pipe-separated values
Useful columns: CUI, TUI

Args:
meta_path: path to the META directory of an UMLS release
concept_details: a dictionary to be filled with concept informations
"""
types_filename = "MRSTY.RRF"
headers = read_umls_file_headers(meta_path, types_filename)
with open(f"{meta_path}/{types_filename}", encoding="utf-8") as fin:
for line in fin:
splits = line.strip().split("|")
assert len(headers) == len(splits)
concept_type = dict(zip(headers, splits))

concept = concept_details.get(concept_type["CUI"])
if (
concept is not None
): # a small number of types are for concepts that don't exist
concept["types"].append(concept_type["TUI"])


def read_umls_definitions(meta_path: str, concept_details: Dict):
"""
Read the types file MRDEF.RRF from a UMLS release and store it in
concept_details dictionary. This function adds the `definition` field
to the information of each concept

MRDEF.RRF file format: a pipe-separated values
Useful columns: CUI, SAB, SUPPRESS, DEF

Args:
meta_path: path to the META directory of an UMLS release
concept_details: a dictionary to be filled with concept informations
"""
definitions_filename = "MRDEF.RRF"
headers = read_umls_file_headers(meta_path, definitions_filename)
with open(f"{meta_path}/{definitions_filename}", encoding="utf-8") as fin:
headers = read_umls_file_headers(meta_path, definitions_filename)
for line in fin:
splits = line.strip().split("|")
assert len(headers) == len(splits)
definition = dict(zip(headers, splits))

if definition["SUPPRESS"] != "N":
continue
is_from_preferred_source = definition["SAB"] in DEF_SOURCES_PREFERRED
concept = concept_details.get(definition["CUI"])
if (
concept is None
): # a small number of definitions are for concepts that don't exist
continue

if (
"definition" not in concept
or is_from_preferred_source
and concept["is_from_preferred_source"] == "N"
):
concept["definition"] = definition["DEF"]
concept["is_from_preferred_source"] = (
"Y" if is_from_preferred_source else "N"
)
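
For reviewers, a minimal usage sketch of the relocated reader with the new arguments (illustrative only, not part of this PR; the META path is a hypothetical placeholder for a local UMLS installation, and "GER" is the UMLS language code for German):

from xmen.ext.scispacy.umls_utils import (
    read_umls_concepts,
    read_umls_definitions,
    read_umls_types,
)

meta_path = "/path/to/UMLS/META"  # hypothetical local UMLS META directory
concept_details = {}

# Keep only German, non-suppressed atoms; pass source="..." to restrict to a single vocabulary (SAB).
read_umls_concepts(meta_path, concept_details, source=None, lang="GER", non_suppressed=True)
read_umls_types(meta_path, concept_details)
read_umls_definitions(meta_path, concept_details)

print(f"{len(concept_details)} concepts loaded")
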
xmen/preprocessing/retired_cuis.py: 3 changes (1 addition & 2 deletions)
@@ -1,5 +1,4 @@
- from scispacy import umls_utils
-
+ from xmen.ext.scispacy.umls_utils import read_umls_file_headers

class CUIReplacer:
"""
xmen/umls.py: 12 changes (6 additions & 6 deletions)
@@ -1,4 +1,4 @@
- from scispacy import umls_utils
+ from xmen.ext.scispacy.umls_utils import read_umls_file_headers, read_umls_concepts, read_umls_types, read_umls_definitions
from scispacy import umls_semantic_type_tree
from scispacy.linking_utils import DEFAULT_UMLS_TYPES_PATH
from langcodes import Language
@@ -25,7 +25,7 @@ def read_umls_sabs(meta_path):
"""
res = []
sab_filename = "MRSAB.RRF"
- headers = umls_utils.read_umls_file_headers(meta_path, sab_filename)
+ headers = read_umls_file_headers(meta_path, sab_filename)
with open(f"{meta_path}/{sab_filename}") as fin:
for line in fin:
splits = line.strip().split("|")
@@ -119,7 +119,7 @@ def get_alias_count(concept_details):
return sum([len(c["aliases"]) + 1 for c in concept_details.values()])


- def _expand_tuis(tuis, sem_type_tree):
+ def expand_tuis(tuis, sem_type_tree):
"""
Recursively expands a list of UMLS semantic type abbreviations to include their child semantic types,
using the specified semantic type tree.
@@ -218,12 +218,12 @@ def get_umls_concepts(
f'>> Reading concepts from {"all sources" if not source else source} and {"all languages" if not langs else f"languages: {langs}"}'
)
for lang in langs if langs else [None]:
- umls_utils.read_umls_concepts(
+ read_umls_concepts(
meta_path, concept_details, source=source, lang=lang, non_suppressed=non_suppressed_only
)

logger.info(">> Reading types ... ")
- umls_utils.read_umls_types(meta_path, concept_details)
+ read_umls_types(meta_path, concept_details)

if semantic_groups:
logger.info(f"> Number of concepts before semantic group filtering: {len(concept_details)}")
@@ -234,7 +234,7 @@
concept_details = filter_semantic_types(semantic_types, expand_semantic_types, concept_details)

logger.info(">> Reading definitions ... ")
- umls_utils.read_umls_definitions(meta_path, concept_details)
+ read_umls_definitions(meta_path, concept_details)

logger.info(f"> Number of concepts before de-duplication: {len(concept_details)}")
logger.info(f"> Number of aliases before de-duplication: {get_alias_count(concept_details)}")
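
The renamed `expand_tuis` helper recursively adds all descendant semantic types to a list of TUIs. A toy sketch of that expansion idea (illustrative only; it uses a plain dict in place of scispacy's semantic type tree and is not the function's real implementation):

# Tiny stand-in for the UMLS semantic type tree (in the real network,
# T048 and T191 are children of T047, Disease or Syndrome).
TOY_TREE = {
    "T047": ["T048", "T191"],
    "T048": [],
    "T191": [],
}


def expand_tuis_toy(tuis, tree):
    """Recursively include all descendant TUIs of the given types."""
    expanded = set(tuis)
    for tui in tuis:
        expanded |= expand_tuis_toy(tree.get(tui, []), tree)
    return expanded


print(sorted(expand_tuis_toy(["T047"], TOY_TREE)))
# ['T047', 'T048', 'T191']
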