From 0af4222006e9cbf899c285608612215f5d586dba Mon Sep 17 00:00:00 2001 From: nachollorca Date: Tue, 16 May 2023 12:50:49 +0200 Subject: [PATCH] Include our modification of scispacy in ext (#9) * Integrate BRONCO and MUGIT * Remake mugit.py * Remove nb and imports * Include our scispacy fork in ext and refactor * update dependencies * Revert changes from wrong branch * Include download links bronco/mugit files * Import from ext.scispacy only in umls.py * Remove bigbio dependency in dataloaders.py * Fix custom scispacy imports --------- Co-authored-by: illorca Co-authored-by: illorca --- conf/bronco.yaml | 3 + conf/mugit.yaml | 1 + xmen/data/dataloaders.py | 3 +- xmen/ext/scispacy/umls_utils.py | 190 +++++++++++++++++++++++++++++ xmen/preprocessing/retired_cuis.py | 3 +- xmen/umls.py | 12 +- 6 files changed, 202 insertions(+), 10 deletions(-) create mode 100644 xmen/ext/scispacy/umls_utils.py diff --git a/conf/bronco.yaml b/conf/bronco.yaml index 0f3ccf6..db2ed49 100644 --- a/conf/bronco.yaml +++ b/conf/bronco.yaml @@ -3,16 +3,19 @@ name : bronco dict: icd10gm: custom: + # Get from: https://www.bfarm.de/DE/Kodiersysteme/Services/Downloads/_node.html icd10gm_path: ${oc.env:HOME}/temp/icd10gm2017syst_kodes.txt lang: - de ops: custom: + # Get from: https://www.bfarm.de/DE/Kodiersysteme/Services/Downloads/_node.html ops_path: ${oc.env:HOME}/temp/ops2017syst_kodes.txt lang: - de atc: custom: + # Get from: https://www.wido.de/publikationen-produkte/arzneimittel-klassifikation/ atc_path: ${oc.env:HOME}/temp/ATC GKV-AI_2022.xlsm lang: - de \ No newline at end of file diff --git a/conf/mugit.yaml b/conf/mugit.yaml index 941d210..d1f437d 100644 --- a/conf/mugit.yaml +++ b/conf/mugit.yaml @@ -5,4 +5,5 @@ dict: lang: - de - en + # Get from: https://user.medunigraz.at/stefan.schulz/mugit/ mugit_path: ${oc.env:HOME}/temp/SCT-GIT_de_large.dat diff --git a/xmen/data/dataloaders.py b/xmen/data/dataloaders.py index 0e4f4ce..1c8eda1 100644 --- a/xmen/data/dataloaders.py +++ 
"""
This script is a modified version of a script originally licensed under the Apache 2.0 license.

Original script: https://github.com/allenai/scispacy/blob/main/scispacy/umls_utils.py
Original authors: Neumann, Mark and King, Daniel and Beltagy, Iz and Ammar, Waleed

Modifications: added two new arguments `lang` and `non_suppressed` to `read_umls_concepts`
(with their respective default values) which allow the function to filter the terms by language
and decide whether to keep only non-suppressed concepts or not.
"""

from typing import Dict, List, Optional

# TODO(Mark): Remove in scispacy v1.0, for backward compatibility only.
# NOTE(review): guarded so that this vendored module stays importable when scispacy
# is not installed; the re-exported names exist only when scispacy is present.
try:
    from scispacy.linking_utils import Entity as UmlsEntity, UmlsKnowledgeBase  # noqa
except ImportError:
    pass

# Preferred definition sources (from S2).
DEF_SOURCES_PREFERRED = {"NCI_BRIDG", "NCI_NCI-GLOSS", "NCI", "GO", "MSH", "NCI_FDA"}


def read_umls_file_headers(meta_path: str, filename: str) -> List[str]:
    """
    Read the file descriptor MRFILES.RRF from a UMLS release and get column headers (names)
    for the given file.

    MRFILES.RRF file format: pipe-separated values.
    Useful columns:
        column 0: name of one of the files in the META directory
        column 2: column names of that file

    Args:
        meta_path: path to the META directory of a UMLS release
        filename: name of the file to get its column headers
    Returns:
        a list of column names (the last entry is always '' because every row
        in the RRF files ends with a trailing pipe)
    Raises:
        AssertionError: if `filename` is not listed in MRFILES.RRF
    """
    file_descriptors = f"{meta_path}/MRFILES.RRF"  # to get column names
    with open(file_descriptors, encoding="utf-8") as fin:
        for line in fin:
            splits = line.split("|")
            found_filename = splits[0]
            # Append a sentinel comma because all files end with an empty column.
            column_names = (splits[2] + ",").split(",")
            # NOTE(review): upstream scispacy uses substring matching here; all
            # callers pass exact file names, so this behaves like equality.
            if found_filename in filename:
                return column_names
    # BUG FIX: the failure message previously did not interpolate the file name
    # (the f-string contained no placeholder).
    assert False, f"Couldn't find column names for file: {filename}"
    return None


def read_umls_concepts(
    meta_path: str,
    concept_details: Dict,
    source: Optional[str] = None,
    lang: str = "ENG",
    non_suppressed: bool = True,
):
    """
    Read the concepts file MRCONSO.RRF from a UMLS release and store it in
    the `concept_details` dictionary. Each concept is represented with
    - concept_id
    - canonical_name
    - aliases
    - types
    - definition
    This function fills the first three. If a canonical name is not found, it is left empty.

    MRCONSO.RRF file format: pipe-separated values.
    Useful columns: CUI, LAT, SUPPRESS, STR, ISPREF, TS, STT

    Args:
        meta_path: path to the META directory of a UMLS release
        concept_details: a dictionary to be filled with concept information
        source: An optional source identifier, used as a filter to extract only a
                specific source from UMLS.
        lang: An optional language identifier, used to filter terms by language
        non_suppressed: flag to indicate whether only non-suppressed concepts should be kept
    """
    concepts_filename = "MRCONSO.RRF"
    headers = read_umls_file_headers(meta_path, concepts_filename)
    with open(f"{meta_path}/{concepts_filename}", encoding="utf-8") as fin:
        for line in fin:
            splits = line.strip().split("|")
            assert len(headers) == len(splits), (headers, splits)
            concept = dict(zip(headers, splits))
            if (lang is not None and concept["LAT"] != lang) or (
                non_suppressed and concept["SUPPRESS"] != "N"
            ):
                continue  # Keep non-suppressed concepts in target language only

            if source is not None:
                if concept["SAB"] != source:
                    continue

            concept_id = concept["CUI"]
            if concept_id not in concept_details:  # a new concept
                # add it to the dictionary with an empty list of aliases and types
                concept_details[concept_id] = {
                    "concept_id": concept_id,
                    "aliases": [],
                    "types": [],
                }

            concept_name = concept["STR"]
            # this condition is copied from S2. It checks if the concept name is canonical or not
            is_canonical = (
                concept["ISPREF"] == "Y"
                and concept["TS"] == "P"
                and concept["STT"] == "PF"
            )

            if not is_canonical or "canonical_name" in concept_details[concept_id]:
                # not a canonical name or a canonical name already found
                concept_details[concept_id]["aliases"].append(
                    concept_name
                )  # add it as an alias
            else:
                concept_details[concept_id][
                    "canonical_name"
                ] = concept_name  # set as canonical name


def read_umls_types(meta_path: str, concept_details: Dict):
    """
    Read the types file MRSTY.RRF from a UMLS release and store it in
    the `concept_details` dictionary. This function adds the `types` field
    to the information of each concept.

    MRSTY.RRF file format: pipe-separated values.
    Useful columns: CUI, TUI

    Args:
        meta_path: path to the META directory of a UMLS release
        concept_details: a dictionary to be filled with concept information
    """
    types_filename = "MRSTY.RRF"
    headers = read_umls_file_headers(meta_path, types_filename)
    with open(f"{meta_path}/{types_filename}", encoding="utf-8") as fin:
        for line in fin:
            splits = line.strip().split("|")
            assert len(headers) == len(splits)
            concept_type = dict(zip(headers, splits))

            concept = concept_details.get(concept_type["CUI"])
            if (
                concept is not None
            ):  # a small number of types are for concepts that don't exist
                concept["types"].append(concept_type["TUI"])


def read_umls_definitions(meta_path: str, concept_details: Dict):
    """
    Read the definitions file MRDEF.RRF from a UMLS release and store it in
    the `concept_details` dictionary. This function adds the `definition` field
    to the information of each concept, preferring definitions that come from
    one of the sources in `DEF_SOURCES_PREFERRED`.

    MRDEF.RRF file format: pipe-separated values.
    Useful columns: CUI, SAB, SUPPRESS, DEF

    Args:
        meta_path: path to the META directory of a UMLS release
        concept_details: a dictionary to be filled with concept information
    """
    definitions_filename = "MRDEF.RRF"
    # BUG FIX: the headers were previously read twice (a second, redundant call
    # re-parsed MRFILES.RRF after opening MRDEF.RRF); read them once.
    headers = read_umls_file_headers(meta_path, definitions_filename)
    with open(f"{meta_path}/{definitions_filename}", encoding="utf-8") as fin:
        for line in fin:
            splits = line.strip().split("|")
            assert len(headers) == len(splits)
            definition = dict(zip(headers, splits))

            if definition["SUPPRESS"] != "N":
                continue
            is_from_preferred_source = definition["SAB"] in DEF_SOURCES_PREFERRED
            concept = concept_details.get(definition["CUI"])
            if (
                concept is None
            ):  # a small number of definitions are for concepts that don't exist
                continue

            # Overwrite an existing definition only when the new one comes from a
            # preferred source and the stored one does not.
            if (
                "definition" not in concept
                or is_from_preferred_source
                and concept["is_from_preferred_source"] == "N"
            ):
                concept["definition"] = definition["DEF"]
                concept["is_from_preferred_source"] = (
                    "Y" if is_from_preferred_source else "N"
                )
langcodes import Language @@ -25,7 +25,7 @@ def read_umls_sabs(meta_path): """ res = [] sab_filename = "MRSAB.RRF" - headers = umls_utils.read_umls_file_headers(meta_path, sab_filename) + headers = read_umls_file_headers(meta_path, sab_filename) with open(f"{meta_path}/{sab_filename}") as fin: for line in fin: splits = line.strip().split("|") @@ -119,7 +119,7 @@ def get_alias_count(concept_details): return sum([len(c["aliases"]) + 1 for c in concept_details.values()]) -def _expand_tuis(tuis, sem_type_tree): +def expand_tuis(tuis, sem_type_tree): """ Recursively expands a list of UMLS semantic type abbreviations to include their child semantic types, using the specified semantic type tree. @@ -218,12 +218,12 @@ def get_umls_concepts( f'>> Reading concepts from {"all sources" if not source else source} and {"all languages" if not langs else f"languages: {langs}"}' ) for lang in langs if langs else [None]: - umls_utils.read_umls_concepts( + read_umls_concepts( meta_path, concept_details, source=source, lang=lang, non_suppressed=non_suppressed_only ) logger.info(">> Reading types ... ") - umls_utils.read_umls_types(meta_path, concept_details) + read_umls_types(meta_path, concept_details) if semantic_groups: logger.info(f"> Number of concepts before semantic group filtering: {len(concept_details)}") @@ -234,7 +234,7 @@ def get_umls_concepts( concept_details = filter_semantic_types(semantic_types, expand_semantic_types, concept_details) logger.info(">> Reading definitions ... ") - umls_utils.read_umls_definitions(meta_path, concept_details) + read_umls_definitions(meta_path, concept_details) logger.info(f"> Number of concepts before de-duplication: {len(concept_details)}") logger.info(f"> Number of aliases before de-duplication: {get_alias_count(concept_details)}")