From 0af4222006e9cbf899c285608612215f5d586dba Mon Sep 17 00:00:00 2001 From: nachollorca Date: Tue, 16 May 2023 12:50:49 +0200 Subject: [PATCH] Include our modification of scispacy in ext (#9) * Integrate BRONCO and MUGIT * Remake mugit.py * Remove nb and imports * Include our scispacy fork in ext and refactor * update dependencies * Revert changes from wrong branch * Include download links bronco/mugit files * Import from ext.scispacy only in umls.py * Remove bigbio dependency in dataloaders.py * Fix custom scispacy imports --------- Co-authored-by: illorca Co-authored-by: illorca --- conf/bronco.yaml | 3 + conf/mugit.yaml | 1 + xmen/data/dataloaders.py | 3 +- xmen/ext/scispacy/umls_utils.py | 190 +++++++++++++++++++++++++++++ xmen/preprocessing/retired_cuis.py | 3 +- xmen/umls.py | 12 +- 6 files changed, 202 insertions(+), 10 deletions(-) create mode 100644 xmen/ext/scispacy/umls_utils.py diff --git a/conf/bronco.yaml b/conf/bronco.yaml index 0f3ccf6..db2ed49 100644 --- a/conf/bronco.yaml +++ b/conf/bronco.yaml @@ -3,16 +3,19 @@ name : bronco dict: icd10gm: custom: + # Get from: https://www.bfarm.de/DE/Kodiersysteme/Services/Downloads/_node.html icd10gm_path: ${oc.env:HOME}/temp/icd10gm2017syst_kodes.txt lang: - de ops: custom: + # Get from: https://www.bfarm.de/DE/Kodiersysteme/Services/Downloads/_node.html ops_path: ${oc.env:HOME}/temp/ops2017syst_kodes.txt lang: - de atc: custom: + # Get from: https://www.wido.de/publikationen-produkte/arzneimittel-klassifikation/ atc_path: ${oc.env:HOME}/temp/ATC GKV-AI_2022.xlsm lang: - de \ No newline at end of file diff --git a/conf/mugit.yaml b/conf/mugit.yaml index 941d210..d1f437d 100644 --- a/conf/mugit.yaml +++ b/conf/mugit.yaml @@ -5,4 +5,5 @@ dict: lang: - de - en + # Get from: https://user.medunigraz.at/stefan.schulz/mugit/ mugit_path: ${oc.env:HOME}/temp/SCT-GIT_de_large.dat diff --git a/xmen/data/dataloaders.py b/xmen/data/dataloaders.py index 0e4f4ce..1c8eda1 100644 --- a/xmen/data/dataloaders.py +++ 
"""
This script is a modified version of a script originally licensed under the Apache 2.0 license.

Original script: https://github.com/allenai/scispacy/blob/main/scispacy/umls_utils.py
Original authors: Neumann, Mark and King, Daniel and Beltagy, Iz and Ammar, Waleed

Modifications: added two new arguments `lang` and `non_suppressed` to `read_umls_concepts`
(with their respective default values) which allow the function to filter the terms by language
and decide whether to keep only non-suppressed concepts or not.
"""

from typing import Dict, List, Optional

# TODO(Mark): Remove in scispacy v1.0, for backward compatibility only.
# NOTE(review): guarded so that this vendored module stays importable when scispacy
# is not installed; the re-exported names exist only when scispacy is present.
try:
    from scispacy.linking_utils import Entity as UmlsEntity, UmlsKnowledgeBase  # noqa
except ImportError:
    pass

# Preferred definition sources (from S2).
DEF_SOURCES_PREFERRED = {"NCI_BRIDG", "NCI_NCI-GLOSS", "NCI", "GO", "MSH", "NCI_FDA"}


def read_umls_file_headers(meta_path: str, filename: str) -> List[str]:
    """
    Read the file descriptor MRFILES.RRF from a UMLS release and get column headers (names)
    for the given file.

    MRFILES.RRF file format: pipe-separated values.
    Useful columns:
        column 0: name of one of the files in the META directory
        column 2: column names of that file

    Args:
        meta_path: path to the META directory of a UMLS release
        filename: name of the file to get its column headers
    Returns:
        a list of column names (the last entry is always '' because every row
        in the RRF files ends with a trailing pipe)
    Raises:
        AssertionError: if `filename` is not listed in MRFILES.RRF
    """
    file_descriptors = f"{meta_path}/MRFILES.RRF"  # to get column names
    with open(file_descriptors, encoding="utf-8") as fin:
        for line in fin:
            splits = line.split("|")
            found_filename = splits[0]
            # Append a sentinel comma because all files end with an empty column.
            column_names = (splits[2] + ",").split(",")
            # NOTE(review): upstream scispacy uses substring matching here; all
            # callers pass exact file names, so this behaves like equality.
            if found_filename in filename:
                return column_names
    # BUG FIX: the failure message previously did not interpolate the file name
    # (the f-string contained no placeholder).
    assert False, f"Couldn't find column names for file: {filename}"
    return None


def read_umls_concepts(
    meta_path: str,
    concept_details: Dict,
    source: Optional[str] = None,
    lang: str = "ENG",
    non_suppressed: bool = True,
):
    """
    Read the concepts file MRCONSO.RRF from a UMLS release and store it in
    the `concept_details` dictionary. Each concept is represented with
    - concept_id
    - canonical_name
    - aliases
    - types
    - definition
    This function fills the first three. If a canonical name is not found, it is left empty.

    MRCONSO.RRF file format: pipe-separated values.
    Useful columns: CUI, LAT, SUPPRESS, STR, ISPREF, TS, STT

    Args:
        meta_path: path to the META directory of a UMLS release
        concept_details: a dictionary to be filled with concept information
        source: An optional source identifier, used as a filter to extract only a
                specific source from UMLS.
        lang: An optional language identifier, used to filter terms by language
        non_suppressed: flag to indicate whether only non-suppressed concepts should be kept
    """
    concepts_filename = "MRCONSO.RRF"
    headers = read_umls_file_headers(meta_path, concepts_filename)
    with open(f"{meta_path}/{concepts_filename}", encoding="utf-8") as fin:
        for line in fin:
            splits = line.strip().split("|")
            assert len(headers) == len(splits), (headers, splits)
            concept = dict(zip(headers, splits))
            if (lang is not None and concept["LAT"] != lang) or (
                non_suppressed and concept["SUPPRESS"] != "N"
            ):
                continue  # Keep non-suppressed concepts in target language only

            if source is not None:
                if concept["SAB"] != source:
                    continue

            concept_id = concept["CUI"]
            if concept_id not in concept_details:  # a new concept
                # add it to the dictionary with an empty list of aliases and types
                concept_details[concept_id] = {
                    "concept_id": concept_id,
                    "aliases": [],
                    "types": [],
                }

            concept_name = concept["STR"]
            # this condition is copied from S2. It checks if the concept name is canonical or not
            is_canonical = (
                concept["ISPREF"] == "Y"
                and concept["TS"] == "P"
                and concept["STT"] == "PF"
            )

            if not is_canonical or "canonical_name" in concept_details[concept_id]:
                # not a canonical name or a canonical name already found
                concept_details[concept_id]["aliases"].append(
                    concept_name
                )  # add it as an alias
            else:
                concept_details[concept_id][
                    "canonical_name"
                ] = concept_name  # set as canonical name


def read_umls_types(meta_path: str, concept_details: Dict):
    """
    Read the types file MRSTY.RRF from a UMLS release and store it in
    the `concept_details` dictionary. This function adds the `types` field
    to the information of each concept.

    MRSTY.RRF file format: pipe-separated values.
    Useful columns: CUI, TUI

    Args:
        meta_path: path to the META directory of a UMLS release
        concept_details: a dictionary to be filled with concept information
    """
    types_filename = "MRSTY.RRF"
    headers = read_umls_file_headers(meta_path, types_filename)
    with open(f"{meta_path}/{types_filename}", encoding="utf-8") as fin:
        for line in fin:
            splits = line.strip().split("|")
            assert len(headers) == len(splits)
            concept_type = dict(zip(headers, splits))

            concept = concept_details.get(concept_type["CUI"])
            if (
                concept is not None
            ):  # a small number of types are for concepts that don't exist
                concept["types"].append(concept_type["TUI"])


def read_umls_definitions(meta_path: str, concept_details: Dict):
    """
    Read the definitions file MRDEF.RRF from a UMLS release and store it in
    the `concept_details` dictionary. This function adds the `definition` field
    to the information of each concept, preferring definitions that come from
    one of the sources in `DEF_SOURCES_PREFERRED`.

    MRDEF.RRF file format: pipe-separated values.
    Useful columns: CUI, SAB, SUPPRESS, DEF

    Args:
        meta_path: path to the META directory of a UMLS release
        concept_details: a dictionary to be filled with concept information
    """
    definitions_filename = "MRDEF.RRF"
    # BUG FIX: the headers were previously read twice (a second, redundant call
    # re-parsed MRFILES.RRF after opening MRDEF.RRF); read them once.
    headers = read_umls_file_headers(meta_path, definitions_filename)
    with open(f"{meta_path}/{definitions_filename}", encoding="utf-8") as fin:
        for line in fin:
            splits = line.strip().split("|")
            assert len(headers) == len(splits)
            definition = dict(zip(headers, splits))

            if definition["SUPPRESS"] != "N":
                continue
            is_from_preferred_source = definition["SAB"] in DEF_SOURCES_PREFERRED
            concept = concept_details.get(definition["CUI"])
            if (
                concept is None
            ):  # a small number of definitions are for concepts that don't exist
                continue

            # Overwrite an existing definition only when the new one comes from a
            # preferred source and the stored one does not.
            if (
                "definition" not in concept
                or is_from_preferred_source
                and concept["is_from_preferred_source"] == "N"
            ):
                concept["definition"] = definition["DEF"]
                concept["is_from_preferred_source"] = (
                    "Y" if is_from_preferred_source else "N"
                )
langcodes import Language @@ -25,7 +25,7 @@ def read_umls_sabs(meta_path): """ res = [] sab_filename = "MRSAB.RRF" - headers = umls_utils.read_umls_file_headers(meta_path, sab_filename) + headers = read_umls_file_headers(meta_path, sab_filename) with open(f"{meta_path}/{sab_filename}") as fin: for line in fin: splits = line.strip().split("|") @@ -119,7 +119,7 @@ def get_alias_count(concept_details): return sum([len(c["aliases"]) + 1 for c in concept_details.values()]) -def _expand_tuis(tuis, sem_type_tree): +def expand_tuis(tuis, sem_type_tree): """ Recursively expands a list of UMLS semantic type abbreviations to include their child semantic types, using the specified semantic type tree. @@ -218,12 +218,12 @@ def get_umls_concepts( f'>> Reading concepts from {"all sources" if not source else source} and {"all languages" if not langs else f"languages: {langs}"}' ) for lang in langs if langs else [None]: - umls_utils.read_umls_concepts( + read_umls_concepts( meta_path, concept_details, source=source, lang=lang, non_suppressed=non_suppressed_only ) logger.info(">> Reading types ... ") - umls_utils.read_umls_types(meta_path, concept_details) + read_umls_types(meta_path, concept_details) if semantic_groups: logger.info(f"> Number of concepts before semantic group filtering: {len(concept_details)}") @@ -234,7 +234,7 @@ def get_umls_concepts( concept_details = filter_semantic_types(semantic_types, expand_semantic_types, concept_details) logger.info(">> Reading definitions ... ") - umls_utils.read_umls_definitions(meta_path, concept_details) + read_umls_definitions(meta_path, concept_details) logger.info(f"> Number of concepts before de-duplication: {len(concept_details)}") logger.info(f"> Number of aliases before de-duplication: {get_alias_count(concept_details)}")