-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Example for linking against ATC without mention context
- Loading branch information
Showing
6 changed files
with
345 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,263 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "62ff831a-ce8b-4fc0-89e6-d910b31c5f85", | ||
"metadata": {}, | ||
"source": [ | ||
"# Linking Drug Mentions in German (without Context) to ATC\n", | ||
"\n", | ||
"## Preparation\n", | ||
"\n", | ||
"- Get German ATC 2023 version from: https://www.wido.de/publikationen-produkte/arzneimittel-klassifikation/ \n", | ||
"- Optional: get access to DrugBank (https://go.drugbank.com/releases/latest) for much more aliases (e.g., trade names)\n", | ||
"- `pip install openpyxl`\n", | ||
"- Prepare xMEN KB and indices:\n", | ||
" - `xmen dict examples/conf/atc.yaml --code examples/dicts/atc2023_de.py`\n", | ||
" - `xmen index examples/conf/atc.yaml --all`" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "2cbdaaa9-1b2d-4f6b-a193-554da8226217", | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"from xmen import load_kb\n", | ||
"from xmen.linkers import default_ensemble\n", | ||
"import os\n", | ||
"from pathlib import Path" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "0756e7cf-9478-4bb2-bdf0-2cc147c44e55", | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"base_path = Path(os.path.expanduser('~/.cache/xmen/atc/'))\n", | ||
"kb = load_kb(base_path / 'atc.jsonl')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "449f6fe0-bcbd-4572-b48a-8f47a98c52f2", | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[03/07/24 18:18:49] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loading hierarchical faiss index <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">sap_bert_linker.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">153</span></a>\n", | ||
"</pre>\n" | ||
], | ||
"text/plain": [ | ||
"\u001b[2;36m[03/07/24 18:18:49]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading hierarchical faiss index \u001b]8;id=789095;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=471624;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loading index from <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">faiss_indexer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">64</span></a>\n", | ||
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #800080; text-decoration-color: #800080\">/home/Florian.Borchert/.cache/xmen/atc/index/sapbert/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">embed_faiss_h</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span>\n", | ||
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> <span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">ier.pickle</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span>\n", | ||
"</pre>\n" | ||
], | ||
"text/plain": [ | ||
"\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading index from \u001b]8;id=852479;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=39591;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n", | ||
"\u001b[2;36m \u001b[0m \u001b[35m/home/Florian.Borchert/.cache/xmen/atc/index/sapbert/\u001b[0m\u001b[95membed_faiss_h\u001b[0m \u001b[2m \u001b[0m\n", | ||
"\u001b[2;36m \u001b[0m \u001b[95mier.pickle\u001b[0m \u001b[2m \u001b[0m\n" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[03/07/24 18:18:50] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO </span> Loaded index of type <span style=\"font-weight: bold\"><</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">class</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'faiss.swigfaiss.IndexHNSWFlat'</span><span style=\"font-weight: bold\">></span> and <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">faiss_indexer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">66</span></a>\n", | ||
"<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\"> </span> size <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">470941</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> </span>\n", | ||
"</pre>\n" | ||
], | ||
"text/plain": [ | ||
"\u001b[2;36m[03/07/24 18:18:50]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and \u001b]8;id=788991;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=743737;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n", | ||
"\u001b[2;36m \u001b[0m size \u001b[1;36m470941\u001b[0m \u001b[2m \u001b[0m\n" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
} | ||
], | ||
"source": [ | ||
"linker = default_ensemble(base_path / 'index')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "89561707-0f5e-480e-b96c-c29229e1fd70", | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"drug_mentions = [\n", | ||
" 'Ursodeoxycholsäure, 250 mg - Kapsel',\n", | ||
" 'Propofol 2%, 20 mg/ml 1000 mg/50 ml Injektionslösung',\n", | ||
" 'Norepinephrin 20 µg/ml',\n", | ||
" 'Amphotericin B, 10 mg - Lutschtablette',\n", | ||
" 'Fentanyl (50 µg/ml) i.v.',\n", | ||
" 'Vollelektrolyt-Lösung',\n", | ||
" 'Sufentanil 5µg/ml 250 µg/50 ml Injektionslösung'\n", | ||
"]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "2c09eda3-462c-4f6b-b055-52ff77a6ba65", | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"application/vnd.jupyter.widget-view+json": { | ||
"model_id": "5a4c8dd8146148aaa5694a517d6aa965", | ||
"version_major": 2, | ||
"version_minor": 0 | ||
}, | ||
"text/plain": [ | ||
"Map: 0%| | 0/7 [00:00<?, ? examples/s]" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
} | ||
], | ||
"source": [ | ||
"predictions = linker.predict_no_context(drug_mentions)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"id": "8158be67-5c2e-4358-b986-c72b9a74e43c", | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Input: Ursodeoxycholsäure, 250 mg - Kapsel\n", | ||
"Confidence: 0.7435341477394104\n", | ||
"CUI: A05AA02, Name: Ursodeoxycholsäure\n", | ||
"Definition: None\n", | ||
"TUI(s): \n", | ||
"Aliases (abbreviated, total: 47): \n", | ||
"\t Litursol, Solutrat, Ursochol, Ag-ursodiol, 3alpha,7beta-Dihydroxy-5beta-cholan-24-oic acid, Urso DS, PMS-ursodiol, Ursodeoxycholic acid, (3alpha,5beta,7beta)-3,7-dihydroxycholan-24-oic acid, Urusa\n", | ||
"------\n", | ||
"Input: Propofol 2%, 20 mg/ml 1000 mg/50 ml Injektionslösung\n", | ||
"Confidence: 0.6950462460517883\n", | ||
"CUI: N01AX10, Name: Propofol\n", | ||
"Definition: None\n", | ||
"TUI(s): \n", | ||
"Aliases (abbreviated, total: 45): \n", | ||
"\t Diprivan, Gobbifol, Hypro, Disoprivan, Propofil, Anesthesia S/I-50, Anesthesia S/I-60, Propofol-II Injection, Propoven, Anepol\n", | ||
"------\n", | ||
"Input: Norepinephrin 20 µg/ml\n", | ||
"Confidence: 0.848741888999939\n", | ||
"CUI: C01CA03, Name: Norepinephrin\n", | ||
"Definition: None\n", | ||
"TUI(s): \n", | ||
"Aliases (abbreviated, total: 27): \n", | ||
"\t (R)-norepinephrine, Norepinephrine Bitartrate In 5% Dextrose Injection, (R)-4-(2-amino-1-hydroxyethyl)-1,2-benzenediol, Norepinephrine Bitartrate Injection USP, Levophed(r) Norepinephrine Bitartrate, Norépinéphrine, Norepinephrine, Levophed, (R)-(−)-norepinephrine, L-noradrenaline\n", | ||
"------\n", | ||
"Input: Amphotericin B, 10 mg - Lutschtablette\n", | ||
"Confidence: 0.6953610181808472\n", | ||
"CUI: J02AA01, Name: Amphotericin B\n", | ||
"Definition: None\n", | ||
"TUI(s): \n", | ||
"Aliases (abbreviated, total: 22): \n", | ||
"\t Amphocin, Amphotericinum B, Liposomal amphotericin B, Amphotericin, Fungizone, Amphotec 50 mg, Amphotec 100 mg, Abelect, Amphotericin B, Amphocil\n", | ||
"------\n", | ||
"Input: Fentanyl (50 µg/ml) i.v.\n", | ||
"Confidence: 0.8534790277481079\n", | ||
"CUI: N02AB03, Name: Fentanyl\n", | ||
"Definition: None\n", | ||
"TUI(s): \n", | ||
"Aliases (abbreviated, total: 70): \n", | ||
"\t Instanyl, Duragesic 12, Fentanyl Buccal, Fentanyl Transdermal, Pecfent, N-(1-phenethylpiperidin-4-yl)-N-phenylpropionamide, Fentora, Lazanda, Mylan-fentanyl Matrix Patch, Abstral\n", | ||
"------\n", | ||
"Input: Vollelektrolyt-Lösung\n", | ||
"Confidence: 0.8366699814796448\n", | ||
"CUI: B05XA, Name: Elektrolytlösungen\n", | ||
"Definition: None\n", | ||
"TUI(s): \n", | ||
"Aliases: (total: 0): \n", | ||
"\t \n", | ||
"------\n", | ||
"Input: Sufentanil 5µg/ml 250 µg/50 ml Injektionslösung\n", | ||
"Confidence: 0.7572200894355774\n", | ||
"CUI: N01AH03, Name: Sufentanil\n", | ||
"Definition: None\n", | ||
"TUI(s): \n", | ||
"Aliases (abbreviated, total: 21): \n", | ||
"\t Sufentanyl, N-(4-(Methoxymethyl)-1-(2-(2-thienyl)ethyl)-4-piperidyl)propionanilide, Sufentanilum, Sufentil, Dsuvia, N-(4-(Methoxymethyl)-1-(2-(2-thienyl)ethyl)-4-piperidinyl)-N-phenylpropanamide, Zalviso, Sufentanil Citrate, Sufenta, Chronogesic\n", | ||
"------\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"for d, p in zip(drug_mentions, predictions):\n", | ||
" print('Input:', d)\n", | ||
" top_candidate = p['normalized'][0]\n", | ||
" print('Confidence:', top_candidate['score'])\n", | ||
" print(kb.cui_to_entity[top_candidate['db_id']])\n", | ||
" print('------')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "eb7e79b7-06b9-4fed-a603-8a63cb77b97d", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python [conda env:xmen_notebooks]", | ||
"language": "python", | ||
"name": "conda-env-xmen_notebooks-py" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.11" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Config for German ATC 2023 version | ||
name : atc | ||
|
||
dict: | ||
custom: | ||
# Get 2023 version from: https://www.wido.de/publikationen-produkte/arzneimittel-klassifikation/ | ||
atc_path: local_files/atc2023 | ||
drug_bank_xml: local_files/drugbank/5.1.8/full-database.xml | ||
lang: | ||
- de |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
import pandas as pd | ||
from pathlib import Path | ||
from collections import defaultdict | ||
import openpyxl | ||
import xml.etree.ElementTree as ET | ||
from xmen.log import logger | ||
|
||
|
||
def get_concept_details(cfg) -> dict: | ||
path = cfg.dict.custom.atc_path | ||
wb = openpyxl.load_workbook(f"{path}/ATC GKV-AI_2023.xlsm") | ||
sheet = wb["WIdO-Index 2023 ATC-sortiert"] | ||
sort = pd.DataFrame(sheet.values) | ||
sort = sort.rename(columns={0: "code", 2: "text", 4: "DDD_Info"}) | ||
sort.drop(0, axis=0, inplace=True) | ||
sort.dropna(axis=0, how="all", inplace=True) | ||
sort.drop(sort[sort.text.isnull()].index, inplace=True) | ||
sort.drop(sort[sort.code.isnull()].index, inplace=True) | ||
|
||
sort["code"] = sort["code"].str.rstrip() | ||
sort = sort[sort["code"].str.len() > 3] | ||
|
||
if drug_bank_xml := cfg.dict.custom.get("drug_bank_xml", None): | ||
logger.info("Extending ATC by DrugBank synonyms") | ||
|
||
ns = {"": "http://www.drugbank.ca"} | ||
tree = ET.parse(drug_bank_xml) | ||
|
||
atc2name = defaultdict(set) | ||
|
||
for drug in tree.getroot(): | ||
aliases = set() | ||
|
||
syns = [s.text for s in drug.findall("synonyms/synonym", ns)] | ||
aliases.update(syns) | ||
|
||
products = [p.text for p in drug.findall("products/product/name", ns)] | ||
aliases.update(products) | ||
|
||
mixtures = [p.text for p in drug.findall("mixtures/mixture/name", ns)] | ||
aliases.update(mixtures) | ||
|
||
international = [i.text for i in drug.findall("international-brands/international-brand/name", ns)] | ||
aliases.update(international) | ||
|
||
atc_codes = drug.find("atc-codes", ns) | ||
for atc_code in atc_codes: | ||
atc2name[atc_code.get("code")].update(aliases) | ||
|
||
logger.info("Building concept dictionary") | ||
concept_details = {} | ||
for _, entry in sort.iterrows(): | ||
sid = entry.code | ||
if not sid in concept_details: | ||
concept_details[sid] = {"concept_id": sid, "canonical_name": entry.text, "types": [], "aliases": []} | ||
elif sid in concept_details: | ||
concept_details[sid]["aliases"].append(entry.text) | ||
if drug_bank_xml: | ||
for alias in atc2name[sid]: | ||
if not alias in concept_details[sid]["aliases"]: | ||
concept_details[sid]["aliases"].append(alias) | ||
|
||
return concept_details |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[tool.poetry] | ||
name = "xmen" | ||
version = "1.0.5" | ||
version = "1.0.6" | ||
description = "An extensible toolkit for Cross-lingual (x) Medical Entity Normalization." | ||
license = "Apache-2.0" | ||
authors = ["Florian Borchert <[email protected]>"] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters