diff --git a/WeakSupervision.ipynb b/WeakSupervision.ipynb index aa3da4d..9efc808 100644 --- a/WeakSupervision.ipynb +++ b/WeakSupervision.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 1, "id": "bb5a326a-91b3-4057-af63-82389654e86c", "metadata": {}, "outputs": [], @@ -82,7 +82,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 3, @@ -191,6 +191,14 @@ "# Labeling Functions" ] }, + { + "cell_type": "markdown", + "id": "2b8f5257-d74b-4e85-a0c1-f15879a2bcb3", + "metadata": {}, + "source": [ + "## Gazetteer-based LFs" + ] + }, { "cell_type": "markdown", "id": "0c52fdfc", @@ -206,15 +214,15 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv('data/molecular/nightly-GeneSummaries.tsv', sep='\\t')\n", - "CIVIC_genes = df['name'].tolist()\n", + "civic_genes_df = pd.read_csv('data/molecular/nightly-GeneSummaries.tsv', sep='\\t')\n", + "CIVIC_genes = civic_genes_df['name'].tolist()\n", "CIVIC_genes_lower = [c.lower() for c in CIVIC_genes]" ] }, { "cell_type": "code", "execution_count": 10, - "id": "34c3d555", + "id": "14ff4e2e-1545-4f11-96a5-e0a481e78017", "metadata": {}, "outputs": [ { @@ -232,8 +240,8 @@ } ], "source": [ - "df = pd.read_csv('data/molecular/nightly-VariantSummaries.tsv', sep='\\t', error_bad_lines=False )\n", - "CIVIC_variants = df['variant'].tolist()\n", + "civic_variant_df = pd.read_csv('data/molecular/nightly-VariantSummaries.tsv', sep='\\t', error_bad_lines=False )\n", + "CIVIC_variants = civic_variant_df['variant'].tolist()\n", "CIVIC_variants_lower = [c.lower() for c in CIVIC_variants]" ] }, @@ -252,27 +260,107 @@ "metadata": {}, "outputs": [], "source": [ - "def civic(doc):\n", + "def civic_fn(doc):\n", " for tok in doc:\n", " for cue in CIVIC_genes:\n", " if tok.text.find(cue) == -1:\n", " continue\n", " else:\n", " yield tok.i, tok.i+1, \"Gene or Protein\"\n", - "cue_civic = heuristics.FunctionAnnotator(\"cue_civic\", civic)" + "lf_civic = 
heuristics.FunctionAnnotator(\"CIViC\", civic_fn)" ] }, { "cell_type": "markdown", - "id": "a3d4ba0a", + "id": "7d750a40-b60a-47a1-8988-69a3d2ed81d2", "metadata": {}, "source": [ - "The Online Mendelian Inheritance in Man (OMIM) database is the encyclopedic collection of the human medical branch of genetics." + "Get all synonyms in Entrez for CIViC genes, remove short ones and German stopwords" ] }, { "cell_type": "code", "execution_count": 12, + "id": "d9454936-02a2-4d19-900f-b06e0d183c6f", + "metadata": {}, + "outputs": [], + "source": [ + "entrez_df = pd.read_csv('data/Homo_sapiens.gene_info', sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "96921b47-84ab-4f09-8864-4e33059bcde0", + "metadata": {}, + "outputs": [], + "source": [ + "symbols = set()\n", + "for _, r in entrez_df.set_index('GeneID').loc[civic_variant_df.entrez_id].iterrows():\n", + " symbols.add(r.Symbol)\n", + " for s in r.Synonyms.split('|'):\n", + " if not s in ['R1', 'R2', 'eN', 'HNPCC'] and len(s) > 1 and not s.lower() in stops:\n", + " symbols.add(s.lower())" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "8cb9c469-a8b9-47b0-8f8d-ab7581a40044", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2028/2028 [00:00<00:00, 12862.40it/s]\n" + ] + } + ], + "source": [ + "from spacy.matcher import Matcher\n", + "\n", + "entrez_matcher = Matcher(nlp.vocab)\n", + "pattern = []\n", + "for s in nlp.pipe(tqdm(symbols), disable=[\"ner\", \"tok2vec\"]):\n", + " for pos in ['NOUN', 'PROPN', 'X']: # Consider only if first POS is one of these\n", + " p = [{'LOWER' : spl.text.lower() } for spl in s]\n", + " p[0]['POS'] = pos\n", + " pattern.append(p)\n", + " p2 = p + [{'LOWER' : '-'}, {'LOWER' : 'gen'}] #also consider if followed by -Gen\n", + " pattern.append(p2)\n", 
+ "entrez_matcher.add(\"entrez\", pattern)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "ce0939e0-0515-4921-af01-3ede88f7a043", + "metadata": {}, + "outputs": [], + "source": [ + "def entrez_fn(doc):\n", + " matches = entrez_matcher(doc)\n", + " if matches:\n", + " # Keep longest matches only\n", + " spans = [doc[start:end] for _, start, end in matches]\n", + " spans = spacy.util.filter_spans(spans)\n", + " for s in spans:\n", + " yield s.start, s.end, 'Gene or Protein'\n", + "lf_entrez = heuristics.FunctionAnnotator(\"Entrez\", entrez_fn) " + ] + }, + { + "cell_type": "markdown", + "id": "54562797-295f-4920-863e-8d141a49ea80", + "metadata": {}, + "source": [ + "The Online Mendelian Inheritance in Man (OMIM) database is the encyclopedic collection of the human medical branch of genetics. \"omim\" is based on the OMIM database and checks whether tokens are present in its list of 16,767 approved gene symbols in lowercase as the diversity of genes often shows in volatile capitalization. To increase precision, genes with a length shorter than three characters are matched only correctly cased." + ] + }, + { + "cell_type": "code", + "execution_count": 16, "id": "3b2354cf", "metadata": {}, "outputs": [ @@ -299,31 +387,23 @@ "print(len(omim_list))" ] }, - { - "cell_type": "markdown", - "id": "35a2989a", - "metadata": {}, - "source": [ - "\"omim\" is based on the OMIM database and checks whether tokens are present in its list of 16,767 approved gene symbols in lowercase as the diversity of genes often shows in volatile capitalization. To increase precision, genes with a length shorter than three characters are matched only correctly cased." 
- ] - }, { "cell_type": "code", - "execution_count": 13, - "id": "e6813e7a-8d36-4da2-9749-9c9b6ffc6d0f", + "execution_count": 17, + "id": "dab88a11-60d2-4075-9ae0-4f22d1e5874d", "metadata": {}, "outputs": [], "source": [ - "def omim(doc):\n", + "def omim_fn(doc):\n", " for tok in doc:\n", " if tok.text.lower() in omim_list_lower and tok.text.lower() not in stops and len(tok.text.lower())>=3:\n", " yield tok.i, tok.i+1, \"Gene or Protein\"\n", - "omim = heuristics.FunctionAnnotator(\"omim\", omim) " + "lf_omim = heuristics.FunctionAnnotator(\"OMIM\", omim_fn) " ] }, { "cell_type": "markdown", - "id": "4769cf3d", + "id": "02dd22df-8fc6-44eb-9915-4e80ee9ef7d9", "metadata": {}, "source": [ "The Catalogue of Somatic Mutations in Cancer (COSMIC) database harbors somatic cell mutations and additional information associated with cancer in humans." @@ -331,8 +411,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "faecaa97", + "execution_count": 18, + "id": "2890317a-11f0-4cd2-bc22-5f583ca95c7e", "metadata": {}, "outputs": [], "source": [ @@ -341,29 +421,70 @@ "cosmic_census_lower = [c.lower() for c in cosmic_census]" ] }, - { - "cell_type": "markdown", - "id": "4f3983b3", - "metadata": {}, - "source": [ - "\"cue_cosmic_census\" is based on the COSMIC database. If a token contains a gene symbol which is listed here, this token and its successor are annotated as a gene." 
- ] - }, { "cell_type": "code", - "execution_count": 15, - "id": "60fe7a41", + "execution_count": 19, + "id": "11655485-9d45-443a-a736-3e9301e6be21", "metadata": {}, "outputs": [], "source": [ - "def cosmic(doc):\n", + "def cosmic_fn(doc):\n", " for tok in doc:\n", " for cue in cosmic_census:\n", " if tok.text.find(cue) == -1:\n", " continue\n", " else:\n", " yield tok.i, tok.i+1, \"Gene or Protein\"\n", - "cue_cosmic_census = heuristics.FunctionAnnotator(\"cue_cosmic_census\", cosmic) " + "lf_cosmic = heuristics.FunctionAnnotator(\"COSMIC\", cosmic_fn) " + ] + }, + { + "cell_type": "markdown", + "id": "309be9e6-7453-4e00-a897-ef021f7c8bce", + "metadata": {}, + "source": [ + "Gazetteer based on common Protein names, sourced from Wikipedia and refined using the training part of the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "ee7c77c8-907d-4078-aea1-b0194e20e8cf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Proteins': [PD-L1]}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from skweak.gazetteers import Trie, GazetteerAnnotator\n", + "\n", + "terms = [t.strip() for t in open('proteins.txt', 'r').readlines()]\n", + "\n", + "trie = Trie()\n", + "for term in terms:\n", + " trie.add([t.text for t in nlp(term)])\n", + "\n", + "lf_protein_gazetteer = GazetteerAnnotator('Proteins', tries = {'Gene or Protein' : trie })\n", + "\n", + "doc = nlp(\"PD-L1\")\n", + "lf_protein_gazetteer(doc)\n", + "doc.spans" + ] + }, + { + "cell_type": "markdown", + "id": "64987ef9-e82d-43c2-86b6-2d4fef3b05f0", + "metadata": {}, + "source": [ + "## Rule-based LFs" ] }, { @@ -371,17 +492,17 @@ "id": "68e52e0a", "metadata": {}, "source": [ - "\"HGNC\" is based on the Human Genome Organization (HUGO) Gene Nomenclature Committee (HGNC) naming conventions for genes and leverages regular expressions to let the annotator abide by them. 
Those expressions comprise various combinations of letters and numbers and certain fixed terms for shorter terms to avoid underfitting. In addition, the CIViC database for variants has also been included for a better recall." + "\"hgnc\" is based on the Human Genome Organization (HUGO) Gene Nomenclature Committee (HGNC) naming conventions for genes and leverages regular expressions to let the annotator abide by them. Those expressions comprise various combinations of letters and numbers and certain fixed terms for shorter terms to avoid underfitting. In addition, the CIViC database for variants has also been included for a better recall." ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 21, "id": "653ed53d", "metadata": {}, "outputs": [], "source": [ - "def structure(doc):\n", + "def hgnc_fn(doc):\n", " for tok in doc:\n", " if re.search(r\"[a-zA-Z]{4}\\d{2}\", tok.text) or re.search(r\"[a-zA-Z]{5}\\d{1}\", tok.text)\\\n", " or re.search(r\"[a-zA-Z]{4}\\d{1}\", tok.text) or re.search(r\"[A-Z]{5}\\d{1}\", tok.text)\\\n", @@ -393,12 +514,71 @@ " or re.search(r\"[A-Z]{3}\\d{1}\", tok.text) or re.search(r\"[A-Z]{2}\\d{2}\", tok.text)\\\n", " or re.search(r\"^CK.\", tok.text) or re.search(r\"^PD-..\", tok.text) or re.search(r\"^PS[MA|A]\", tok.text) or tok.text.lower in CIVIC_variants_lower:\n", " yield tok.i, tok.i+1, \"Gene or Protein\"\n", - "construct = heuristics.FunctionAnnotator(\"construct\", structure)" + "lf_hgnc = heuristics.FunctionAnnotator(\"HGNC\", hgnc_fn)" + ] + }, + { + "cell_type": "markdown", + "id": "21803256-76bf-496e-87e5-7205333d3763", + "metadata": {}, + "source": [ + "Rule-based matcher based on protein families" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 22, + "id": "6a982767-0c6f-4ff7-839d-d52d72d1b5be", + "metadata": {}, + "outputs": [], + "source": [ + "protein_matcher = Matcher(nlp.vocab)\n", + "patterns = []\n", + "\n", + "for suffix in ['[A-Z]*[Kk]inase[n]?$', 
'[A-Z]+[rR]ezeptor(en|s)?$', '^(RAS|ras)$']:\n", + " p = [{'TEXT' : { 'REGEX' : suffix}}]\n", + " patterns.append(p)\n", + " for _ in range(0, 3): # Consider also combinations like Rezepter-Tyrosinkinasen\n", + " p = [{'IS_ALPHA' : True}, {'lower' : '-'}] + p\n", + " patterns.append(p)\n", + "protein_matcher.add('protein', patterns[-1::-1])\n", + "\n", + "def protein_families_fn(doc):\n", + " matches = protein_matcher(doc)\n", + " if matches:\n", + " # Keep longest matches only\n", + " spans = [doc[start:end] for _, start, end in matches]\n", + " spans = spacy.util.filter_spans(spans)\n", + " for s in spans:\n", + " yield s.start, s.end, 'Gene or Protein'\n", + "\n", + "lf_protein_families = heuristics.FunctionAnnotator(\"Protein Families\", protein_families_fn) " + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "59a961e6-ea23-416e-9549-c444a59a438e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(0, 1, 'Gene or Protein'), (1, 4, 'Gene or Protein')]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(protein_families_fn(nlp(\"RAS k-RAS krass\")))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, "id": "c4aed24b-34e9-48b9-b4fa-927ceb9172a8", "metadata": {}, "outputs": [ @@ -430,7 +610,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 25, "id": "d9bc3aae-f3c3-4d76-93f1-ea2b44d8e9f9", "metadata": {}, "outputs": [ @@ -438,18 +618,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 85996/85996 [14:58<00:00, 95.70it/s]\n" + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 85996/85996 [20:11<00:00, 70.96it/s]\n" ] } ], "source": [ - "lfs = [construct, cue_civic, omim, cue_cosmic_census]\n", + "lfs = [lf_civic, 
lf_entrez, lf_omim, lf_cosmic, lf_protein_gazetteer, lf_hgnc, lf_protein_families]\n", "\n", "#For Quick Run with Random Sentences!\n", "#random_files = files_df.sample(n = 10000)\n", "all_docs = []\n", "\n", - "for sentence_idx, doc in zip(tqdm(list(sentence_df.reset_index().iterrows())), nlp.pipe(sentence_df.text, batch_size=32, disable=[\"ner\"])):\n", + "for sentence_idx, doc in zip(tqdm(list(sentence_df.reset_index().iterrows())), nlp.pipe(sentence_df.text, disable=[\"ner\"])):\n", " i, row = sentence_idx\n", " for lf in lfs:\n", " doc = lf(doc)\n", @@ -458,37 +638,15 @@ }, { "cell_type": "markdown", - "id": "6ef09bac-468f-4b72-bad7-13c8878c55ed", + "id": "f6954ced-b4be-49cb-8416-bf95ce225033", "metadata": {}, "source": [ "Remove files which have been manually annotated from the training dataset" ] }, - { - "cell_type": "markdown", - "id": "447db4d7-1452-41ed-b0e8-802d8a7d8364", - "metadata": { - "tags": [] - }, - "source": [ - "## Training Set Evaluation" - ] - }, { "cell_type": "code", - "execution_count": 39, - "id": "628aeab7-901d-42a8-ade1-80e2d8bb5df5", - "metadata": {}, - "outputs": [], - "source": [ - "# TODO: Turn into new LF\n", - "#for t in nlp(\"Wir behandeln die Mutation des mit-Gens mit Chemotherapie.\"):\n", - "# print(t, t.pos_)" - ] - }, - { - "cell_type": "code", - "execution_count": 40, + "execution_count": 26, "id": "34c1e291-67ce-449e-a2dc-4aaa37b16d24", "metadata": {}, "outputs": [ @@ -498,7 +656,7 @@ "2000" ] }, - "execution_count": 40, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -511,7 +669,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 27, "id": "9022b1d4-4b52-4a80-8c26-37a9fc6af7cb", "metadata": {}, "outputs": [ @@ -521,7 +679,7 @@ "(83624, 83624)" ] }, - "execution_count": 41, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -533,9 +691,17 @@ "len(docs), len(filtered_sentence_df)" ] }, + { + "cell_type": "markdown", + "id": 
"c9567f81-8883-404e-8156-63f625b99612", + "metadata": {}, + "source": [ + "## Training Set Evaluation" + ] + }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 30, "id": "6dbc1515-9a22-4c9a-8437-7c85bda222dd", "metadata": {}, "outputs": [ @@ -560,38 +726,51 @@ " \n", " \n", " \n", - " construct\n", - " cue_civic\n", - " cue_cosmic_census\n", - " omim\n", + " CIViC\n", + " Entrez\n", + " OMIM\n", + " COSMIC\n", + " Proteins\n", + " HGNC\n", + " Protein Families\n", " \n", " \n", " \n", " \n", " Coverage\n", - " 0.498998\n", - " 0.286573\n", - " 0.304609\n", - " 0.469739\n", + " 0.209893\n", + " 0.405695\n", + " 0.344048\n", + " 0.223103\n", + " 0.136651\n", + " 0.365478\n", + " 0.017907\n", " \n", " \n", " Overlaps\n", - " 0.236546\n", - " 0.949650\n", + " 0.980420\n", + " 0.685962\n", + " 0.499573\n", " 0.929605\n", - " 0.453925\n", + " 0.699248\n", + " 0.381526\n", + " 0.368852\n", " \n", " \n", "\n", "" ], "text/plain": [ - " construct cue_civic cue_cosmic_census omim\n", - "Coverage 0.498998 0.286573 0.304609 0.469739\n", - "Overlaps 0.236546 0.949650 0.929605 0.453925" + " CIViC Entrez OMIM COSMIC Proteins HGNC \\\n", + "Coverage 0.209893 0.405695 0.344048 0.223103 0.136651 0.365478 \n", + "Overlaps 0.980420 0.685962 0.499573 0.929605 0.699248 0.381526 \n", + "\n", + " Protein Families \n", + "Coverage 0.017907 \n", + "Overlaps 0.368852 " ] }, - "execution_count": 43, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -602,7 +781,7 @@ "lfa = LFAnalysis(docs, ['Gene or Protein'])\n", "cov = lfa.lf_coverages().rename(index={'Gene or Protein' : 'Coverage'})\n", "overlap = lfa.lf_overlaps().rename(index={'Gene or Protein' : 'Overlaps'})\n", - "pd.concat([cov, overlap])" + "pd.concat([cov, overlap])[[lf.name for lf in lfs]]" ] }, { @@ -615,7 +794,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 31, "id": "cf2dd2ea-50b3-4a17-8905-30545b57a78b", "metadata": {}, "outputs": [ @@ -627,7 +806,8 @@ 
"Number of processed documents: 1000\n", "Number of processed documents: 2000\n", "Number of processed documents: 3000\n", - "Finished E-step with 3900 documents\n", + "Number of processed documents: 4000\n", + "Finished E-step with 4624 documents\n", "Starting iteration 2\n" ] }, @@ -635,7 +815,7 @@ "name": "stderr", "output_type": "stream", "text": [ - " 1 -23893.4823 +nan\n" + " 1 -33073.4892 +nan\n" ] }, { @@ -645,7 +825,8 @@ "Number of processed documents: 1000\n", "Number of processed documents: 2000\n", "Number of processed documents: 3000\n", - "Finished E-step with 3900 documents\n", + "Number of processed documents: 4000\n", + "Finished E-step with 4624 documents\n", "Starting iteration 3\n" ] }, @@ -653,7 +834,7 @@ "name": "stderr", "output_type": "stream", "text": [ - " 2 -23327.7100 +565.7723\n" + " 2 -32400.1132 +673.3760\n" ] }, { @@ -663,7 +844,8 @@ "Number of processed documents: 1000\n", "Number of processed documents: 2000\n", "Number of processed documents: 3000\n", - "Finished E-step with 3900 documents\n", + "Number of processed documents: 4000\n", + "Finished E-step with 4624 documents\n", "Starting iteration 4\n" ] }, @@ -671,7 +853,7 @@ "name": "stderr", "output_type": "stream", "text": [ - " 3 -23319.1366 +8.5734\n" + " 3 -32386.9774 +13.1358\n" ] }, { @@ -681,14 +863,15 @@ "Number of processed documents: 1000\n", "Number of processed documents: 2000\n", "Number of processed documents: 3000\n", - "Finished E-step with 3900 documents\n" + "Number of processed documents: 4000\n", + "Finished E-step with 4624 documents\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - " 4 -23317.6761 +1.4605\n" + " 4 -32383.9291 +3.0482\n" ] } ], @@ -702,7 +885,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 32, "id": "00c04eb3-5190-4b49-80a1-457e98e1fa70", "metadata": {}, "outputs": [], @@ -717,12 +900,12 @@ "id": "f3066aa9-db13-490e-ac24-432bffdd6577", "metadata": {}, "source": [ - "Consider subset of files where 
at least one LF has matched" + "Consider subset of files where at least one LF has matched" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 33, "id": "d07ab611-b36a-4eb4-beb4-0002fda81327", "metadata": {}, "outputs": [], @@ -741,17 +924,17 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 34, "id": "19236a84-2a05-4d61-aefd-789e804bd7da", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(83624, 31299, 3900)" + "(83624, 35501, 4624)" ] }, - "execution_count": 47, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -760,26 +943,6 @@ "len(docs), len(filtered_docs), len(gene_docs)" ] }, - { - "cell_type": "code", - "execution_count": 48, - "id": "c1bd6907-4ed1-4f3e-8bfc-a032a69cfcca", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Write to output/weak_training_lg.spacy...done\n", - "Write to output/weak_training_md.spacy...done\n" - ] - } - ], - "source": [ - "utils.docbin_writer(docs, f\"output/weak_training_lg.spacy\")\n", - "utils.docbin_writer(filtered_docs, f\"output/weak_training_md.spacy\")" - ] - }, { "cell_type": "markdown", "id": "6e00ac4b-8333-44bf-9274-ab6734810bbb", @@ -792,7 +955,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 35, "id": "595183ba-63b3-4ca7-8870-2647745676d4", "metadata": {}, "outputs": [], @@ -810,7 +973,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 36, "id": "97cce16c-4d79-4dc8-83dd-c54f60795f67", "metadata": {}, "outputs": [], @@ -826,15 +989,15 @@ }, { "cell_type": "code", - "execution_count": 31, - "id": "2113b209", + "execution_count": 37, + "id": "6c5ac964-a203-4c0e-a291-9ab837c7a18f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - 
"100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:15<00:00, 64.38it/s]\n" ] } ], @@ -844,7 +1007,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 41, "id": "f79a91a8", "metadata": {}, "outputs": [ @@ -898,21 +1061,9 @@ " \n", " \n", " \n", - " Gene or Protein\n", - " 100.0 %\n", - " construct\n", - " 0.879\n", - " 0.244\n", - " 0.382\n", - " \n", - " \n", - " \n", - " 0.833\n", - " 0.305\n", - " 0.446\n", - " \n", - " \n", - " cue_civic\n", + " Gene or Protein\n", + " 100.0 %\n", + " CIViC\n", " 0.979\n", " 0.366\n", " 0.532\n", @@ -924,7 +1075,7 @@ " 0.624\n", " \n", " \n", - " cue_cosmic_census\n", + " COSMIC\n", " 0.960\n", " 0.342\n", " 0.504\n", @@ -936,19 +1087,31 @@ " 0.594\n", " \n", " \n", - " hmm\n", - " 0.916\n", - " 0.486\n", - " 0.636\n", + " Entrez\n", + " 0.937\n", + " 0.450\n", + " 0.608\n", " \n", " \n", " \n", - " 0.870\n", - " 0.608\n", - " 0.716\n", + " 0.902\n", + " 0.503\n", + " 0.646\n", " \n", " \n", - " omim\n", + " HGNC\n", + " 0.879\n", + " 0.244\n", + " 0.382\n", + " \n", + " \n", + " \n", + " 0.833\n", + " 0.305\n", + " 0.446\n", + " \n", + " \n", + " OMIM\n", " 0.955\n", " 0.411\n", " 0.574\n", @@ -960,21 +1123,45 @@ " 0.670\n", " \n", " \n", - " macro\n", - " \n", - " construct\n", - " 0.879\n", - " 0.244\n", - " 0.382\n", + " Protein Families\n", + " 1.000\n", + " 0.067\n", + " 0.126\n", " \n", " \n", " \n", - " 0.833\n", - " 0.305\n", - " 0.446\n", + " 1.000\n", + " 0.076\n", + " 0.142\n", " \n", " \n", - " cue_civic\n", + " Proteins\n", + " 1.000\n", + " 0.117\n", + " 0.210\n", + " \n", + " \n", + " \n", + " 0.934\n", + " 0.120\n", + " 0.212\n", + " \n", + " \n", + " hmm\n", + " 0.899\n", + " 0.596\n", + " 0.716\n", + " \n", + " \n", + " \n", + " 0.841\n", + " 0.680\n", + " 0.752\n", + " \n", + " \n", + " macro\n", + " \n", + " CIViC\n", " 0.979\n", " 0.366\n", " 0.532\n", @@ -986,7 +1173,7 
@@ " 0.624\n", " \n", " \n", - " cue_cosmic_census\n", + " COSMIC\n", " 0.960\n", " 0.342\n", " 0.504\n", @@ -998,19 +1185,31 @@ " 0.594\n", " \n", " \n", - " hmm\n", - " 0.916\n", - " 0.486\n", - " 0.636\n", + " Entrez\n", + " 0.937\n", + " 0.450\n", + " 0.608\n", " \n", " \n", " \n", - " 0.870\n", - " 0.608\n", - " 0.716\n", + " 0.902\n", + " 0.503\n", + " 0.646\n", " \n", " \n", - " omim\n", + " HGNC\n", + " 0.879\n", + " 0.244\n", + " 0.382\n", + " \n", + " \n", + " \n", + " 0.833\n", + " 0.305\n", + " 0.446\n", + " \n", + " \n", + " OMIM\n", " 0.955\n", " 0.411\n", " 0.574\n", @@ -1022,21 +1221,45 @@ " 0.670\n", " \n", " \n", - " micro\n", - " \n", - " construct\n", - " 0.879\n", - " 0.244\n", - " 0.382\n", - " 0.819\n", - " 0.976\n", - " 0.278\n", - " 0.833\n", - " 0.305\n", - " 0.446\n", + " Protein Families\n", + " 1.000\n", + " 0.067\n", + " 0.126\n", + " \n", + " \n", + " \n", + " 1.000\n", + " 0.076\n", + " 0.142\n", " \n", " \n", - " cue_civic\n", + " Proteins\n", + " 1.000\n", + " 0.117\n", + " 0.210\n", + " \n", + " \n", + " \n", + " 0.934\n", + " 0.120\n", + " 0.212\n", + " \n", + " \n", + " hmm\n", + " 0.899\n", + " 0.596\n", + " 0.716\n", + " \n", + " \n", + " \n", + " 0.841\n", + " 0.680\n", + " 0.752\n", + " \n", + " \n", + " micro\n", + " \n", + " CIViC\n", " 0.979\n", " 0.366\n", " 0.532\n", @@ -1048,7 +1271,7 @@ " 0.624\n", " \n", " \n", - " cue_cosmic_census\n", + " COSMIC\n", " 0.960\n", " 0.342\n", " 0.504\n", @@ -1060,19 +1283,31 @@ " 0.594\n", " \n", " \n", - " hmm\n", - " 0.916\n", - " 0.486\n", - " 0.636\n", - " 0.42\n", - " 0.987\n", - " 0.53\n", - " 0.870\n", + " Entrez\n", + " 0.937\n", + " 0.450\n", " 0.608\n", - " 0.716\n", + " 0.819\n", + " 0.976\n", + " 0.481\n", + " 0.902\n", + " 0.503\n", + " 0.646\n", " \n", " \n", - " omim\n", + " HGNC\n", + " 0.879\n", + " 0.244\n", + " 0.382\n", + " 0.819\n", + " 0.976\n", + " 0.278\n", + " 0.833\n", + " 0.305\n", + " 0.446\n", + " \n", + " \n", + " OMIM\n", " 0.955\n", " 0.411\n", " 
0.574\n", @@ -1084,21 +1319,45 @@ " 0.670\n", " \n", " \n", - " weighted\n", - " \n", - " construct\n", - " 0.879\n", - " 0.244\n", - " 0.382\n", - " \n", - " \n", - " \n", - " 0.833\n", - " 0.305\n", - " 0.446\n", + " Protein Families\n", + " 1.000\n", + " 0.067\n", + " 0.126\n", + " 0.819\n", + " 0.976\n", + " 0.067\n", + " 1.000\n", + " 0.076\n", + " 0.142\n", " \n", " \n", - " cue_civic\n", + " Proteins\n", + " 1.000\n", + " 0.117\n", + " 0.210\n", + " 0.819\n", + " 0.976\n", + " 0.117\n", + " 0.934\n", + " 0.120\n", + " 0.212\n", + " \n", + " \n", + " hmm\n", + " 0.899\n", + " 0.596\n", + " 0.716\n", + " 0.326\n", + " 0.989\n", + " 0.663\n", + " 0.841\n", + " 0.680\n", + " 0.752\n", + " \n", + " \n", + " weighted\n", + " \n", + " CIViC\n", " 0.979\n", " 0.366\n", " 0.532\n", @@ -1110,7 +1369,7 @@ " 0.624\n", " \n", " \n", - " cue_cosmic_census\n", + " COSMIC\n", " 0.960\n", " 0.342\n", " 0.504\n", @@ -1122,19 +1381,31 @@ " 0.594\n", " \n", " \n", - " hmm\n", - " 0.916\n", - " 0.486\n", - " 0.636\n", + " Entrez\n", + " 0.937\n", + " 0.450\n", + " 0.608\n", " \n", " \n", " \n", - " 0.870\n", - " 0.608\n", - " 0.716\n", + " 0.902\n", + " 0.503\n", + " 0.646\n", + " \n", + " \n", + " HGNC\n", + " 0.879\n", + " 0.244\n", + " 0.382\n", + " \n", + " \n", + " \n", + " 0.833\n", + " 0.305\n", + " 0.446\n", " \n", " \n", - " omim\n", + " OMIM\n", " 0.955\n", " 0.411\n", " 0.574\n", @@ -1145,105 +1416,154 @@ " 0.524\n", " 0.670\n", " \n", + " \n", + " Protein Families\n", + " 1.000\n", + " 0.067\n", + " 0.126\n", + " \n", + " \n", + " \n", + " 1.000\n", + " 0.076\n", + " 0.142\n", + " \n", + " \n", + " Proteins\n", + " 1.000\n", + " 0.117\n", + " 0.210\n", + " \n", + " \n", + " \n", + " 0.934\n", + " 0.120\n", + " 0.212\n", + " \n", + " \n", + " hmm\n", + " 0.899\n", + " 0.596\n", + " 0.716\n", + " \n", + " \n", + " \n", + " 0.841\n", + " 0.680\n", + " 0.752\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " tok_precision tok_recall \\\n", - "label proportion model \n", 
- "Gene or Protein 100.0 % construct 0.879 0.244 \n", - " cue_civic 0.979 0.366 \n", - " cue_cosmic_census 0.960 0.342 \n", - " hmm 0.916 0.486 \n", - " omim 0.955 0.411 \n", - "macro construct 0.879 0.244 \n", - " cue_civic 0.979 0.366 \n", - " cue_cosmic_census 0.960 0.342 \n", - " hmm 0.916 0.486 \n", - " omim 0.955 0.411 \n", - "micro construct 0.879 0.244 \n", - " cue_civic 0.979 0.366 \n", - " cue_cosmic_census 0.960 0.342 \n", - " hmm 0.916 0.486 \n", - " omim 0.955 0.411 \n", - "weighted construct 0.879 0.244 \n", - " cue_civic 0.979 0.366 \n", - " cue_cosmic_census 0.960 0.342 \n", - " hmm 0.916 0.486 \n", - " omim 0.955 0.411 \n", + " tok_precision tok_recall \\\n", + "label proportion model \n", + "Gene or Protein 100.0 % CIViC 0.979 0.366 \n", + " COSMIC 0.960 0.342 \n", + " Entrez 0.937 0.450 \n", + " HGNC 0.879 0.244 \n", + " OMIM 0.955 0.411 \n", + " Protein Families 1.000 0.067 \n", + " Proteins 1.000 0.117 \n", + " hmm 0.899 0.596 \n", + "macro CIViC 0.979 0.366 \n", + " COSMIC 0.960 0.342 \n", + " Entrez 0.937 0.450 \n", + " HGNC 0.879 0.244 \n", + " OMIM 0.955 0.411 \n", + " Protein Families 1.000 0.067 \n", + " Proteins 1.000 0.117 \n", + " hmm 0.899 0.596 \n", + "micro CIViC 0.979 0.366 \n", + " COSMIC 0.960 0.342 \n", + " Entrez 0.937 0.450 \n", + " HGNC 0.879 0.244 \n", + " OMIM 0.955 0.411 \n", + " Protein Families 1.000 0.067 \n", + " Proteins 1.000 0.117 \n", + " hmm 0.899 0.596 \n", + "weighted CIViC 0.979 0.366 \n", + " COSMIC 0.960 0.342 \n", + " Entrez 0.937 0.450 \n", + " HGNC 0.879 0.244 \n", + " OMIM 0.955 0.411 \n", + " Protein Families 1.000 0.067 \n", + " Proteins 1.000 0.117 \n", + " hmm 0.899 0.596 \n", "\n", - " tok_f1 tok_cee tok_acc coverage \\\n", - "label proportion model \n", - "Gene or Protein 100.0 % construct 0.382 \n", - " cue_civic 0.532 \n", - " cue_cosmic_census 0.504 \n", - " hmm 0.636 \n", - " omim 0.574 \n", - "macro construct 0.382 \n", - " cue_civic 0.532 \n", - " cue_cosmic_census 0.504 \n", - " hmm 0.636 
\n", - " omim 0.574 \n", - "micro construct 0.382 0.819 0.976 0.278 \n", - " cue_civic 0.532 0.819 0.976 0.374 \n", - " cue_cosmic_census 0.504 0.819 0.976 0.356 \n", - " hmm 0.636 0.42 0.987 0.53 \n", - " omim 0.574 0.819 0.976 0.43 \n", - "weighted construct 0.382 \n", - " cue_civic 0.532 \n", - " cue_cosmic_census 0.504 \n", - " hmm 0.636 \n", - " omim 0.574 \n", - "\n", - " ent_precision ent_recall \\\n", - "label proportion model \n", - "Gene or Protein 100.0 % construct 0.833 0.305 \n", - " cue_civic 0.944 0.465 \n", - " cue_cosmic_census 0.928 0.436 \n", - " hmm 0.870 0.608 \n", - " omim 0.926 0.524 \n", - "macro construct 0.833 0.305 \n", - " cue_civic 0.944 0.465 \n", - " cue_cosmic_census 0.928 0.436 \n", - " hmm 0.870 0.608 \n", - " omim 0.926 0.524 \n", - "micro construct 0.833 0.305 \n", - " cue_civic 0.944 0.465 \n", - " cue_cosmic_census 0.928 0.436 \n", - " hmm 0.870 0.608 \n", - " omim 0.926 0.524 \n", - "weighted construct 0.833 0.305 \n", - " cue_civic 0.944 0.465 \n", - " cue_cosmic_census 0.928 0.436 \n", - " hmm 0.870 0.608 \n", - " omim 0.926 0.524 \n", + " tok_f1 tok_cee tok_acc coverage \\\n", + "label proportion model \n", + "Gene or Protein 100.0 % CIViC 0.532 \n", + " COSMIC 0.504 \n", + " Entrez 0.608 \n", + " HGNC 0.382 \n", + " OMIM 0.574 \n", + " Protein Families 0.126 \n", + " Proteins 0.210 \n", + " hmm 0.716 \n", + "macro CIViC 0.532 \n", + " COSMIC 0.504 \n", + " Entrez 0.608 \n", + " HGNC 0.382 \n", + " OMIM 0.574 \n", + " Protein Families 0.126 \n", + " Proteins 0.210 \n", + " hmm 0.716 \n", + "micro CIViC 0.532 0.819 0.976 0.374 \n", + " COSMIC 0.504 0.819 0.976 0.356 \n", + " Entrez 0.608 0.819 0.976 0.481 \n", + " HGNC 0.382 0.819 0.976 0.278 \n", + " OMIM 0.574 0.819 0.976 0.43 \n", + " Protein Families 0.126 0.819 0.976 0.067 \n", + " Proteins 0.210 0.819 0.976 0.117 \n", + " hmm 0.716 0.326 0.989 0.663 \n", + "weighted CIViC 0.532 \n", + " COSMIC 0.504 \n", + " Entrez 0.608 \n", + " HGNC 0.382 \n", + " OMIM 0.574 \n", + " 
Protein Families 0.126 \n", + " Proteins 0.210 \n", + " hmm 0.716 \n", "\n", - " ent_f1 \n", - "label proportion model \n", - "Gene or Protein 100.0 % construct 0.446 \n", - " cue_civic 0.624 \n", - " cue_cosmic_census 0.594 \n", - " hmm 0.716 \n", - " omim 0.670 \n", - "macro construct 0.446 \n", - " cue_civic 0.624 \n", - " cue_cosmic_census 0.594 \n", - " hmm 0.716 \n", - " omim 0.670 \n", - "micro construct 0.446 \n", - " cue_civic 0.624 \n", - " cue_cosmic_census 0.594 \n", - " hmm 0.716 \n", - " omim 0.670 \n", - "weighted construct 0.446 \n", - " cue_civic 0.624 \n", - " cue_cosmic_census 0.594 \n", - " hmm 0.716 \n", - " omim 0.670 " + " ent_precision ent_recall ent_f1 \n", + "label proportion model \n", + "Gene or Protein 100.0 % CIViC 0.944 0.465 0.624 \n", + " COSMIC 0.928 0.436 0.594 \n", + " Entrez 0.902 0.503 0.646 \n", + " HGNC 0.833 0.305 0.446 \n", + " OMIM 0.926 0.524 0.670 \n", + " Protein Families 1.000 0.076 0.142 \n", + " Proteins 0.934 0.120 0.212 \n", + " hmm 0.841 0.680 0.752 \n", + "macro CIViC 0.944 0.465 0.624 \n", + " COSMIC 0.928 0.436 0.594 \n", + " Entrez 0.902 0.503 0.646 \n", + " HGNC 0.833 0.305 0.446 \n", + " OMIM 0.926 0.524 0.670 \n", + " Protein Families 1.000 0.076 0.142 \n", + " Proteins 0.934 0.120 0.212 \n", + " hmm 0.841 0.680 0.752 \n", + "micro CIViC 0.944 0.465 0.624 \n", + " COSMIC 0.928 0.436 0.594 \n", + " Entrez 0.902 0.503 0.646 \n", + " HGNC 0.833 0.305 0.446 \n", + " OMIM 0.926 0.524 0.670 \n", + " Protein Families 1.000 0.076 0.142 \n", + " Proteins 0.934 0.120 0.212 \n", + " hmm 0.841 0.680 0.752 \n", + "weighted CIViC 0.944 0.465 0.624 \n", + " COSMIC 0.928 0.436 0.594 \n", + " Entrez 0.902 0.503 0.646 \n", + " HGNC 0.833 0.305 0.446 \n", + " OMIM 0.926 0.524 0.670 \n", + " Protein Families 1.000 0.076 0.142 \n", + " Proteins 0.934 0.120 0.212 \n", + " hmm 0.841 0.680 0.752 " ] }, - "execution_count": 51, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -1262,6 +1582,24 @@ "# 
Training of Transformer-based NER Models" ] }, + { + "cell_type": "code", + "execution_count": 42, + "id": "60f62095-f8ed-4fd6-b7a5-672e130c5282", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Write to output/weak_training_lg.spacy...done\n" + ] + } + ], + "source": [ + "utils.docbin_writer(docs, f\"output/weak_training_lg.spacy\")" + ] + }, { "cell_type": "markdown", "id": "7fc7b292-c94a-45f9-8e06-a862a3932019", @@ -1272,7 +1610,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 43, "id": "5f713abd-6637-481b-8fea-39972eb129f4", "metadata": {}, "outputs": [ @@ -1305,17 +1643,6 @@ "!spacy train config.cfg --paths.train output/weak_training_lg.spacy --paths.dev data/molecular/gold_dev.spacy --output output/weak_ner_lg --gpu-id 0 --code training.py" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ecbde44-9646-4267-bc26-e5332d4dc202", - "metadata": {}, - "outputs": [], - "source": [ - "# Train NER model on smaller set of weak labels with spaCy\n", - "#!spacy train config.cfg --paths.train output/weak_training_md.spacy --paths.dev data/molecular/gold_dev.spacy --output output/weak_ner_md --gpu-id 0 --code training.py" - ] - }, { "cell_type": "code", "execution_count": null, @@ -1337,7 +1664,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 58, "id": "8842e777-aa91-485e-8e61-a4fc70790679", "metadata": {}, "outputs": [], @@ -1348,7 +1675,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 59, "id": "ef897e37-9bae-49ee-b8c1-267f19ceffe5", "metadata": {}, "outputs": [], @@ -1391,7 +1718,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 60, "id": "5be94760-3fa5-40e9-bb8f-d798cd452f6b", "metadata": { "tags": [] @@ -1413,7 +1740,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 
1000/1000 [00:11<00:00, 90.17it/s]\n" + "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:15<00:00, 64.79it/s]\n" ] }, { @@ -1464,20 +1791,8 @@ " \n", " \n", " \n", - " 100.0 %\n", - " construct\n", - " 0.879\n", - " 0.244\n", - " 0.382\n", - " \n", - " \n", - " \n", - " 0.833\n", - " 0.305\n", - " 0.446\n", - " \n", - " \n", - " cue_civic\n", + " 100.0 %\n", + " CIViC\n", " 0.979\n", " 0.366\n", " 0.532\n", @@ -1489,7 +1804,7 @@ " 0.624\n", " \n", " \n", - " cue_cosmic_census\n", + " COSMIC\n", " 0.960\n", " 0.342\n", " 0.504\n", @@ -1501,19 +1816,31 @@ " 0.594\n", " \n", " \n", - " hmm\n", - " 0.916\n", - " 0.486\n", - " 0.636\n", + " Entrez\n", + " 0.937\n", + " 0.450\n", + " 0.608\n", " \n", " \n", " \n", - " 0.870\n", - " 0.608\n", - " 0.716\n", + " 0.902\n", + " 0.503\n", + " 0.646\n", " \n", " \n", - " omim\n", + " HGNC\n", + " 0.879\n", + " 0.244\n", + " 0.382\n", + " \n", + " \n", + " \n", + " 0.833\n", + " 0.305\n", + " 0.446\n", + " \n", + " \n", + " OMIM\n", " 0.955\n", " 0.411\n", " 0.574\n", @@ -1524,34 +1851,79 @@ " 0.524\n", " 0.670\n", " \n", + " \n", + " Protein Families\n", + " 1.000\n", + " 0.067\n", + " 0.126\n", + " \n", + " \n", + " \n", + " 1.000\n", + " 0.076\n", + " 0.142\n", + " \n", + " \n", + " Proteins\n", + " 1.000\n", + " 0.117\n", + " 0.210\n", + " \n", + " \n", + " \n", + " 0.934\n", + " 0.120\n", + " 0.212\n", + " \n", + " \n", + " hmm\n", + " 0.899\n", + " 0.596\n", + " 0.716\n", + " \n", + " \n", + " \n", + " 0.841\n", + " 0.680\n", + " 0.752\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " tok_precision tok_recall tok_f1 tok_cee \\\n", - "proportion model \n", - "100.0 % construct 0.879 0.244 0.382 \n", - " cue_civic 0.979 0.366 0.532 \n", - " cue_cosmic_census 0.960 0.342 0.504 \n", - " hmm 0.916 0.486 0.636 \n", - " omim 0.955 0.411 0.574 \n", + " tok_precision tok_recall tok_f1 tok_cee \\\n", + "proportion 
model \n", + "100.0 % CIViC 0.979 0.366 0.532 \n", + " COSMIC 0.960 0.342 0.504 \n", + " Entrez 0.937 0.450 0.608 \n", + " HGNC 0.879 0.244 0.382 \n", + " OMIM 0.955 0.411 0.574 \n", + " Protein Families 1.000 0.067 0.126 \n", + " Proteins 1.000 0.117 0.210 \n", + " hmm 0.899 0.596 0.716 \n", "\n", - " tok_acc coverage ent_precision ent_recall \\\n", - "proportion model \n", - "100.0 % construct 0.833 0.305 \n", - " cue_civic 0.944 0.465 \n", - " cue_cosmic_census 0.928 0.436 \n", - " hmm 0.870 0.608 \n", - " omim 0.926 0.524 \n", + " tok_acc coverage ent_precision ent_recall \\\n", + "proportion model \n", + "100.0 % CIViC 0.944 0.465 \n", + " COSMIC 0.928 0.436 \n", + " Entrez 0.902 0.503 \n", + " HGNC 0.833 0.305 \n", + " OMIM 0.926 0.524 \n", + " Protein Families 1.000 0.076 \n", + " Proteins 0.934 0.120 \n", + " hmm 0.841 0.680 \n", "\n", - " ent_f1 \n", - "proportion model \n", - "100.0 % construct 0.446 \n", - " cue_civic 0.624 \n", - " cue_cosmic_census 0.594 \n", - " hmm 0.716 \n", - " omim 0.670 " + " ent_f1 \n", + "proportion model \n", + "100.0 % CIViC 0.624 \n", + " COSMIC 0.594 \n", + " Entrez 0.646 \n", + " HGNC 0.446 \n", + " OMIM 0.670 \n", + " Protein Families 0.142 \n", + " Proteins 0.212 \n", + " hmm 0.752 " ] }, "metadata": {}, @@ -1573,7 +1945,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|███████████████████████████████████████████████████████████████████████████████████████████████████▉| 999/1000 [00:39<00:00, 25.06it/s]\n" + "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 999/1000 [00:39<00:00, 24.99it/s]\n" ] }, { @@ -1626,15 +1998,15 @@ " \n", " 100.0 %\n", " ner_model\n", - " 0.916\n", - " 0.505\n", - " 0.652\n", + " 0.902\n", + " 0.617\n", + " 0.732\n", " \n", " \n", " \n", - " 0.875\n", - " 0.636\n", - " 0.736\n", + " 0.855\n", + " 0.72\n", + " 0.782\n", " \n", " \n", "\n", @@ -1643,11 +2015,11 @@ "text/plain": [ " 
tok_precision tok_recall tok_f1 tok_cee tok_acc \\\n", "proportion model \n", - "100.0 % ner_model 0.916 0.505 0.652 \n", + "100.0 % ner_model 0.902 0.617 0.732 \n", "\n", " coverage ent_precision ent_recall ent_f1 \n", "proportion model \n", - "100.0 % ner_model 0.875 0.636 0.736 " + "100.0 % ner_model 0.855 0.72 0.782 " ] }, "metadata": {}, @@ -1670,7 +2042,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 61, "id": "631cbb84-3c39-4281-9160-5029493a23a7", "metadata": {}, "outputs": [ @@ -1690,7 +2062,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:09<00:00, 101.28it/s]\n" + "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:13<00:00, 73.50it/s]\n" ] }, { @@ -1741,20 +2113,8 @@ " \n", " \n", " \n", - " 100.0 %\n", - " construct\n", - " 0.853\n", - " 0.190\n", - " 0.310\n", - " \n", - " \n", - " \n", - " 0.836\n", - " 0.280\n", - " 0.420\n", - " \n", - " \n", - " cue_civic\n", + " 100.0 %\n", + " CIViC\n", " 0.933\n", " 0.350\n", " 0.510\n", @@ -1766,7 +2126,7 @@ " 0.606\n", " \n", " \n", - " cue_cosmic_census\n", + " COSMIC\n", " 0.927\n", " 0.342\n", " 0.500\n", @@ -1778,19 +2138,31 @@ " 0.608\n", " \n", " \n", - " hmm\n", - " 0.855\n", - " 0.398\n", - " 0.544\n", + " Entrez\n", + " 0.951\n", + " 0.525\n", + " 0.676\n", " \n", " \n", " \n", - " 0.781\n", - " 0.545\n", - " 0.642\n", + " 0.890\n", + " 0.608\n", + " 0.722\n", + " \n", + " \n", + " HGNC\n", + " 0.853\n", + " 0.190\n", + " 0.310\n", + " \n", + " \n", + " \n", + " 0.836\n", + " 0.280\n", + " 0.420\n", " \n", " \n", - " omim\n", + " OMIM\n", " 0.904\n", " 0.363\n", " 0.518\n", @@ -1801,34 +2173,79 @@ " 0.493\n", " 0.616\n", " \n", + " \n", + " Protein Families\n", + " 0.538\n", + " 0.027\n", + " 0.052\n", + " \n", + " \n", + " \n", + " 
0.250\n", + " 0.012\n", + " 0.022\n", + " \n", + " \n", + " Proteins\n", + " 1.000\n", + " 0.131\n", + " 0.232\n", + " \n", + " \n", + " \n", + " 0.975\n", + " 0.112\n", + " 0.200\n", + " \n", + " \n", + " hmm\n", + " 0.864\n", + " 0.596\n", + " 0.706\n", + " \n", + " \n", + " \n", + " 0.789\n", + " 0.689\n", + " 0.736\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " tok_precision tok_recall tok_f1 tok_cee \\\n", - "proportion model \n", - "100.0 % construct 0.853 0.190 0.310 \n", - " cue_civic 0.933 0.350 0.510 \n", - " cue_cosmic_census 0.927 0.342 0.500 \n", - " hmm 0.855 0.398 0.544 \n", - " omim 0.904 0.363 0.518 \n", + " tok_precision tok_recall tok_f1 tok_cee \\\n", + "proportion model \n", + "100.0 % CIViC 0.933 0.350 0.510 \n", + " COSMIC 0.927 0.342 0.500 \n", + " Entrez 0.951 0.525 0.676 \n", + " HGNC 0.853 0.190 0.310 \n", + " OMIM 0.904 0.363 0.518 \n", + " Protein Families 0.538 0.027 0.052 \n", + " Proteins 1.000 0.131 0.232 \n", + " hmm 0.864 0.596 0.706 \n", "\n", - " tok_acc coverage ent_precision ent_recall \\\n", - "proportion model \n", - "100.0 % construct 0.836 0.280 \n", - " cue_civic 0.841 0.473 \n", - " cue_cosmic_census 0.854 0.473 \n", - " hmm 0.781 0.545 \n", - " omim 0.818 0.493 \n", + " tok_acc coverage ent_precision ent_recall \\\n", + "proportion model \n", + "100.0 % CIViC 0.841 0.473 \n", + " COSMIC 0.854 0.473 \n", + " Entrez 0.890 0.608 \n", + " HGNC 0.836 0.280 \n", + " OMIM 0.818 0.493 \n", + " Protein Families 0.250 0.012 \n", + " Proteins 0.975 0.112 \n", + " hmm 0.789 0.689 \n", "\n", - " ent_f1 \n", - "proportion model \n", - "100.0 % construct 0.420 \n", - " cue_civic 0.606 \n", - " cue_cosmic_census 0.608 \n", - " hmm 0.642 \n", - " omim 0.616 " + " ent_f1 \n", + "proportion model \n", + "100.0 % CIViC 0.606 \n", + " COSMIC 0.608 \n", + " Entrez 0.722 \n", + " HGNC 0.420 \n", + " OMIM 0.616 \n", + " Protein Families 0.022 \n", + " Proteins 0.200 \n", + " hmm 0.736 " ] }, "metadata": {}, @@ -1850,7 +2267,7 @@ "name": 
"stderr", "output_type": "stream", "text": [ - "100%|███████████████████████████████████████████████████████████████████████████████████████████████████▉| 999/1000 [00:35<00:00, 28.01it/s]\n" + "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 999/1000 [00:40<00:00, 24.53it/s]\n" ] }, { @@ -1903,15 +2320,15 @@ " \n", " 100.0 %\n", " ner_model\n", - " 0.883\n", - " 0.437\n", - " 0.584\n", + " 0.901\n", + " 0.613\n", + " 0.73\n", " \n", " \n", " \n", - " 0.794\n", - " 0.588\n", - " 0.676\n", + " 0.819\n", + " 0.718\n", + " 0.766\n", " \n", " \n", "\n", @@ -1920,11 +2337,11 @@ "text/plain": [ " tok_precision tok_recall tok_f1 tok_cee tok_acc \\\n", "proportion model \n", - "100.0 % ner_model 0.883 0.437 0.584 \n", + "100.0 % ner_model 0.901 0.613 0.73 \n", "\n", " coverage ent_precision ent_recall ent_f1 \n", "proportion model \n", - "100.0 % ner_model 0.794 0.588 0.676 " + "100.0 % ner_model 0.819 0.718 0.766 " ] }, "metadata": {}, @@ -1946,7 +2363,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|███████████████████████████████████████████████████████████████████████████████████████████████████▉| 999/1000 [00:37<00:00, 26.69it/s]\n" + "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 999/1000 [00:38<00:00, 25.86it/s]\n" ] }, { @@ -2043,7 +2460,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 62, "id": "a313589f-d854-41c8-8df6-bccc2b0bf2d4", "metadata": {}, "outputs": [], @@ -2054,17 +2471,17 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 63, "id": "a655dd6c-92c2-4582-84ea-fb8af767856b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(83624, 4717)" + "(83624, 5617)" ] }, - "execution_count": 67, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } @@ -2075,7 
+2492,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 64, "id": "1dff773c-af59-4896-87d2-0a65bff9a6cf", "metadata": {}, "outputs": [ @@ -2085,7 +2502,7 @@ "(1000, 475)" ] }, - "execution_count": 66, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } @@ -2096,7 +2513,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 65, "id": "7bad482c-d324-4052-b965-26d27b9e686e", "metadata": {}, "outputs": [ @@ -2106,7 +2523,7 @@ "(1000, 347)" ] }, - "execution_count": 64, + "execution_count": 65, "metadata": {}, "output_type": "execute_result" } @@ -2114,6 +2531,14 @@ "source": [ "len(gold_docs_test_eval), len(get_genes(gold_docs_test_eval))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63e43db1-f326-4372-91bd-f2e413e089ae", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/proteins.txt b/proteins.txt new file mode 100644 index 0000000..7f0cca4 --- /dev/null +++ b/proteins.txt @@ -0,0 +1,36 @@ +PD-L1 +PD-1 +RAS +CYP +CYP3A4 +MEK +CYP3A +Transaminase +CYP2D6 +NTRK +CYP450 +Cyclooxygenase +COX-2 +a-Reduktase +a-Fetoprotein +Phosphodiesterase +CYP1A2 +CYP2C9 +a-Glutamyltransferase +n-Dehydrogenase +Glukose-6-Phosphat-Dehydrogenase +Uridin-5’-Diphospho-Glucuronosyltransferase +Glutamat-Oxalacetat-Transaminase +F-MEK +CYP2C19 +CYP2B6 +CYP19 +m-Laktatdehydrogenase +CYP2C19A +α-Reduktase +t-Dehydrogenase +CYP17 +CYP2C8 +l-RAS +d-Dehydrogenase +Tyrosin-Kinase \ No newline at end of file