From b8469f0246d291c94c89785f67b14d06ba43fdb6 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 28 Jan 2019 04:09:01 +0100 Subject: [PATCH] Adding scripts and code for experiments. --- .../sigir19/RelevantDocsFoundAnalysis.ipynb | 421 ++---- scripts/runAllPmClassExperimentsLiterature.sh | 9 + scripts/runAllRecallExperimentsLiterature.sh | 18 +- .../runAllTermBoostExperimentsLiterature.sh | 9 + scripts/runPmClassExperimentsLiterature.sh | 6 + scripts/runRecallExperimentsLiterature.sh | 2 +- scripts/runTermBoostExperimentsLiterature.sh | 6 + .../imi/bst/trec/SigirParameters.java | 2 + ...igirPubmedExperimenterBoostOptimizer.java} | 109 +- .../trec/SigirPubmedExperimenterPmClass.java | 77 ++ ...bmedRecallExperimenterDefaultBoosting.java | 9 +- ...edRecallExperimenterOptimizedBoosting.java | 2 +- ...PubmedRecallExperimenterParameterized.java | 2 +- ...dTermBoostExperimenterDefaultBoosting.java | 56 + .../SuperSigirPubmedRecallExperimenter.java | 330 +++-- .../imi/bst/trec/search/ElasticSearch.java | 2 +- src/main/resources/log4j2.properties | 2 +- .../sigir19_experiments_biomed/mutations.json | 8 + .../sigir19_experiments_biomed/baseline.json | 3 - .../sigir19_experiments_biomed/cancer.json | 11 + ...ers_additional_signals.json => chemo.json} | 4 - .../sigir19_experiments_biomed/dgi.json | 11 + .../sigir19_experiments_biomed/dna.json | 11 + ...ine_plus_genefield.json => genefield.json} | 0 ...must.json => genefield_custompm_must.json} | 0 ...ld.json => genefield_custompm_should.json} | 0 .../sigir19_experiments_biomed/mutations.json | 11 + ...mpm_should.json => negative_boosters.json} | 2 - ...th_pos_boosters.json => pos_boosters.json} | 0 ...t.json => pos_boosters_custompm_must.json} | 0 ...json => pos_boosters_custompm_should.json} | 0 ...g_boosters.json => positive_boosters.json} | 2 - ...ters_additional_signals_custompm_must.json | 16 - ...neg_boosters_additional_signals_extra.json | 16 - ...sters_additional_signals_extra_nonmel.json | 19 - 
...al_signals_extra_nonmel_custompm_must.json | 20 - ...dditional_signals_extra_nonmel_should.json | 17 - .../with_pos_neg_boosters_gcustompm_must.json | 13 - .../all_custompm_must.json} | 12 +- .../all_custompm_should.json} | 2 + .../baseline_custompm_must.json | 9 + .../baseline_custompm_should.json | 11 + .../aeDescriptions.bin | Bin 58 -> 0 bytes .../ccDescriptions.bin | Bin 18766 -> 0 bytes .../cmDescriptions.bin | Bin 24440 -> 0 bytes .../crDescriptions.bin | Bin 37897 -> 0 bytes .../desc/JCoRe JSON Writer.xml | 2 +- .../desc/XMI Database Multiplier Reader.xml | 2 +- uima/preprocessing/aeDescriptions.json | 2 +- uima/preprocessing/ccDescriptions.json | 2 +- uima/preprocessing/cmDescriptions.json | 2 +- uima/preprocessing/crDescriptions.json | 2 +- uima/preprocessing/desc/CPE.xml | 47 +- .../JCoRe Abstract Database Multiplier.xml | 25 + .../desc/JCoRe XMI Database Writer.xml | 182 --- .../preprocessing/desc/PM Classifier 2017.xml | 1213 ----------------- .../preprocessing/desc/PM Classifier 2018.xml | 1213 ----------------- .../desc/XMI Database Multiplier Reader.xml | 196 +++ .../desc/aggregateAnalysisEngine.xml | 65 +- uima/preprocessing/desc/cpeAAE.xml | 38 +- .../desc/JCoRe ElasticSearch Consumer.xml | 2 +- .../desc/XMI Database Multiplier Reader.xml | 6 +- .../descAll/JCoRe ElasticSearch Consumer.xml | 2 +- .../XMI Database Multiplier Reader.xml | 6 +- .../pubmed/Trec2018FieldGenerator.java | 20 + 65 files changed, 999 insertions(+), 3288 deletions(-) create mode 100755 scripts/runAllPmClassExperimentsLiterature.sh create mode 100755 scripts/runAllTermBoostExperimentsLiterature.sh create mode 100755 scripts/runPmClassExperimentsLiterature.sh create mode 100755 scripts/runTermBoostExperimentsLiterature.sh rename src/main/java/at/medunigraz/imi/bst/trec/{SigirPubmedRecallExperimenterBoostOptimizer.java => SigirPubmedExperimenterBoostOptimizer.java} (60%) create mode 100644 src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedExperimenterPmClass.java create 
mode 100644 src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedTermBoostExperimenterDefaultBoosting.java create mode 100644 src/main/resources/subtemplates/sigir19_experiments_biomed/mutations.json create mode 100644 src/main/resources/templates/sigir19_experiments_biomed/cancer.json rename src/main/resources/templates/sigir19_experiments_biomed/{with_pos_neg_boosters_additional_signals.json => chemo.json} (50%) create mode 100644 src/main/resources/templates/sigir19_experiments_biomed/dgi.json create mode 100644 src/main/resources/templates/sigir19_experiments_biomed/dna.json rename src/main/resources/templates/sigir19_experiments_biomed/{baseline_plus_genefield.json => genefield.json} (100%) rename src/main/resources/templates/sigir19_experiments_biomed/{baseline_plus_genefield_custompm_must.json => genefield_custompm_must.json} (100%) rename src/main/resources/templates/sigir19_experiments_biomed/{baseline_plus_genefield_custompm_should.json => genefield_custompm_should.json} (100%) create mode 100644 src/main/resources/templates/sigir19_experiments_biomed/mutations.json rename src/main/resources/templates/sigir19_experiments_biomed/{with_pos_neg_boosters_custompm_should.json => negative_boosters.json} (68%) rename src/main/resources/templates/sigir19_experiments_biomed/{with_pos_boosters.json => pos_boosters.json} (100%) rename src/main/resources/templates/sigir19_experiments_biomed/{with_pos_boosters_custompm_must.json => pos_boosters_custompm_must.json} (100%) rename src/main/resources/templates/sigir19_experiments_biomed/{with_pos_boosters_custompm_should.json => pos_boosters_custompm_should.json} (100%) rename src/main/resources/templates/sigir19_experiments_biomed/{with_pos_neg_boosters.json => positive_boosters.json} (66%) delete mode 100644 src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_custompm_must.json delete mode 100644 
src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra.json delete mode 100644 src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra_nonmel.json delete mode 100644 src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra_nonmel_custompm_must.json delete mode 100644 src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra_nonmel_should.json delete mode 100644 src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_gcustompm_must.json rename src/main/resources/templates/{sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra_nonmel_custompm_should.json => sigir19_pmclass_biomed/all_custompm_must.json} (79%) rename src/main/resources/templates/{sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_custompm_should.json => sigir19_pmclass_biomed/all_custompm_should.json} (81%) create mode 100644 src/main/resources/templates/sigir19_pmclass_biomed/baseline_custompm_must.json create mode 100644 src/main/resources/templates/sigir19_pmclass_biomed/baseline_custompm_should.json delete mode 100644 uima/corpus-to-json-pipeline/aeDescriptions.bin delete mode 100644 uima/corpus-to-json-pipeline/ccDescriptions.bin delete mode 100644 uima/corpus-to-json-pipeline/cmDescriptions.bin delete mode 100644 uima/corpus-to-json-pipeline/crDescriptions.bin delete mode 100644 uima/preprocessing/desc/JCoRe XMI Database Writer.xml delete mode 100644 uima/preprocessing/desc/PM Classifier 2017.xml delete mode 100644 uima/preprocessing/desc/PM Classifier 2018.xml diff --git a/notebooks/sigir19/RelevantDocsFoundAnalysis.ipynb b/notebooks/sigir19/RelevantDocsFoundAnalysis.ipynb index 254341b5..8de5373e 100644 --- a/notebooks/sigir19/RelevantDocsFoundAnalysis.ipynb +++ b/notebooks/sigir19/RelevantDocsFoundAnalysis.ipynb @@ -2,7 +2,7 @@ "cells": [ { 
"cell_type": "code", - "execution_count": 207, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -13,19 +13,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "goldfile = \"../../resources/20180622processedGoldStandardTopics.tsv.gz\"\n", - "resultsdir = \"../../results/\"\n", - "statsdir = \"../../stats_pmclass/\"\n", + "resultsdir = \"../../results_disease_2017/\"\n", + "statsdir = \"../../stats_disease_2017/\"\n", "measures = [\"ndcg\",\"infNDCG\", \"P_10\"]" ] }, { "cell_type": "code", - "execution_count": 203, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 173, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -83,7 +83,7 @@ }, { "cell_type": "code", - "execution_count": 174, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -102,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 194, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -114,7 +114,7 @@ " 'all' value is not used here.\n", " Returns only those measures defined in the 'measures' list at the beginning of this cell.\n", " \"\"\"\n", - " statfiles = sorted(list(filter(lambda f: f.endswith(\".csv\") in f, os.listdir(statspath))))\n", + " statfiles = sorted(list(filter(lambda f: f.endswith(\".csv\"), os.listdir(statspath))))\n", " runstatsmap = {}\n", " for stat in statfiles:\n", " df = prepareStats(statspath+stat)\n", @@ -129,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": 195, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -147,86 +147,7 @@ }, { "cell_type": "code", - "execution_count": 243, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "df = pd.DataFrame(np.random.rand(10,5))\n", - "df.columns = [\"ndcg\", \"infNDCG\", \"Rprec\", \"P_5\", \"P_10\"]" - ] - }, - { - "cell_type": "code", - 
"execution_count": 249, - "metadata": {}, - "outputs": [], - "source": [ - "mi = pd.MultiIndex.from_product([[\"Baseline_wr\", \"Baseline\", \"Dis_wr\", \"Dis\", \"COSMIC\"],[1,2]], names=[\"run\", \"topic\"])\n", - "df = df.set_index(mi)" - ] - }, - { - "cell_type": "code", - "execution_count": 282, - "metadata": {}, - "outputs": [], - "source": [ - "files = [\"file--d:1-m:5-h:7\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 270, - "metadata": {}, - "outputs": [], - "source": [ - "from parse import *" - ] - }, - { - "cell_type": "code", - "execution_count": 349, - "metadata": {}, - "outputs": [], - "source": [ - "result = parse(\"file--d:{d}-m:{m}-h:{h}\", \"file--d:1-m:5-h:7\")" - ] - }, - { - "cell_type": "code", - "execution_count": 350, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'d': '1', 'h': '7', 'm': '5'}" - ] - }, - "execution_count": 350, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "result.named" - ] - }, - { - "cell_type": "code", - "execution_count": 289, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(np.random.rand(10,5))\n", - "df2 = pd.DataFrame(np.random.rand(10,2))" - ] - }, - { - "cell_type": "code", - "execution_count": 317, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -250,301 +171,141 @@ " \n", " \n", " \n", - " 0\n", - " 1\n", - " 2\n", - " 3\n", - " 4\n", - " 0\n", - " 1\n", + " ndcg\n", + " infNDCG\n", + " P_10\n", + " \n", + " \n", + " run\n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 0\n", - " 37.36\n", - " 0.62\n", - " 71.86\n", - " 89.33\n", - " 63.57\n", - " 29.53\n", - " 79.51\n", + " dishyper--dis1.0-hyp0.0-syn0.0\n", + " 0.504927\n", + " 0.352660\n", + " 0.453333\n", " \n", " \n", - " 1\n", - " 61.04\n", - " 99.39\n", - " 98.70\n", - " 40.73\n", - " 6.83\n", - " 59.63\n", - " 57.45\n", + " dishyper--dis1.0-hyp0.0-syn1.0\n", + " 0.504927\n", + " 0.352660\n", + " 0.453333\n", " \n", " \n", - " 2\n", - " 
35.93\n", - " 53.95\n", - " 48.33\n", - " 28.05\n", - " 98.69\n", - " 76.95\n", - " 52.50\n", + " dishyper--dis1.0-hyp1.0-syn0.0\n", + " 0.446457\n", + " 0.281613\n", + " 0.366667\n", " \n", " \n", - " 3\n", - " 15.61\n", - " 56.03\n", - " 10.63\n", - " 39.10\n", - " 37.58\n", - " 65.93\n", - " 48.67\n", + " dishyper--dis1.0-hyp1.0-syn1.0\n", + " 0.446457\n", + " 0.281613\n", + " 0.366667\n", " \n", " \n", - " 4\n", - " 54.48\n", - " 82.53\n", - " 27.30\n", - " 63.26\n", - " 86.22\n", - " 46.13\n", - " 73.93\n", + " dissyn--dis1.0-hyp0.0-syn0.0\n", + " 0.529317\n", + " 0.388357\n", + " 0.486667\n", " \n", " \n", - " 5\n", - " 4.43\n", - " 18.65\n", - " 20.25\n", - " 67.25\n", - " 78.39\n", - " 2.64\n", - " 46.55\n", + " dissyn--dis1.0-hyp0.0-syn1.0\n", + " 0.535060\n", + " 0.391587\n", + " 0.483333\n", " \n", " \n", - " 6\n", - " 46.90\n", - " 59.00\n", - " 83.09\n", - " 67.41\n", - " 29.96\n", - " 1.63\n", - " 49.01\n", + " dissyn--dis1.0-hyp1.0-syn0.0\n", + " 0.529317\n", + " 0.388357\n", + " 0.486667\n", " \n", " \n", - " 7\n", - " 24.62\n", - " 99.20\n", - " 15.51\n", - " 38.95\n", - " 1.94\n", - " 81.04\n", - " 75.18\n", + " dissyn--dis1.0-hyp1.0-syn1.0\n", + " 0.535060\n", + " 0.391587\n", + " 0.483333\n", " \n", " \n", - " 8\n", - " 70.69\n", - " 44.62\n", - " 76.58\n", - " 1.22\n", - " 3.18\n", - " 8.79\n", - " 32.47\n", + " dissynhyper--dis1.0-hyp0.0-syn0.0\n", + " 0.504927\n", + " 0.352660\n", + " 0.453333\n", " \n", " \n", - " 9\n", - " 3.85\n", - " 72.39\n", - " 27.84\n", - " 71.23\n", - " 62.34\n", - " 3.84\n", - " 69.09\n", + " dissynhyper--dis1.0-hyp0.0-syn1.0\n", + " 0.513820\n", + " 0.358230\n", + " 0.453333\n", " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " 0 1 2 3 4 0 1\n", - "0 37.36 0.62 71.86 89.33 63.57 29.53 79.51\n", - "1 61.04 99.39 98.70 40.73 6.83 59.63 57.45\n", - "2 35.93 53.95 48.33 28.05 98.69 76.95 52.50\n", - "3 15.61 56.03 10.63 39.10 37.58 65.93 48.67\n", - "4 54.48 82.53 27.30 63.26 86.22 46.13 73.93\n", - "5 4.43 18.65 
20.25 67.25 78.39 2.64 46.55\n", - "6 46.90 59.00 83.09 67.41 29.96 1.63 49.01\n", - "7 24.62 99.20 15.51 38.95 1.94 81.04 75.18\n", - "8 70.69 44.62 76.58 1.22 3.18 8.79 32.47\n", - "9 3.85 72.39 27.84 71.23 62.34 3.84 69.09" - ] - }, - "execution_count": 317, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(pd.concat([df,df2], axis=1)*100).round(2)" - ] - }, - { - "cell_type": "code", - "execution_count": 305, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'multfields': 'most_fields',\n", - " 'op': 'AND',\n", - " 'run': 'dissyn',\n", - " 'wordremoval': 'false'},\n", - " {'multfields': 'most_fields',\n", - " 'op': 'AND',\n", - " 'run': 'gendisall',\n", - " 'wordremoval': 'true'}]" - ] - }, - "execution_count": 305, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s = [\"dissyn--mmm:most_fields-op:AND-wr:false\", \"gendisall--mmm:most_fields-op:AND-wr:true\"]\n", - "dicts = []\n", - "for file in s:\n", - " dicts.append(parse(\"{run}--mmm:{multfields}-op:{op}-wr:{wordremoval}\",file).named)\n", - "dicts" - ] - }, - { - "cell_type": "code", - "execution_count": 333, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
multfieldsoprunwordremoval
0most_fieldsANDdissynfalsedissynhyper--dis1.0-hyp1.0-syn0.00.4464570.2816130.366667
1most_fieldsANDgendisalltruedissynhyper--dis1.0-hyp1.0-syn1.00.4520200.2853870.370000
\n", "
" ], "text/plain": [ - " multfields op run wordremoval\n", - "0 most_fields AND dissyn false\n", - "1 most_fields AND gendisall true" + " ndcg infNDCG P_10\n", + "run \n", + "dishyper--dis1.0-hyp0.0-syn0.0 0.504927 0.352660 0.453333\n", + "dishyper--dis1.0-hyp0.0-syn1.0 0.504927 0.352660 0.453333\n", + "dishyper--dis1.0-hyp1.0-syn0.0 0.446457 0.281613 0.366667\n", + "dishyper--dis1.0-hyp1.0-syn1.0 0.446457 0.281613 0.366667\n", + "dissyn--dis1.0-hyp0.0-syn0.0 0.529317 0.388357 0.486667\n", + "dissyn--dis1.0-hyp0.0-syn1.0 0.535060 0.391587 0.483333\n", + "dissyn--dis1.0-hyp1.0-syn0.0 0.529317 0.388357 0.486667\n", + "dissyn--dis1.0-hyp1.0-syn1.0 0.535060 0.391587 0.483333\n", + "dissynhyper--dis1.0-hyp0.0-syn0.0 0.504927 0.352660 0.453333\n", + "dissynhyper--dis1.0-hyp0.0-syn1.0 0.513820 0.358230 0.453333\n", + "dissynhyper--dis1.0-hyp1.0-syn0.0 0.446457 0.281613 0.366667\n", + "dissynhyper--dis1.0-hyp1.0-syn1.0 0.452020 0.285387 0.370000" ] }, - "execution_count": 333, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = pd.DataFrame(dicts)\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 348, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[2, 0, 1, 2]" - ] - }, - "execution_count": 348, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "[2] + list(range(len(df.columns)-1))" - ] - }, - { - "cell_type": "code", - "execution_count": 342, - "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "all inputs must be Index", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m 
\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"huhu\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/Coding/miniconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mappend\u001b[0;34m(self, other)\u001b[0m\n\u001b[1;32m 2134\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mto_concat\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2135\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mIndex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2136\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'all inputs must be Index'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2137\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2138\u001b[0m \u001b[0mnames\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mto_concat\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: all inputs must be Index" - ] - } - ], - "source": [ - "df.columns[:-1].append(\"huhu\")" + "getMeanStatsPerRun(statsdir)" ] }, { "cell_type": "code", - "execution_count": 332, + "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Index(['multfields', 'op', 'run', 'wordremoval'], dtype='object')" + "" ] }, - "execution_count": 332, + "execution_count": 29, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ - "df.columns" + "from parse import *\n", + "s = \"DSYN_DHYP_DPT_GSYN_WR--mmm:phrase-op:OR-wr:false-sl:5.trec_results.gz\"\n", + "filenameformat = \"{run}--mmm:{multfields}-op:{op}-wr:{wordremoval}-sl:{slop}\"\n", + "parse(filenameformat, s)" ] } ], diff --git a/scripts/runAllPmClassExperimentsLiterature.sh b/scripts/runAllPmClassExperimentsLiterature.sh new file mode 100755 index 00000000..fc68642c --- /dev/null +++ b/scripts/runAllPmClassExperimentsLiterature.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +mvn compile +# For the best_fields, the slop does not do anything, but a argument value is expected +sbatch --exclude=h5,h6 scripts/runPmClassExperimentsLiterature.sh best_fields OR +sbatch --exclude=h5,h6 scripts/runPmClassExperimentsLiterature.sh best_fields AND + +# Here, the boolean operator has no effect +sbatch --exclude=h5,h6 scripts/runPmClassExperimentsLiterature.sh phrase OR diff --git a/scripts/runAllRecallExperimentsLiterature.sh b/scripts/runAllRecallExperimentsLiterature.sh index 2ce07538..35483126 100755 --- a/scripts/runAllRecallExperimentsLiterature.sh +++ b/scripts/runAllRecallExperimentsLiterature.sh @@ -2,17 +2,11 @@ mvn compile # For the best_fields, the slop does not do anything, but a argument value is expected -sbatch scripts/runRecallExperimentsLiterature.sh best_fields OR false 10 -sbatch scripts/runRecallExperimentsLiterature.sh best_fields OR true 10 -sbatch scripts/runRecallExperimentsLiterature.sh best_fields AND false 10 -sbatch scripts/runRecallExperimentsLiterature.sh best_fields AND true 10 +sbatch --exclude=h5,h6 scripts/runRecallExperimentsLiterature.sh best_fields OR 10 +sbatch --exclude=h5,h6 scripts/runRecallExperimentsLiterature.sh best_fields AND 10 # Here, the boolean operator has no effect -sbatch scripts/runRecallExperimentsLiterature.sh phrase OR false 10 -sbatch scripts/runRecallExperimentsLiterature.sh phrase OR true 10 -sbatch scripts/runRecallExperimentsLiterature.sh phrase OR false 5 -sbatch 
scripts/runRecallExperimentsLiterature.sh phrase OR true 5 -sbatch scripts/runRecallExperimentsLiterature.sh phrase OR false 3 -sbatch scripts/runRecallExperimentsLiterature.sh phrase OR true 3 -sbatch scripts/runRecallExperimentsLiterature.sh phrase OR false 2 -sbatch scripts/runRecallExperimentsLiterature.sh phrase OR true 2 \ No newline at end of file +sbatch --exclude=h5,h6 scripts/runRecallExperimentsLiterature.sh phrase OR 10 +#sbatch --exclude=h5,h6 scripts/runRecallExperimentsLiterature.sh phrase OR 5 +#sbatch --exclude=h5,h6 scripts/runRecallExperimentsLiterature.sh phrase OR 3 +#sbatch --exclude=h5,h6 scripts/runRecallExperimentsLiterature.sh phrase OR 2 \ No newline at end of file diff --git a/scripts/runAllTermBoostExperimentsLiterature.sh b/scripts/runAllTermBoostExperimentsLiterature.sh new file mode 100755 index 00000000..b7f072a8 --- /dev/null +++ b/scripts/runAllTermBoostExperimentsLiterature.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +mvn compile +# For the best_fields, the slop does not do anything, but a argument value is expected +sbatch --exclude=h5,h6 scripts/runTermBoostExperimentsLiterature.sh best_fields OR +sbatch --exclude=h5,h6 scripts/runTermBoostExperimentsLiterature.sh best_fields AND + +# Here, the boolean operator has no effect +sbatch --exclude=h5,h6 scripts/runTermBoostExperimentsLiterature.sh phrase OR diff --git a/scripts/runPmClassExperimentsLiterature.sh b/scripts/runPmClassExperimentsLiterature.sh new file mode 100755 index 00000000..61eba2dd --- /dev/null +++ b/scripts/runPmClassExperimentsLiterature.sh @@ -0,0 +1,6 @@ +#!/bin/bash +#SBATCH --cpus-per-task 2 +#SBATCH --mem 10G +#SBATCH -J termboostexp + +mvn exec:java -Dexec.mainClass=at.medunigraz.imi.bst.trec.SigirPubmedExperimenterPmClass -Dexec.args="$1 $2" diff --git a/scripts/runRecallExperimentsLiterature.sh b/scripts/runRecallExperimentsLiterature.sh index e41bedd2..1a40a734 100755 --- a/scripts/runRecallExperimentsLiterature.sh +++ 
b/scripts/runRecallExperimentsLiterature.sh @@ -3,4 +3,4 @@ #SBATCH --mem 10G #SBATCH -J recallexp -mvn exec:java -Dexec.mainClass=at.medunigraz.imi.bst.trec.SigirPubmedRecallExperimenterDefaultBoosting -Dexec.args="$1 $2 $3 $4" +mvn exec:java -Dexec.mainClass=at.medunigraz.imi.bst.trec.SigirPubmedRecallExperimenterDefaultBoosting -Dexec.args="$1 $2 $3" diff --git a/scripts/runTermBoostExperimentsLiterature.sh b/scripts/runTermBoostExperimentsLiterature.sh new file mode 100755 index 00000000..91fd17a3 --- /dev/null +++ b/scripts/runTermBoostExperimentsLiterature.sh @@ -0,0 +1,6 @@ +#!/bin/bash +#SBATCH --cpus-per-task 5 +#SBATCH --mem 10G +#SBATCH -J termboostexp + +mvn exec:java -Dexec.mainClass=at.medunigraz.imi.bst.trec.SigirPubmedTermBoostExperimenterDefaultBoosting -Dexec.args="$1 $2" diff --git a/src/main/java/at/medunigraz/imi/bst/trec/SigirParameters.java b/src/main/java/at/medunigraz/imi/bst/trec/SigirParameters.java index 9f4130f3..3838af84 100644 --- a/src/main/java/at/medunigraz/imi/bst/trec/SigirParameters.java +++ b/src/main/java/at/medunigraz/imi/bst/trec/SigirParameters.java @@ -36,6 +36,7 @@ public class SigirParameters { TREC_2018_HPIPUBNONE.put("non_mel_boost", "1"); TREC_2018_HPIPUBNONE.put("pm_gs_boost", "1"); TREC_2018_HPIPUBNONE.put("dgi_boost", "0"); + TREC_2018_HPIPUBNONE.put("mut_boost", "1"); TREC_2018_HPIPUBNONE.put("dis_multi_match_type", BEST_FIELDS); TREC_2018_HPIPUBNONE.put("dis_prefterm_multi_match_type", BEST_FIELDS); @@ -92,6 +93,7 @@ public class SigirParameters { LITERATURE_ES_DEFAULTS.put("non_mel_boost", "1"); LITERATURE_ES_DEFAULTS.put("pm_gs_boost", "1"); LITERATURE_ES_DEFAULTS.put("dgi_boost", "1"); + LITERATURE_ES_DEFAULTS.put("mut_boost", "1"); LITERATURE_ES_DEFAULTS.put("dis_multi_match_type", BEST_FIELDS); LITERATURE_ES_DEFAULTS.put("dis_prefterm_multi_match_type", BEST_FIELDS); diff --git a/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedRecallExperimenterBoostOptimizer.java 
b/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedExperimenterBoostOptimizer.java similarity index 60% rename from src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedRecallExperimenterBoostOptimizer.java rename to src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedExperimenterBoostOptimizer.java index 7698c292..1cff9bb3 100644 --- a/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedRecallExperimenterBoostOptimizer.java +++ b/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedExperimenterBoostOptimizer.java @@ -6,21 +6,21 @@ import java.util.*; import java.util.stream.IntStream; -public class SigirPubmedRecallExperimenterBoostOptimizer extends SuperSigirPubmedRecallExperimenter { +public class SigirPubmedExperimenterBoostOptimizer extends SuperSigirPubmedRecallExperimenter { public static void main(String[] args) { Set validParams = new LinkedHashSet<>(); - validParams.add("disease"); - validParams.add("gene"); + validParams.add("genedis"); validParams.add("fields"); validParams.add("posneg"); validParams.add("additional"); validParams.add("extra"); - validParams.add("pmgs"); validParams.add("pmclass"); + validParams.add("mutation"); + validParams.add("drug"); if (args.length != 1 || !validParams.contains(args[0])) { - System.err.println("Usage: " + SigirPubmedRecallExperimenterBoostOptimizer.class.getSimpleName() + " "); + System.err.println("Usage: " + SigirPubmedExperimenterBoostOptimizer.class.getSimpleName() + " "); System.err.println("Where is one of " + validParams); System.exit(1); } @@ -36,36 +36,25 @@ public static void main(String[] args) { DecimalFormat df = new DecimalFormat("0.0"); - if (what.equals("disease")) { + if (what.equals("genedis")) { List> parameters = new ArrayList<>(); List suffixes = new ArrayList<>(); - for (double disb = .4; disb < 2.4; disb += .4) { - for (double ptb = .2; ptb < 1.2; ptb += .4) { - for (double synb = .2; synb < 2; synb += .4) { - Map paramcombination = new HashMap<>(templateProperties); - 
paramcombination.put("dis_boost", String.valueOf(disb)); - paramcombination.put("dis_prefterm_boost", String.valueOf(ptb)); - paramcombination.put("dis_syn_boost", String.valueOf(synb)); - String suffix = "--dis" + df.format(disb) + "-pt" + df.format(ptb) + "-syn" + df.format(synb); - parameters.add(paramcombination); - suffixes.add(suffix); - } - } - } - runExperimentsWithParameters(parameters, suffixes, year, what, goldStandard, target); - } else if (what.equals("gene")) { - List> parameters = new ArrayList<>(); - List suffixes = new ArrayList<>(); - for (double genb = .4; genb < 2.4; genb += .4) { - for (double descb = .2; descb < 1.8; descb += .4) { - for (double synb = .2; synb < 1.8; synb += .4) { - Map paramcombination = new HashMap<>(templateProperties); - paramcombination.put("gene_boost", String.valueOf(genb)); - paramcombination.put("gene_desc_boost", String.valueOf(descb)); - paramcombination.put("gene_syn_boost", String.valueOf(synb)); - String suffix = "--gen" + df.format(genb) + "-desc" + df.format(descb) + "-syn" + df.format(synb); - parameters.add(paramcombination); - suffixes.add(suffix); + for (double genb = 1; genb < 3; genb += .5) { + for (double descb = 1; descb < 3; descb += .5) { + for (double gsynb = 1; gsynb < 3; gsynb += .5) { + for (double disb = 1; disb < 3; disb += .5) { + for (double dsynb = 1; dsynb < 3; gsynb += .5) { + Map paramcombination = new HashMap<>(templateProperties); + paramcombination.put("gene_boost", String.valueOf(genb)); + paramcombination.put("gene_desc_boost", String.valueOf(descb)); + paramcombination.put("gene_syn_boost", String.valueOf(gsynb)); + paramcombination.put("dis_boost", String.valueOf(disb)); + paramcombination.put("dis_syn_boost", String.valueOf(dsynb)); + String suffix = "--gen" + df.format(genb) + "-gdes" + df.format(descb) + "-gsyn" + df.format(gsynb) + "--dis" + df.format(disb) + "-dsyn" + df.format(dsynb); + parameters.add(paramcombination); + suffixes.add(suffix); + } + } } } } @@ -73,11 +62,11 
@@ public static void main(String[] args) { } else if (what.equals("fields")) { List> parameters = new ArrayList<>(); List suffixes = new ArrayList<>(); - for (double titb = 1; titb < 1.6; titb += .2) { - for (double abstrb = 1; abstrb < 1.6; abstrb += .2) { - for (double kwb = 1; kwb < 1.6; kwb += .2) { - for (double meshb = 1; meshb < 1.6; meshb += .2) { - for (double genesb = 1; genesb < 1.1; genesb += .4) { + for (double titb = 1; titb < 3; titb += .5) { + for (double abstrb = 1; abstrb < 3; abstrb += .5) { + for (double kwb = 1; kwb < 3; kwb += .5) { + for (double meshb = 1; meshb < 3; meshb += .5) { + for (double genesb = 1; genesb < 3; genesb += .5) { Map paramcombination = new HashMap<>(templateProperties); paramcombination.put("title_boost", "^" + titb); paramcombination.put("abstract_boost", "^" + abstrb); @@ -96,8 +85,8 @@ public static void main(String[] args) { } else if (what.equals("posneg")) { List> parameters = new ArrayList<>(); List suffixes = new ArrayList<>(); - for (double posb = .7; posb < 1.2; posb += .1) { - for (double negb = -3; negb < .2; negb += .4) { + for (double posb = .5; posb < 3; posb += .5) { + for (double negb = -3; negb <= .5; negb += .5) { Map paramcombination = new HashMap<>(templateProperties); paramcombination.put("pos_words_boost", String.valueOf(posb)); paramcombination.put("neg_words_boost", String.valueOf(negb)); @@ -110,19 +99,16 @@ public static void main(String[] args) { } else if (what.equals("additional")) { List> parameters = new ArrayList<>(); List suffixes = new ArrayList<>(); - for (double cancerb = .4; cancerb < 2; cancerb += .4) { - for (double chemob = .4; chemob < 2; chemob += .4) { - for (double dnab = .4; dnab < 2; dnab += .4) { - for (double nonmelb = -1; nonmelb < .8; nonmelb += .4) { - Map paramcombination = new HashMap<>(templateProperties); - paramcombination.put("cancer_boost", String.valueOf(cancerb)); - paramcombination.put("chemo_boost", String.valueOf(chemob)); - 
paramcombination.put("dna_boost", String.valueOf(dnab)); - paramcombination.put("non_mel_boost", String.valueOf(dnab)); - String suffix = "--canc" + df.format(cancerb) + "-chem" + df.format(chemob) + "-dna" + df.format(dnab) + "-nonmel" + df.format(nonmelb); - parameters.add(paramcombination); - suffixes.add(suffix); - } + for (double cancerb = .5; cancerb < 3; cancerb += .5) { + for (double chemob = .5; chemob < 3; chemob += .5) { + for (double dnab = .5; dnab < 3; dnab += .5) { + Map paramcombination = new HashMap<>(templateProperties); + paramcombination.put("cancer_boost", String.valueOf(cancerb)); + paramcombination.put("chemo_boost", String.valueOf(chemob)); + paramcombination.put("dna_boost", String.valueOf(dnab)); + String suffix = "--canc" + df.format(cancerb) + "-chem" + df.format(chemob) + "-dna" + df.format(dnab); + parameters.add(paramcombination); + suffixes.add(suffix); } } } @@ -130,7 +116,7 @@ public static void main(String[] args) { } else if (what.equals("extra")) { List> parameters = new ArrayList<>(); List suffixes = new ArrayList<>(); - for (double extrab = .4; extrab < 2; extrab += .4) { + for (double extrab = .5; extrab <= 3; extrab += .5) { Map paramcombination = new HashMap<>(templateProperties); paramcombination.put("extra_boost", String.valueOf(extrab)); String suffix = "--extra" + df.format(extrab); @@ -138,19 +124,18 @@ public static void main(String[] args) { suffixes.add(suffix); } runExperimentsWithParameters(parameters, suffixes, year, what, goldStandard, target); - } else if (what.equals("pmgs")) { + } else if (what.equals("mutation")) { List> parameters = new ArrayList<>(); List suffixes = new ArrayList<>(); - for (double pmgsb = -1; pmgsb < .8; pmgsb += .4) { + for (double extrab = .5; extrab <= 3; extrab += .5) { Map paramcombination = new HashMap<>(templateProperties); - paramcombination.put("pm_gs_boost", String.valueOf(pmgsb)); - String suffix = "--pmgs" + df.format(pmgsb); + paramcombination.put("mut_boost", 
String.valueOf(extrab)); + String suffix = "--mut" + df.format(extrab); parameters.add(paramcombination); suffixes.add(suffix); } runExperimentsWithParameters(parameters, suffixes, year, what, goldStandard, target); - } - if (what.equals("pmclass")) { + } else if (what.equals("pmclass")) { final List pmfields = Arrays.asList("pmclass2017lstm.keyword", "pmclass2017lstmatt.keyword", "pmclass2017lstmgru.keyword", @@ -163,7 +148,7 @@ public static void main(String[] args) { pmfields.parallelStream().forEach(pmfield -> { Map parameters = new HashMap<>(templateProperties); parameters.put("pm_class_field", pmfield); - runExperiments(parameters, false, goldStandard, target, year, what, "-" + pmfield); + // runExperiments(parameters, false, goldStandard, target, year, what, "-" + pmfield); }); } else throw new IllegalStateException("Unknown mode " + what); @@ -174,7 +159,7 @@ private static void runExperimentsWithParameters(List> param IntStream.range(0, parameters.size()).parallel().forEach(i -> { Map parameterset = parameters.get(i); String suffix = suffixes.get(i); - runExperiments(parameterset, false, goldStandard, target, year, what, suffix); + // runExperiments(parameterset, false, goldStandard, target, year, what, suffix); }); } } diff --git a/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedExperimenterPmClass.java b/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedExperimenterPmClass.java new file mode 100644 index 00000000..51413141 --- /dev/null +++ b/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedExperimenterPmClass.java @@ -0,0 +1,77 @@ +package at.medunigraz.imi.bst.trec; + +import at.medunigraz.imi.bst.trec.experiment.Experiment; + +import java.text.DecimalFormat; +import java.util.*; +import java.util.stream.IntStream; + +import static at.medunigraz.imi.bst.trec.SigirParameters.BEST_FIELDS; + +public class SigirPubmedExperimenterPmClass extends SuperSigirPubmedRecallExperimenter { + public static void main(String[] args) { + + if (args.length != 
2) { + System.err.println("Parameters: "); + } + + String what = "pmclass"; + + final Experiment.GoldStandard goldStandard = Experiment.GoldStandard.OFFICIAL; + final Experiment.Task target = Experiment.Task.PUBMED; + final int year = 2017; + + + + + Map templateProperties = new HashMap<>(SigirParameters.LITERATURE_ES_DEFAULTS); + + String defaultMultiMatch = args[0]; + templateProperties.put("dis_multi_match_type", defaultMultiMatch); + templateProperties.put("dis_syn_multi_match_type", defaultMultiMatch); + templateProperties.put("dis_hyper_multi_match_type", defaultMultiMatch); + templateProperties.put("gene_multi_match_type", defaultMultiMatch.equals("phrase") ? BEST_FIELDS : defaultMultiMatch); + templateProperties.put("gene_syn_multi_match_type", defaultMultiMatch); + templateProperties.put("gene_desc_multi_match_type", defaultMultiMatch); + templateProperties.put("gene_hyper_multi_match_type", defaultMultiMatch); + templateProperties.put("cancer_multi_match_type", defaultMultiMatch); + templateProperties.put("dna_multi_match_type", defaultMultiMatch); + templateProperties.put("neg_boost_multi_match_type", defaultMultiMatch); + templateProperties.put("pos_boost_multi_match_type", defaultMultiMatch); + templateProperties.put("dis_prefterm_multi_match_type", defaultMultiMatch); + templateProperties.put("dgi_multi_match_type", defaultMultiMatch); + + String defaultOperator = args[1]; + templateProperties.put("dis_operator", defaultOperator); + templateProperties.put("dis_prefterm_operator", defaultOperator); + templateProperties.put("dis_syn_operator", defaultOperator); + templateProperties.put("dis_hyper_operator", defaultOperator); + templateProperties.put("gene_operator", "OR"); + templateProperties.put("gene_syn_operator", defaultOperator); + templateProperties.put("gene_hyper_operator", defaultOperator); + templateProperties.put("gene_desc_operator", "OR"); + templateProperties.put("cancer_operator", "OR"); + templateProperties.put("dna_operator", "OR"); + 
+ templateProperties.put("phrase_slop", "10"); + + + final List pmfields = Arrays.asList("pmclass2017lstm.keyword", + "pmclass2017lstmatt.keyword", + "pmclass2017lstmgru.keyword", + "pmclass2018lstm.keyword", + "pmclass2018lstmatt.keyword", + "pmclass2018lstmgru.keyword", + "pmclass2017.keyword", + "pmclass2018.keyword"); + pmfields.parallelStream().forEach(pmfield -> { + Map parameters = new HashMap<>(templateProperties); + parameters.put("pm_class_field", pmfield); + runPmClassifierExperiments(null, parameters, goldStandard, target, year, what, "--mmm:" + defaultMultiMatch + "-op:" + defaultOperator + "-pmf:" + pmfield); + }); + + + } + + +} diff --git a/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedRecallExperimenterDefaultBoosting.java b/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedRecallExperimenterDefaultBoosting.java index d40b05a5..bb88c9a7 100644 --- a/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedRecallExperimenterDefaultBoosting.java +++ b/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedRecallExperimenterDefaultBoosting.java @@ -10,8 +10,8 @@ public class SigirPubmedRecallExperimenterDefaultBoosting extends SuperSigirPubmedRecallExperimenter { public static void main(String[] args) { - if (args.length != 4) { - System.err.println("Parameters: "); + if (args.length != 3) { + System.err.println("Parameters: "); } final Experiment.GoldStandard goldStandard = Experiment.GoldStandard.OFFICIAL; @@ -48,11 +48,10 @@ public static void main(String[] args) { templateProperties.put("cancer_operator", "OR"); templateProperties.put("dna_operator", "OR"); - final String slop = args[3]; + final String slop = args[2]; templateProperties.put("phrase_slop", slop); - final boolean wordremoval = Boolean.parseBoolean(args[2]); - runExperiments(templateProperties, wordremoval, goldStandard, target, year, "", "--mmm:" + defaultMultiMatch + "-op:" + defaultOperator + "-wr:" + wordremoval + "-sl:" + slop); + runRecallExperiments(templateProperties, 
goldStandard, target, year, "recall", "--mmm:" + defaultMultiMatch + "-op:" + defaultOperator + "-sl:" + slop); } } diff --git a/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedRecallExperimenterOptimizedBoosting.java b/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedRecallExperimenterOptimizedBoosting.java index 6833aadd..e3eb8507 100644 --- a/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedRecallExperimenterOptimizedBoosting.java +++ b/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedRecallExperimenterOptimizedBoosting.java @@ -38,6 +38,6 @@ public static void main(String[] args) { // Doesn't seem to do anything, the whole non-melanoma query part doesn't show much influence (none?) templateProperties.put("non_mel_boost", "-10"); - runExperiments(templateProperties, false, goldStandard, target, year, "", ""); + // runExperiments(templateProperties, false, goldStandard, target, year, "", ""); } } diff --git a/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedRecallExperimenterParameterized.java b/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedRecallExperimenterParameterized.java index 421bc485..6b33c687 100644 --- a/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedRecallExperimenterParameterized.java +++ b/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedRecallExperimenterParameterized.java @@ -18,6 +18,6 @@ public static void main(String[] args) { // Change here the predefined parameter set you want to use Map templateProperties = SigirParameters.TREC_2018_HPIPUBNONE; - runExperiments(templateProperties, false, goldStandard, target, year, "", ""); + // runExperiments(templateProperties, false, goldStandard, target, year, "", ""); } } diff --git a/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedTermBoostExperimenterDefaultBoosting.java b/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedTermBoostExperimenterDefaultBoosting.java new file mode 100644 index 00000000..26cfa2f1 --- /dev/null +++ 
b/src/main/java/at/medunigraz/imi/bst/trec/SigirPubmedTermBoostExperimenterDefaultBoosting.java @@ -0,0 +1,56 @@ +package at.medunigraz.imi.bst.trec; + +import at.medunigraz.imi.bst.trec.experiment.Experiment; + +import java.util.HashMap; +import java.util.Map; + +import static at.medunigraz.imi.bst.trec.SigirParameters.BEST_FIELDS; + +public class SigirPubmedTermBoostExperimenterDefaultBoosting extends SuperSigirPubmedRecallExperimenter { + public static void main(String[] args) { + + if (args.length != 2) { + System.err.println("Parameters: "); + } + + final Experiment.GoldStandard goldStandard = Experiment.GoldStandard.OFFICIAL; + final Experiment.Task target = Experiment.Task.PUBMED; + final int year = 2018; + + + Map templateProperties = new HashMap<>(SigirParameters.LITERATURE_ES_DEFAULTS); + + String defaultMultiMatch = args[0]; + templateProperties.put("dis_multi_match_type", defaultMultiMatch); + templateProperties.put("dis_syn_multi_match_type", defaultMultiMatch); + templateProperties.put("dis_hyper_multi_match_type", defaultMultiMatch); + templateProperties.put("gene_multi_match_type", defaultMultiMatch.equals("phrase") ? 
BEST_FIELDS : defaultMultiMatch); + templateProperties.put("gene_syn_multi_match_type", defaultMultiMatch); + templateProperties.put("gene_desc_multi_match_type", defaultMultiMatch); + templateProperties.put("gene_hyper_multi_match_type", defaultMultiMatch); + templateProperties.put("cancer_multi_match_type", defaultMultiMatch); + templateProperties.put("dna_multi_match_type", defaultMultiMatch); + templateProperties.put("neg_boost_multi_match_type", defaultMultiMatch); + templateProperties.put("pos_boost_multi_match_type", defaultMultiMatch); + templateProperties.put("dis_prefterm_multi_match_type", defaultMultiMatch); + templateProperties.put("dgi_multi_match_type", defaultMultiMatch); + + String defaultOperator = args[1]; + templateProperties.put("dis_operator", defaultOperator); + templateProperties.put("dis_prefterm_operator", defaultOperator); + templateProperties.put("dis_syn_operator", defaultOperator); + templateProperties.put("dis_hyper_operator", defaultOperator); + templateProperties.put("gene_operator", "OR"); + templateProperties.put("gene_syn_operator", defaultOperator); + templateProperties.put("gene_hyper_operator", defaultOperator); + templateProperties.put("gene_desc_operator", "OR"); + templateProperties.put("cancer_operator", "OR"); + templateProperties.put("dna_operator", "OR"); + + templateProperties.put("phrase_slop", "10"); + + + runTermBoostExperiments(templateProperties, goldStandard, target, year, "termboost", "--mmm:" + defaultMultiMatch + "-op:" + defaultOperator); + } +} diff --git a/src/main/java/at/medunigraz/imi/bst/trec/SuperSigirPubmedRecallExperimenter.java b/src/main/java/at/medunigraz/imi/bst/trec/SuperSigirPubmedRecallExperimenter.java index fb0aa97f..5e8ea673 100644 --- a/src/main/java/at/medunigraz/imi/bst/trec/SuperSigirPubmedRecallExperimenter.java +++ b/src/main/java/at/medunigraz/imi/bst/trec/SuperSigirPubmedRecallExperimenter.java @@ -2,31 +2,169 @@ import at.medunigraz.imi.bst.trec.experiment.Experiment; import 
at.medunigraz.imi.bst.trec.experiment.ExperimentsBuilder; +import com.google.common.collect.Sets; import org.apache.commons.lang.StringUtils; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import java.io.File; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; -import java.util.function.Function; +import java.util.*; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.stream.Collectors; public class SuperSigirPubmedRecallExperimenter { + private static final Logger LOG = LogManager.getLogger(); + private static int numProperties = 51; + protected static void runPmClassifierExperiments(File template, Map templateProperties, Experiment.GoldStandard goldStandard, Experiment.Task target, int year, String what, String suffix) { + if (templateProperties.size() > numProperties) + throw new IllegalArgumentException("There are more key in the properties map as there are known properties: " + templateProperties.keySet()); + + ExperimentsBuilder builder = new ExperimentsBuilder(); + if (!StringUtils.isBlank(what)) { + builder.setDefaultStatsDir("stats_" + what + "_" + year); + builder.setDefaultResultsDir("results_" + what + "_" + year); + } + final Map sigirTemplates = getSigirTemplates("/templates/sigir19_pmclass_biomed"); + + final Set> expansionSets = new HashSet<>(Sets.powerSet(EnumSet.of(Expansion.DGI, Expansion.GDE, Expansion.GSY, Expansion.DSY, Expansion.WR))); + + addExperiments(template, sigirTemplates, templateProperties, expansionSets, goldStandard, target, year, builder, suffix); + + Set experiments = builder.build(); + final ExecutorService executorService = Executors.newFixedThreadPool(2); + List> futures = new ArrayList<>(); + for (Experiment exp : experiments) { + futures.add(executorService.submit(exp)); + } + + for (Future f : futures) { + try { + f.get(); + } 
catch (InterruptedException | ExecutionException e) { + throw new RuntimeException(e); + } + } + } + + /** + * For experiments that try different query boosts. + * + * @param template + * @param templateProperties + * @param goldStandard + * @param target + * @param year + * @param what + * @param suffix + */ + protected static void runBoostExperiments(File template, Map templateProperties, Experiment.GoldStandard goldStandard, Experiment.Task target, int year, String what, String suffix) { + if (templateProperties.size() > numProperties) + throw new IllegalArgumentException("There are more key in the properties map as there are known properties: " + templateProperties.keySet()); + + ExperimentsBuilder builder = new ExperimentsBuilder(); + if (!StringUtils.isBlank(what)) { + builder.setDefaultStatsDir("stats_" + what + "_" + year); + builder.setDefaultResultsDir("results_" + what + "_" + year); + } + final Map sigirTemplates = getSigirTemplates("/templates/sigir19_experiments_biomed"); + + // Switch everything on - except word removal - so that the boosters actually have a point. + final Set> expansionSets = new HashSet<>(Arrays.asList(EnumSet.complementOf(EnumSet.of(Expansion.WR)))); + + addExperiments(template, sigirTemplates, templateProperties, expansionSets, goldStandard, target, year, builder, suffix); + + Set experiments = builder.build(); + final ExecutorService executorService = Executors.newFixedThreadPool(5); + List> futures = new ArrayList<>(); + for (Experiment exp : experiments) { + futures.add(executorService.submit(exp)); + } + + for (Future f : futures) { + try { + f.get(); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException(e); + } + } + } + + /** + * For experiments with different boosting signals in the form of additional SHOULD queries. 
+ * + * @param templateProperties + * @param goldStandard + * @param target + * @param year + * @param what + * @param suffix + */ + protected static void runTermBoostExperiments(Map templateProperties, Experiment.GoldStandard goldStandard, Experiment.Task target, int year, String what, String suffix) { + if (templateProperties.size() > numProperties) + throw new IllegalArgumentException("There are more key in the properties map as there are known properties: " + templateProperties.keySet()); + + ExperimentsBuilder builder = new ExperimentsBuilder(); + if (!StringUtils.isBlank(what)) { + builder.setDefaultStatsDir("stats_" + what + "_" + year); + builder.setDefaultResultsDir("results_" + what + "_" + year); + } + final Map sigirTemplates = getSigirTemplates("/templates/sigir19_experiments_biomed"); + + final Set> expansionSets = new HashSet<>(Sets.powerSet(EnumSet.of(Expansion.DGI, Expansion.GDE, Expansion.GSY, Expansion.DSY, Expansion.WR))); + // Only make those sets that actually contain a positive boost, namely the drug interactions and the gene descriptions + for (Iterator> it = expansionSets.iterator(); it.hasNext(); ) { + final Set set = it.next(); + if (!set.contains(Expansion.DGI) && !set.contains(Expansion.GDE)) + it.remove(); + } + + addExperiments(null, sigirTemplates, templateProperties, expansionSets, goldStandard, target, year, builder, suffix); + + Set experiments = builder.build(); + final ExecutorService executorService = Executors.newFixedThreadPool(5); + List> futures = new ArrayList<>(); + for (Experiment exp : experiments) { + futures.add(executorService.submit(exp)); + } - protected static void runExperiments(Map templateProperties, boolean wordremoval, Experiment.GoldStandard goldStandard, Experiment.Task target, int year, String what, String suffix) { - if (templateProperties.size() > 50) + for (Future f : futures) { + try { + f.get(); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException(e); + } + } + } + + 
/** + * Experiments with the base template and different settings for topic expansions + * + * @param templateProperties + * @param goldStandard + * @param target + * @param year + * @param what + * @param suffix + */ + protected static void runRecallExperiments(Map templateProperties, Experiment.GoldStandard goldStandard, Experiment.Task target, int year, String what, String suffix) { + if (templateProperties.size() > numProperties) throw new IllegalArgumentException("There are more key in the properties map as there are known properties: " + templateProperties.keySet()); ExperimentsBuilder builder = new ExperimentsBuilder(); if (!StringUtils.isBlank(what)) { - builder.setDefaultStatsDir("stats_" + what+"_"+year); - builder.setDefaultResultsDir("results_" + what+"_"+year); + builder.setDefaultStatsDir("stats_" + what + "_" + year); + builder.setDefaultResultsDir("results_" + what + "_" + year); } - final Map sigirTemplates = getSigirTemplates(); + final Map sigirTemplates = getSigirTemplates("/templates/sigir19_experiments_biomed"); + + final Set> expansionSets = Sets.powerSet(EnumSet.complementOf(EnumSet.of(Expansion.DGI, Expansion.GDE))); - //addExperimentsWithoutPmClassifier(sigirTemplates, templateProperties, goldStandard, wordremoval, target, year, builder, suffix); - addExperimentsWithCustomPmClassifierShould(sigirTemplates, templateProperties, goldStandard, wordremoval, target, year, builder, suffix); - addExperimentsWithCustomPmClassifierMust(sigirTemplates, templateProperties, goldStandard, wordremoval, target, year, builder, suffix); + File baselineTemplate = new File(PubmedExperimenter.class.getResource("/templates/sigir19_experiments_biomed/baseline.json").getFile()); + addExperiments(baselineTemplate, sigirTemplates, templateProperties, expansionSets, goldStandard, target, year, builder, suffix); Set experiments = builder.build(); @@ -40,8 +178,8 @@ protected static void runExperiments(Map templateProperties, boo } } - private static Map 
getSigirTemplates() { - final File file = new File(PubmedExperimenter.class.getResource("/templates/sigir19_experiments_biomed").getFile()); + private static Map getSigirTemplates(String dir) { + final File file = new File(PubmedExperimenter.class.getResource(dir).getFile()); final File[] templateFiles = file.listFiles(f -> !f.getName().equals(".DS_Store")); Map templateMap = new HashMap<>(); for (File template : templateFiles) { @@ -63,114 +201,62 @@ else if (template.getName().contains("custompm_should")) return templateMap; } - private static void addExperimentsWithoutPmClassifier(Map templates, Map templateProperties, Experiment.GoldStandard goldStandard, boolean wordremoval, Experiment.Task target, int year, ExperimentsBuilder builder, String suffix) { - Function getTemplate = name -> templates.get(name).getBase(); - addExperiments(templateProperties, goldStandard, wordremoval, target, year, builder, getTemplate, suffix); - } - private static void addExperimentsWithCustomPmClassifierShould(Map templates, Map templateProperties, Experiment.GoldStandard goldStandard, boolean wordremoval, Experiment.Task target, int year, ExperimentsBuilder builder, String suffix) { - Function getTemplate = name -> templates.get(name).getCustompmShould(); - addExperiments(templateProperties, goldStandard, wordremoval, target, year, builder, getTemplate, "_custompm_should" + suffix); - } + private static void addExperiments(File singleTemplate, Map templates, Map templateProperties, Set> expansionSets, Experiment.GoldStandard goldStandard, Experiment.Task target, int year, ExperimentsBuilder builder, String suffix) { + Map effectiveTemplates = templates; + if (singleTemplate != null) { + effectiveTemplates = new HashMap<>(); + final TemplateSet templateSet = new TemplateSet(); + templateSet.setBase(singleTemplate); + effectiveTemplates.put(singleTemplate.getName(), templateSet); + } + for (TemplateSet templateSet : effectiveTemplates.values()) { + final File template = templateSet.getBase(); + if (template == null) { + LOG.debug("Skipping 
template set {} because it does not have the required template derivative", templateSet); + continue; + } - private static void addExperimentsWithCustomPmClassifierMust(Map templates, Map templateProperties, Experiment.GoldStandard goldStandard, boolean wordremoval, Experiment.Task target, int year, ExperimentsBuilder builder, String suffix) { - Function getTemplate = name -> templates.get(name).getCustompmMust(); - addExperiments(templateProperties, goldStandard, wordremoval, target, year, builder, getTemplate, "_custompm_must" + suffix); + LOG.debug("Creating experiments with template {}", template); + for (Set expansions : expansionSets) { + builder.newExperiment().withYear(year).withGoldStandard(goldStandard).withTarget(target) + .withSubTemplate(template, templateProperties); + // This is the default name to indicate that no expansion is enabled + builder.withName(template.getName() + "-NONE" + suffix); + + for (Expansion expansion : expansions) { + switch (expansion) { + case DSY: + builder.withDiseaseSynonym(); + break; + case DHY: + builder.withDiseaseHypernym(); + break; + case DP: + builder.withDiseasePreferredTerm(); + break; + case GSY: + builder.withGeneSynonym(); + break; + case GDE: + builder.withGeneDescription(); + break; + case WR: + builder.withWordRemoval(); + break; + case DGI: + builder.withDrugInteraction(); + break; + } + String name = expansions.stream().sorted().map(Expansion::name).collect(Collectors.joining("_")); + builder.withName(template.getName() + "-" + name + suffix); + } + } + } } - private static void addExperiments(Map templateProperties, Experiment.GoldStandard goldStandard, boolean wordremoval, Experiment.Task target, int year, ExperimentsBuilder builder, Function getTemplate, String suffix) { - File baseline = getTemplate.apply("baseline"); - File baseline_plus_genefield = getTemplate.apply("baseline_plus_genefield"); - File with_pos_boosters = getTemplate.apply("with_pos_boosters"); - File with_pos_neg_boosters = 
getTemplate.apply("with_pos_neg_boosters"); - File with_pos_neg_boosters_additional_signals = getTemplate.apply("with_pos_neg_boosters_additional_signals"); - File with_pos_neg_boosters_additional_signals_extra = getTemplate.apply("with_pos_neg_boosters_additional_signals_extra"); - File with_pos_neg_boosters_additional_signals_extra_nonmel = getTemplate.apply("with_pos_neg_boosters_additional_signals_extra_nonmel"); - - builder.newExperiment().withName("baseline" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) - .withSubTemplate(baseline, templateProperties); - if (wordremoval) builder.withWordRemoval(); - - builder.newExperiment().withName("dissyn" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) - .withSubTemplate(baseline, templateProperties).withDiseaseSynonym(); - if (wordremoval) builder.withWordRemoval(); - - builder.newExperiment().withName("dishyper" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) - .withSubTemplate(baseline, templateProperties).withDiseaseHypernym(); - if (wordremoval) builder.withWordRemoval(); - - builder.newExperiment().withName("dissynpt" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) - .withSubTemplate(baseline, templateProperties).withDiseasePreferredTerm().withDiseaseSynonym(); - if (wordremoval) builder.withWordRemoval(); - - builder.newExperiment().withName("dissynpthyper" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) - .withSubTemplate(baseline, templateProperties).withDiseasePreferredTerm().withDiseaseSynonym().withDiseaseHypernym(); - if (wordremoval) builder.withWordRemoval(); - - builder.newExperiment().withName("gensyn" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) - .withSubTemplate(baseline, templateProperties).withGeneSynonym(); - if (wordremoval) builder.withWordRemoval(); + public enum Expansion {DSY, DHY, DP, GSY, GDE, WR, DGI} - 
builder.newExperiment().withName("gendis" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) - .withSubTemplate(baseline, templateProperties).withDiseasePreferredTerm().withDiseaseSynonym().withDiseaseHypernym().withGeneSynonym().withGeneDescription(); - if (wordremoval) builder.withWordRemoval(); - - - - - - - builder.newExperiment().withName("gensyndesc" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) - .withSubTemplate(baseline, templateProperties).withGeneSynonym().withGeneDescription(); - if (wordremoval) builder.withWordRemoval(); - - builder.newExperiment().withName("gensyndescplus" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) - .withSubTemplate(baseline_plus_genefield, templateProperties).withGeneSynonym().withGeneDescription(); - if (wordremoval) builder.withWordRemoval(); - - builder.newExperiment().withName("dgint" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) - .withSubTemplate(baseline, templateProperties).withDrugInteraction(); - if (wordremoval) builder.withWordRemoval(); - - builder.newExperiment().withName("gendisdgint" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) - .withSubTemplate(baseline, templateProperties).withDiseasePreferredTerm().withDiseaseSynonym().withDiseaseHypernym().withGeneSynonym().withGeneDescription().withDrugInteraction(); - if (wordremoval) builder.withWordRemoval(); - - - - - - -// builder.newExperiment().withName("genedispb" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) -// .withSubTemplate(with_pos_boosters, templateProperties).withDiseasePreferredTerm().withDiseaseSynonym().withGeneSynonym().withGeneDescription(); -// -// builder.newExperiment().withName("genespb" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) -// .withSubTemplate(with_pos_boosters, templateProperties).withGeneSynonym().withGeneDescription(); -// -// 
builder.newExperiment().withName("genedispbnb" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) -// .withSubTemplate(with_pos_neg_boosters, templateProperties).withGeneSynonym().withGeneDescription().withDiseaseSynonym().withDiseasePreferredTerm(); -// -// builder.newExperiment().withName("posnegbstadd" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) -// .withSubTemplate(with_pos_neg_boosters_additional_signals, templateProperties).withGeneSynonym().withGeneDescription().withDiseaseSynonym().withDiseasePreferredTerm(); -// -// builder.newExperiment().withName("posnegbstaddextra" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) -// .withSubTemplate(with_pos_neg_boosters_additional_signals_extra, templateProperties).withGeneSynonym().withGeneDescription().withDiseaseSynonym().withDiseasePreferredTerm(); -// -// builder.newExperiment().withName("addextranonmel" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) -// .withSubTemplate(with_pos_neg_boosters_additional_signals_extra_nonmel, templateProperties).withGeneSynonym().withGeneDescription().withDiseaseSynonym().withDiseasePreferredTerm(); -// -// builder.newExperiment().withName("addextranonmelshould" + suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) -// .withSubTemplate(with_pos_neg_boosters_additional_signals_extra_nonmel_should, templateProperties).withGeneSynonym().withGeneDescription().withDiseaseSynonym().withDiseasePreferredTerm(); - - -// builder.newExperiment().withName("hpipubnone_replique").withYear(year).withGoldStandard(goldStandard).withTarget(target) -// .withSubTemplate(with_pos_neg_boosters_additional_signals_extra_nonmel, templateProperties).withWordRemoval().withGeneSynonym() -// .withDiseasePreferredTerm().withGeneDescription().withDiseaseSynonym(); - -// builder.newExperiment().withName("hpipubnone_replique" + 
suffix).withYear(year).withGoldStandard(goldStandard).withTarget(target) -// .withSubTemplate(with_pos_neg_boosters_additional_signals_extra_nonmel, templateProperties).withWordRemoval().withGeneSynonym() -// .withDiseasePreferredTerm().withGeneDescription().withDiseaseSynonym(); - } private static class TemplateSet { private File base; private File gspmMust; @@ -216,6 +301,17 @@ public void setCustompmShould(File custompmShould) { this.custompmShould = custompmShould; } + @Override + public String toString() { + return "TemplateSet{" + + "base=" + base + + ", gspmMust=" + gspmMust + + ", gspmShould=" + gspmShould + + ", custompmShould=" + custompmShould + + ", custompmMust=" + custompmMust + + '}'; + } + public File getCustompmMust() { return custompmMust; } diff --git a/src/main/java/at/medunigraz/imi/bst/trec/search/ElasticSearch.java b/src/main/java/at/medunigraz/imi/bst/trec/search/ElasticSearch.java index 29b1e0cc..73ecbddb 100644 --- a/src/main/java/at/medunigraz/imi/bst/trec/search/ElasticSearch.java +++ b/src/main/java/at/medunigraz/imi/bst/trec/search/ElasticSearch.java @@ -46,7 +46,7 @@ public List query(JSONObject jsonQuery) { private List query(QueryBuilder qb) { SearchRequestBuilder searchRequestBuilder = client.prepareSearch(index).setTypes(types).setQuery(qb) - .setSize(20000).addStoredField("_id"); + .setSize(10000).addStoredField("_id"); SearchResponse response = searchRequestBuilder.get(); //LOG.trace(JsonUtils.prettify(response.toString())); diff --git a/src/main/resources/log4j2.properties b/src/main/resources/log4j2.properties index baf2f50e..f8d7a14c 100644 --- a/src/main/resources/log4j2.properties +++ b/src/main/resources/log4j2.properties @@ -25,4 +25,4 @@ logger.file.appenderRef.file.ref = FILE loggers = trec logger.trec.name = at.medunigraz.imi.bst.trec -logger.trec.level = info \ No newline at end of file +logger.trec.level = debug \ No newline at end of file diff --git 
a/src/main/resources/subtemplates/sigir19_experiments_biomed/mutations.json b/src/main/resources/subtemplates/sigir19_experiments_biomed/mutations.json new file mode 100644 index 00000000..c3a7d779 --- /dev/null +++ b/src/main/resources/subtemplates/sigir19_experiments_biomed/mutations.json @@ -0,0 +1,8 @@ +{ + "match": { + "mutations": { + "query": "{{gene}}", + "boost": {{mut_boost}} + } + } +} \ No newline at end of file diff --git a/src/main/resources/templates/sigir19_experiments_biomed/baseline.json b/src/main/resources/templates/sigir19_experiments_biomed/baseline.json index 8eb3e61f..91b7db6c 100644 --- a/src/main/resources/templates/sigir19_experiments_biomed/baseline.json +++ b/src/main/resources/templates/sigir19_experiments_biomed/baseline.json @@ -3,9 +3,6 @@ "must": [ {{sigir19_experiments_biomed/disease.json}}, {{sigir19_experiments_biomed/gene.json}} - ], - "should": [ - {{sigir19_experiments_biomed/interactions.json}} ] } } diff --git a/src/main/resources/templates/sigir19_experiments_biomed/cancer.json b/src/main/resources/templates/sigir19_experiments_biomed/cancer.json new file mode 100644 index 00000000..c6d17cdd --- /dev/null +++ b/src/main/resources/templates/sigir19_experiments_biomed/cancer.json @@ -0,0 +1,11 @@ +{ + "bool": { + "must": [ + {{sigir19_experiments_biomed/disease.json}}, + {{sigir19_experiments_biomed/gene_plus_genefield.json}} + ], + "should": [ + {{sigir19_experiments_biomed/cancer.json}}, + ] + } +} diff --git a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals.json b/src/main/resources/templates/sigir19_experiments_biomed/chemo.json similarity index 50% rename from src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals.json rename to src/main/resources/templates/sigir19_experiments_biomed/chemo.json index d7b2e392..91b7dc0d 100644 --- 
a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals.json +++ b/src/main/resources/templates/sigir19_experiments_biomed/chemo.json @@ -5,11 +5,7 @@ {{sigir19_experiments_biomed/gene_plus_genefield.json}} ], "should": [ - {{sigir19_experiments_biomed/positive_boosters.json}}, - {{sigir19_experiments_biomed/negative_boosters.json}}, {{sigir19_experiments_biomed/chemotherapy.json}}, - {{sigir19_experiments_biomed/cancer.json}}, - {{sigir19_experiments_biomed/dna.json}} ] } } diff --git a/src/main/resources/templates/sigir19_experiments_biomed/dgi.json b/src/main/resources/templates/sigir19_experiments_biomed/dgi.json new file mode 100644 index 00000000..8eb3e61f --- /dev/null +++ b/src/main/resources/templates/sigir19_experiments_biomed/dgi.json @@ -0,0 +1,11 @@ +{ + "bool": { + "must": [ + {{sigir19_experiments_biomed/disease.json}}, + {{sigir19_experiments_biomed/gene.json}} + ], + "should": [ + {{sigir19_experiments_biomed/interactions.json}} + ] + } +} diff --git a/src/main/resources/templates/sigir19_experiments_biomed/dna.json b/src/main/resources/templates/sigir19_experiments_biomed/dna.json new file mode 100644 index 00000000..8c135ee3 --- /dev/null +++ b/src/main/resources/templates/sigir19_experiments_biomed/dna.json @@ -0,0 +1,11 @@ +{ + "bool": { + "must": [ + {{sigir19_experiments_biomed/disease.json}}, + {{sigir19_experiments_biomed/gene_plus_genefield.json}} + ], + "should": [ + {{sigir19_experiments_biomed/dna.json}}, + ] + } +} diff --git a/src/main/resources/templates/sigir19_experiments_biomed/baseline_plus_genefield.json b/src/main/resources/templates/sigir19_experiments_biomed/genefield.json similarity index 100% rename from src/main/resources/templates/sigir19_experiments_biomed/baseline_plus_genefield.json rename to src/main/resources/templates/sigir19_experiments_biomed/genefield.json diff --git a/src/main/resources/templates/sigir19_experiments_biomed/baseline_plus_genefield_custompm_must.json 
b/src/main/resources/templates/sigir19_experiments_biomed/genefield_custompm_must.json similarity index 100% rename from src/main/resources/templates/sigir19_experiments_biomed/baseline_plus_genefield_custompm_must.json rename to src/main/resources/templates/sigir19_experiments_biomed/genefield_custompm_must.json diff --git a/src/main/resources/templates/sigir19_experiments_biomed/baseline_plus_genefield_custompm_should.json b/src/main/resources/templates/sigir19_experiments_biomed/genefield_custompm_should.json similarity index 100% rename from src/main/resources/templates/sigir19_experiments_biomed/baseline_plus_genefield_custompm_should.json rename to src/main/resources/templates/sigir19_experiments_biomed/genefield_custompm_should.json diff --git a/src/main/resources/templates/sigir19_experiments_biomed/mutations.json b/src/main/resources/templates/sigir19_experiments_biomed/mutations.json new file mode 100644 index 00000000..2b6f98e9 --- /dev/null +++ b/src/main/resources/templates/sigir19_experiments_biomed/mutations.json @@ -0,0 +1,11 @@ +{ + "bool": { + "must": [ + {{sigir19_experiments_biomed/disease.json}}, + {{sigir19_experiments_biomed/gene_plus_genefield.json}} + ], + "should": [ + {{sigir19_experiments_biomed/mutations.json}} + ] + } +} diff --git a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_custompm_should.json b/src/main/resources/templates/sigir19_experiments_biomed/negative_boosters.json similarity index 68% rename from src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_custompm_should.json rename to src/main/resources/templates/sigir19_experiments_biomed/negative_boosters.json index f004bec3..507f5053 100644 --- a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_custompm_should.json +++ b/src/main/resources/templates/sigir19_experiments_biomed/negative_boosters.json @@ -5,9 +5,7 @@ {{sigir19_experiments_biomed/gene_plus_genefield.json}} ], "should": [ - 
{{sigir19_experiments_biomed/positive_boosters.json}}, {{sigir19_experiments_biomed/negative_boosters.json}}, - {{sigir19_experiments_biomed/pm.json}} ] } } diff --git a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_boosters.json b/src/main/resources/templates/sigir19_experiments_biomed/pos_boosters.json similarity index 100% rename from src/main/resources/templates/sigir19_experiments_biomed/with_pos_boosters.json rename to src/main/resources/templates/sigir19_experiments_biomed/pos_boosters.json diff --git a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_boosters_custompm_must.json b/src/main/resources/templates/sigir19_experiments_biomed/pos_boosters_custompm_must.json similarity index 100% rename from src/main/resources/templates/sigir19_experiments_biomed/with_pos_boosters_custompm_must.json rename to src/main/resources/templates/sigir19_experiments_biomed/pos_boosters_custompm_must.json diff --git a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_boosters_custompm_should.json b/src/main/resources/templates/sigir19_experiments_biomed/pos_boosters_custompm_should.json similarity index 100% rename from src/main/resources/templates/sigir19_experiments_biomed/with_pos_boosters_custompm_should.json rename to src/main/resources/templates/sigir19_experiments_biomed/pos_boosters_custompm_should.json diff --git a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters.json b/src/main/resources/templates/sigir19_experiments_biomed/positive_boosters.json similarity index 66% rename from src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters.json rename to src/main/resources/templates/sigir19_experiments_biomed/positive_boosters.json index 4a5c6e1e..0386a37b 100644 --- a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters.json +++ b/src/main/resources/templates/sigir19_experiments_biomed/positive_boosters.json @@ -6,8 +6,6 @@ ], "should": [ 
{{sigir19_experiments_biomed/positive_boosters.json}}, - {{sigir19_experiments_biomed/negative_boosters.json}}, - {{sigir19_experiments_biomed/extra.json}} ] } } diff --git a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_custompm_must.json b/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_custompm_must.json deleted file mode 100644 index f2d1ebf2..00000000 --- a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_custompm_must.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "bool": { - "must": [ - {{sigir19_experiments_biomed/disease.json}}, - {{sigir19_experiments_biomed/gene_plus_genefield.json}}, - {{sigir19_experiments_biomed/pm.json}} - ], - "should": [ - {{sigir19_experiments_biomed/positive_boosters.json}}, - {{sigir19_experiments_biomed/negative_boosters.json}}, - {{sigir19_experiments_biomed/chemotherapy.json}}, - {{sigir19_experiments_biomed/cancer.json}}, - {{sigir19_experiments_biomed/dna.json}} - ] - } -} diff --git a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra.json b/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra.json deleted file mode 100644 index d060b1a0..00000000 --- a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "bool": { - "must": [ - {{sigir19_experiments_biomed/disease.json}}, - {{sigir19_experiments_biomed/gene_plus_genefield.json}} - ], - "should": [ - {{sigir19_experiments_biomed/positive_boosters.json}}, - {{sigir19_experiments_biomed/negative_boosters.json}}, - {{sigir19_experiments_biomed/chemotherapy.json}}, - {{sigir19_experiments_biomed/cancer.json}}, - {{sigir19_experiments_biomed/dna.json}}, - {{sigir19_experiments_biomed/extra.json}} - ] - } -} diff --git 
a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra_nonmel.json b/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra_nonmel.json deleted file mode 100644 index 80e13e5c..00000000 --- a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra_nonmel.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bool": { - "must": [ - {{sigir19_experiments_biomed/disease.json}}, - {{sigir19_experiments_biomed/gene_plus_genefield.json}} - ], - "should": [ - {{sigir19_experiments_biomed/extra.json}}, - {{sigir19_experiments_biomed/chemotherapy.json}}, - {{sigir19_experiments_biomed/cancer.json}}, - {{sigir19_experiments_biomed/dna.json}}, - {{sigir19_experiments_biomed/positive_boosters.json}}, - {{sigir19_experiments_biomed/negative_boosters.json}} - ], - "must_not": [ - {{sigir19_experiments_biomed/non_melanoma.json}} - ] - } -} diff --git a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra_nonmel_custompm_must.json b/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra_nonmel_custompm_must.json deleted file mode 100644 index d7b46306..00000000 --- a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra_nonmel_custompm_must.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "bool": { - "must": [ - {{sigir19_experiments_biomed/disease.json}}, - {{sigir19_experiments_biomed/gene_plus_genefield.json}}, - {{sigir19_experiments_biomed/pm.json}} - ], - "should": [ - {{sigir19_experiments_biomed/extra.json}}, - {{sigir19_experiments_biomed/chemotherapy.json}}, - {{sigir19_experiments_biomed/cancer.json}}, - {{sigir19_experiments_biomed/dna.json}}, - {{sigir19_experiments_biomed/positive_boosters.json}}, - {{sigir19_experiments_biomed/negative_boosters.json}} - ], - "must_not": [ - 
{{sigir19_experiments_biomed/non_melanoma.json}} - ] - } -} diff --git a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra_nonmel_should.json b/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra_nonmel_should.json deleted file mode 100644 index 9ce8cb74..00000000 --- a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra_nonmel_should.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "bool": { - "must": [ - {{sigir19_experiments_biomed/disease.json}}, - {{sigir19_experiments_biomed/gene_plus_genefield.json}} - ], - "should": [ - {{sigir19_experiments_biomed/positive_boosters.json}}, - {{sigir19_experiments_biomed/negative_boosters.json}}, - {{sigir19_experiments_biomed/chemotherapy.json}}, - {{sigir19_experiments_biomed/cancer.json}}, - {{sigir19_experiments_biomed/dna.json}}, - {{sigir19_experiments_biomed/extra.json}}, - {{sigir19_experiments_biomed/non_melanoma.json}} - ] - } -} diff --git a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_gcustompm_must.json b/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_gcustompm_must.json deleted file mode 100644 index ebc462c2..00000000 --- a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_gcustompm_must.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "bool": { - "must": [ - {{sigir19_experiments_biomed/disease.json}}, - {{sigir19_experiments_biomed/gene_plus_genefield.json}}, - {{sigir19_experiments_biomed/pm.json}} - ], - "should": [ - {{sigir19_experiments_biomed/positive_boosters.json}}, - {{sigir19_experiments_biomed/negative_boosters.json}} - ] - } -} diff --git a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra_nonmel_custompm_should.json b/src/main/resources/templates/sigir19_pmclass_biomed/all_custompm_must.json similarity index 79% 
rename from src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra_nonmel_custompm_should.json rename to src/main/resources/templates/sigir19_pmclass_biomed/all_custompm_must.json index 5d8c78eb..cb36e4e5 100644 --- a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_extra_nonmel_custompm_should.json +++ b/src/main/resources/templates/sigir19_pmclass_biomed/all_custompm_must.json @@ -3,18 +3,16 @@ "must": [ {{sigir19_experiments_biomed/disease.json}}, {{sigir19_experiments_biomed/gene_plus_genefield.json}}, + {{sigir19_experiments_biomed/pm.json}} ], "should": [ - {{sigir19_experiments_biomed/extra.json}}, + {{sigir19_experiments_biomed/positive_boosters.json}}, + {{sigir19_experiments_biomed/negative_boosters.json}}, {{sigir19_experiments_biomed/chemotherapy.json}}, {{sigir19_experiments_biomed/cancer.json}}, {{sigir19_experiments_biomed/dna.json}}, - {{sigir19_experiments_biomed/positive_boosters.json}}, - {{sigir19_experiments_biomed/negative_boosters.json}}, - {{sigir19_experiments_biomed/pm.json}} - ], - "must_not": [ - {{sigir19_experiments_biomed/non_melanoma.json}} + {{sigir19_experiments_biomed/mutations.json}}, + {{sigir19_experiments_biomed/interactions.json}} ] } } diff --git a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_custompm_should.json b/src/main/resources/templates/sigir19_pmclass_biomed/all_custompm_should.json similarity index 81% rename from src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_custompm_should.json rename to src/main/resources/templates/sigir19_pmclass_biomed/all_custompm_should.json index 59946005..5229e907 100644 --- a/src/main/resources/templates/sigir19_experiments_biomed/with_pos_neg_boosters_additional_signals_custompm_should.json +++ b/src/main/resources/templates/sigir19_pmclass_biomed/all_custompm_should.json @@ -10,6 +10,8 @@ 
{{sigir19_experiments_biomed/chemotherapy.json}}, {{sigir19_experiments_biomed/cancer.json}}, {{sigir19_experiments_biomed/dna.json}}, + {{sigir19_experiments_biomed/mutations.json}}, + {{sigir19_experiments_biomed/interactions.json}}, {{sigir19_experiments_biomed/pm.json}} ] } diff --git a/src/main/resources/templates/sigir19_pmclass_biomed/baseline_custompm_must.json b/src/main/resources/templates/sigir19_pmclass_biomed/baseline_custompm_must.json new file mode 100644 index 00000000..991d6ee6 --- /dev/null +++ b/src/main/resources/templates/sigir19_pmclass_biomed/baseline_custompm_must.json @@ -0,0 +1,9 @@ +{ + "bool": { + "must": [ + {{sigir19_experiments_biomed/disease.json}}, + {{sigir19_experiments_biomed/gene.json}}, + {{sigir19_experiments_biomed/pm.json}} + ] + } +} diff --git a/src/main/resources/templates/sigir19_pmclass_biomed/baseline_custompm_should.json b/src/main/resources/templates/sigir19_pmclass_biomed/baseline_custompm_should.json new file mode 100644 index 00000000..888f693d --- /dev/null +++ b/src/main/resources/templates/sigir19_pmclass_biomed/baseline_custompm_should.json @@ -0,0 +1,11 @@ +{ + "bool": { + "must": [ + {{sigir19_experiments_biomed/disease.json}}, + {{sigir19_experiments_biomed/gene.json}} + ], + "should": [ + {{sigir19_experiments_biomed/pm.json}} + ] + } +} diff --git a/uima/corpus-to-json-pipeline/aeDescriptions.bin b/uima/corpus-to-json-pipeline/aeDescriptions.bin deleted file mode 100644 index 665a8492e104a67e962def8efdf29300b4a78bae..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 58 zcmZ4UmVvdnh(S0ju`E%qv?Mb}&#|Z|vC=2AxTK=-lI+amiF2757(E$SiZiQHD+(BZ KpqvFnRsaBEClVU~ diff --git a/uima/corpus-to-json-pipeline/ccDescriptions.bin b/uima/corpus-to-json-pipeline/ccDescriptions.bin deleted file mode 100644 index aeac682a3d91b03e720bfe5f01a268682c7e94ed..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 18766 
zcmeHPeT*Dud7oSR?DP5J&tO7SmkFfAp?9-R?3CC}NY?jZ?{$55z4h7ZInpt^`|j=7 z^Oc!*eY-_Oe5gSI6)4nF2@tgsf1otv4`>UeMGYZp3ItlGO@!J?p;Rr3N-FV3TLD$o z-}AgPvopJU9}e`NcCBP@cjleC< z`(8Z!$>+__Y}1B{+V;@;4bk#7O&eO5tHIa{3&-3c}c=dd=dY)F#S3S3Gtu_ObmJag^ z5s9Z2g_F1@O;?7u&k_gcgfwrTj8?&lhcZtun1SgCDT0T!N3?AvZBH#?$_vi7eId02 zie%e#SMy~VSnew3*k`$xoXoIxFg5P9=h?z^i9Oo`gP@Y8A1-Nov6h*Vo7CR9Wv)r) zdEzGZJeJrM`l4#pEfGlV?vfX*=0R7rA@WVjG4p{4y=G7qd0Mz;N>j~%H}jRtqAD@L z7!9O37fjiR=-k!}EUE2LJh?*Vmllf{y0hijbC4I0YWZX`Ctt70+}Kxy4sqBFElGQT+B#3LTcj~(;+XzekX4V z*$@Fd&Q;+SqWNIW+sTht=L%OQd#9xBCgDZiGg>h5J@6MWT)LX|NVG6rJzUa;R|BufL*P|* zSmA;&*@f-EC&#NbZMd$}8F6gnP}#Li##Al}2-m_EX`jL#r%nF$v_2EL8cME4My|J= zyz^JT|{=o0lVP5u9&Ovv+@0_FsayswqVS^z?@7V zMupO{yeq;%})QK;71=1wX&Nd z(%^yb|K4Z+^L-z@7n`FPZ;n9Q*V%A$O{XG)N1wWOc=zvo@54FlM~a*qfIlb-$jwyB7z>61H z5EMdNsMiC}(aWA4+!5-2fZ$`=ZETfMJ^F|EVAphskY?xwdR*wU6zEH4Wt=_*LI*9T z+eNHzH?e~5m{*_>jKi3S0JB;ZP$agXZrY)zSK7&18Ga37xD&1QhUxpl4Rw&-kl;+8 zMMePDp)`HYlR|Mu^mPP7l+hZr;dO{&Rk#AdUnDE~BB*2I+HD=a%c^xTQ4@8uX-g%) zwT4w~=rs>AgMjrwJlwRvJT0i0HJ}#Zj_VC?O+XzioxUQaXc|kS%IeZ9LT`qmhQC!4 zZ+baj8v)fQdO^ZxPKA-##-8}y=MKTRSdF;qrkP@-Y5X@*v~B3zi3Ff7MMic0p1 zi_?59cB$C)D|Gv#AA0Ne-!}H}Ejev;SsQg00?UD8r~LBBbZPqB$`oGD6wXht6y^%0 zUoRKSE7NmligVK|<%Q|V;!JURu}tK^Asty*oL{(s@%L# zwW*#wuuJNlba35S5V&q}W+*#e^MWf_+!hW(SUQ{C4^?>N21G4Q+$HTMr|i{D#4sOj z19|^DE^VEl)RU)>*xc7`KXFE@3}-RQZztjNi1$RcY&dRl$8*npZuq}A7#((|TS!k} zY4Jl-0(Mum`nu za?__hBx+=$+4?QtYofg>OCD*Tx}?2%on0FfgaoX2&MwY6*3h3RN;Bh)9?Ad}pu~1M z*<%LlP8)6S_*u+%z?lh)u=>^{`3=1b#@lZzynv9mQYzPc`gJcA&w*t| zv5ep5lr9RiW7r(s_06!8@|MHS(o%6M8okrG07vX`1%oI(f{&@3p9YcGCx)BfeB&{@GZ)c$4iRP-bbtD1(mm_@!oYgWCE63vyUS%4eR znBK~vYB%BPP_PJN)zBMK`X}>w!XItm6!PSWV@HqX4SmY10tiAF(bp{7riuj^hSV!f z`^vX5ls^2 zMao9{Ec7%jq^70Ou0-Gq38-R(p2;_1Zr6pamX+mm4zs7?GA4&TB37Zo`}f{#1nu%01sph zX~`ry#fn){S(_nj3T1S~l+^~zLAeZ+sGfe$Q5GuSDFbLr6R;l=%$8@x)`+bdwFlr0*pD*js>(KN5w=pp_VSxH ztSZ?MSdWekP|hfoGGaoI2-#qhRtGr_DpsXI88oZ13Dckfm|fB?iuNTCV!tUnN>SFN zAQ;xVGAf*=fS<^y2=gIC!O*i}An0SwY?D8T2sZUdh4EHSC{Gh^=*2pO47Y~DtwV~E 
zJ9d4%r^x;UX$S<2!_MJnVwuJBCn^CEu~n0$?Cqj7Z7-SK4s*$k{T$3ipni*7f>{X> z&8jj{UqWPLUmM%Zl51O6M3<;z8yfGO6r4k$(5t?N^MbxcL*^sQ8rgv70*Q+BMIO9Y zW9&4Pg^_WgX=JhdVnxa#v|x5(tMVgA~p@6|kCqD%ulX3Bk z%6j1Lafl#jgrYA^*|?5#t*f*HKyG3Sx^R5i=8cfCpDy$0JV3^Cpx3%KvY&0U8`uv8 zjEx;l%5r8b8yoqwX9lq=+SpXOxuDU#v;!h-6HLkgAPFxd@X)j&L|Tp@KmfL;o_0&# z6#_ytUSO$st^h$M(66D-*J1Mtgyg6JAeL=R&6`$^ZdL;io>e6aIJAouc!U!I3NrO3 zUL=GO-(qix=q`d)LYP5v8{WmS041q$*ko>~pi8nFU*SFEA(R7&<0|uVCu|uv;0eJw z0tkD`#O6qF4hq#(#*s)yWRj75#$lu|x}anB5rY*{6v73>+p+Z6p^|9zXat36ACX1* ze0FJUz-*U^jYo0cL)<6;QdWY^E>%aep&WZk6B9M^)rfcud&eG<%!&XdY+ZHCsym|E zFkLHjA|SY}vr5U!h7FKdONPQ=m&lR#T-O`I_BkM9coX5zLGl5;s1U}-?aUKNuVX3+ zfj)FZ{v45$&c6`i=!_u+!PD*=o9XO=aQ%IUMB}eG(V%=KQ5c^jkU3M>10DMrA~ce^DE1L2XYmQj>@XCs$8HR>^nthh?PKr$?8Tgxy98V(sR-jiXb9L!2}=ny*FFoT zMdy_VP%kQYV8#nn-Ad{8wxncKvu;x2WoP}wyH0&<{#R~(GARf+k&MqawB*9W43(t$qywIur}`BqBeTKFIK>{h1aa*Jr6DB zG@QHkmW@;lMh{Z=OCJHJ?Q9wEuo9qs~h>=ck!r634e<5;Qh#R9iC#TKD5* z6l#r46xqGT=gX799`wZ(}QR&+NrV=n=ZZy9Npi0oU%b1%^6FcZB{zX8~z`Dr5vqGye zs)2(#Zh#_PMQaAa=Im0bIE`)u7UVI0B{9Wk&yf3(UV2qZ1P}n}|13I?E~*A3TIznD zqR3I0Ktzq-%_r~Z+6a#!Fe-N&r8zzt;3&0FJpz=KaLWaiuy zqJRJrq(MlVk*%X*<5eg4LXwlt0kF5D_6G@g(rO$PQg*EWU=+7N(G-;+PG29ZA14 zAYK_DToAFJOcS9Y%)Xk)6>AAo8AOOqduxPpP}7IX?IO8Bk|;08;SkV>G6jGZig<7c zycJ61#BsmU8{8;>Sw8gYljJFzB8-ll#r8AH#f9{!ol(nOVtPYET>n>8&uGdf62Pfr z4oS_<%pU2OA9fNM?j1m-#2t5%uG4DfTHc0S`#e!cXAM1)NLXQ4n2aUU? z(IW(ctjOTilc73eC4>*)6SY{HYz(L#X+|z$9TX@ENdvaL+JKx!X$m#-`qCP}jwne6 zuwxe@leU)u>^OiOyUe5GQ3tRi<5mOM5n%`clwjEb>=+{u;Q)5zjPCy?>_`>S`#Dhl zCRIS3eDu5mvw z@B{kryXsk=za#TFUE!Bc@iTpE?e*_~%GXzgAIfmv`L@pp(eo_dT-2DtuQBb<2y|=j6Y|u7@wva*F5Vea?=`m;<8ng{@ zf9s%ah!V)C;ae|GB?lD$FWZKw8tc^W^!qL+x=*2FAgA-C4eG8oGHp2Ab?`5*J@Ul$ z+5fnPuAxjv0sV3Sw*h5;^d$P75Y_qsbfD+>M{4)x>4=nHR`8Q6=u|QD<>Hy*;@wB@ zI+6FA6-U%2&;mE%I}_Ze&ENl@a5*g;dVA*dhwH~%hUgF7;q$YuYwI+x#gD9;U*PC@ zGJS^ZI~`&S`T+}df@Lf?lhmTlL>65%{&peCq>P^CT%l&!T; z3{Br)3kC7@zYfGm?F~Knl7XLUf;J1okn5q?z2pp!L zfT+{?=gEA-b40#w;tR<50(5>7eLeV*9yB`S@sl?a2k*Y4egubXvKdZDPyI|u+(VH! 
a@;KbB=9!*fy89?0UihOhFQTdCxBdrx4IcXd diff --git a/uima/corpus-to-json-pipeline/cmDescriptions.bin b/uima/corpus-to-json-pipeline/cmDescriptions.bin deleted file mode 100644 index 460b94b87cf5b48b449a7227dca508bc23314aef..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24440 zcmeHPd5m3GdA~FEc#GpD4$dM4E)ZpGYQITBpu|o{X0!dwcs$G-$6%*2ciuZQbM1HU zeYwjlED?*E3Mx^gHh&Zq)C!6aqExM_R)r$b0-`_x1vTA3f>N~zl~h9NB9=e;`+et} zyS(?tGY&yrm`i`&P^_ySt zKCsOhs5{#e|IKQ(<2cU1%69(6q}Q}m?P}KcRok1dU2R6Os&)L1YWtz8&1ZhQrQ+JW zm#A9c`C)BBCC%9Hq<$1){`Noo##eth=v;2i+fRL!q|W{+ zLN!9DM$UPilT*&FrkAS4C|*mQ)>KQ4*xg5D_Yu4Mh}nIFyN@)ZaKT^9Vvk!VBQplX zF%983o~2%xCU>9Z3C^k1yL;U1f*lW6X3u!B7pPRlPdHCH+oqg-Epw%XUOC#K(zvlGYAY|~@!Am%2q?My(3GPjj%xY~zKT=l{;4Ql906R-B{h=yfj1JXB+BBZ8sWw*FZJ{a_;7yys%oZK zU@r%9%^5FUGH`CoVn22E=`)2Y9l0=D$IzXtL3HkCM#gIF94y%WjF&8d$%c0R!=Hcl@SC35<_w;4_Ov6f zb;@g|QCxR+r%OQ3Qq*p(b{=^YEbKUYS9am=0sMb3bq4(~;YsB`kr8J#b+$!W+Vgl- zo(v9f_`WA&XMVF-sW4+wy8%|#U^&h$j)QaWlXH_xYm;FXlz;4`&YmD^r+x>59d|k) zn#PihHoW>3H-7uGzxAVk11&B&L(5(}qi#r@2YU|^E020$HSlXKaE;az$MN-eYyNn# z5w>}#cd`}eKI){4wQ&Tfl)w4nptJ9ibI6NRe*tu>L-2gB4b|1UuY^>*qM!}mAw^GqC3G|ZztMnj1LJ5W+!)To$# zpt%$!DztnUK@7{fpdLcMsTc!qZbm^T3RRfqA~faPL=+k&YBtPO5Yz_vmrtvTi5v6j zT~p4`VjN{M1j=QHpUfyvG}#V(3!+xmh6Wp}pyPUaRW!`Fg>qH_xjrOi?Ic8*8aY% z;>0S_Rq*s7If>;v&6Icygytp;!g>{3JNxdhJy5&%cw>5Wrg7%{97KSN^1ne5>p^}! 
z{3QS0nmRl5TB}t#F?77;JHP$m|9Z!-yasYow{j9Y2MXz%&VqRrKlSma@4D$XzWu9% z5Kb|%sdLBK@n}}LZ#Y+XC!l`GXzr|tQtG^w9};(bv?0G!PrD0o6u6!n!xqzkcx{)D zW!jOMC`lKgyN=e}xuq;Q=7O=^1f)_eE}{}Q@YeV+bmx`ZiKC`U64lz{n*SEEn4(aV z2ZD^7dlcX{qT^2fIg~n&mau9Z1C6B()xdE*&ge(^=cb;$gaG?wGby0R84h7Z6B~`LJ5Df|@Nu38a;RskL>@29L zyMp|o7kW@z@TCe*3v6&B%l28}BoKOHU4&W}MPt^2rTr8^=j-2lVdce*cg~1Z_3%aN z+}gb~#S&i8b(!}Kfj=>JF1L@GC%{Lix)+NLscTg@VJ#?K&{R3@=%axQ0`s3ed;g0* z{mGwvOmkDcdr=^%y_WdN6%|6+!Lc@On#RGX`E^o$v>1_fpZ&o@Pwn6P_QwaEU6-6Y zgVAIyY%ax77-h-1sHGZNrvp=@T9=)hgQ}-Ga(NR}%%`lKyzIOrm{4uC2p_sXK(H~r zYj&Nia6tR7K}#B(YDX*LzQ^!&Qv?hrdO%%w*{-=*@0Q?HQipu4780L2uj*OR(zSrF zXBrke9Gvec9dUSV&>UZMO!MF-7VJ9}i|!8?WX!~V1b15;jLmrP)j3$^CRwJjmZU0x z%eD!CxP%^G(tecA_0?UKGyC%T$kg}0zwmb-{T=Y+l=JeSqCE0IYeF^K2)d{&LEuxH z0CwYbfP>Ai+54P#7#s1=w^$+(#IJ#Z9p`fIfhk^?i>J!xO zaD{dTk3r=};-um0E_^MyH5~LHbkB6Rny@_Dr5`rNjOdDQ|J9`vKm6YNkHQRHb`IhM zK@f#GL4cZ*z2a+kETp3UW>62eR#AVNE-4oM#T#B!W@5qHs|yy85>~S>Sb@Qy#wWpI`PL@A{`dwhopNjTE?-QT>)4$yQt$*D==mva2?^t zUvl;aV(A`ZeAhzg--mZ!l)EYC;c~}*OO+5@V(iOU*^U6U4_z)LK=MS%kZ7h783vqm z;L}jbKBKC{^%vl!`U$*T#+33;;wEa5ZV!A}FAO8?{)qEibL$KEHFeX-UGUn8(vB}( zYH^vd5F$eOdYFnKvFd=nSjlDNg-}S@1@U_;YG&LekwyK~#bmQ|8fRG9>kr3b>~)qn zINu4_75^K~d0V-Q$a{tEik~jY!{IqT9dH-~0Mw z-~HZQZ#kWFdg|;n2?ufC<{YrYN9M=@1%mJ;R!?L;gBfg&0H#8hE|$S)gG>y^`90UB z7yveq1U8B6hy=e@PA`Xzf!Y=WDDzDSWOz`H&akY+iJAjQb&0s#Cq7u*i;XeIn6-GlIj}87421dYB%n zl`3^dw~LskLrX(29bP7iXipUmV4}yKa~eb> zr2EYcve`Lr6+Uyll~JtU$SArWRnK5p;0IZNZKqr>Me1}u17|VoOghc4LH=uQ!w;L9 z-H7F?C;)pFDo{?tpD!f%4QPkmC9i`x%3X+J7oNqU%9k;iWT=ZhWesRF44YR?4-799 z2)tDRGhf+&NC|H$)lw}I6tfEs_1f@L1?@Wbs^tm`=>zvUZ4^_z<#b9i$f$rq6f8wq zyQSI4FgtkMiU9F`xE?m4At9lo7LN^}u|fcopD4uYmrSZm@DJD%05G2|Ea0Cm8X(f? 
zMx0=`37i|;iYQUfav`!4@BwWVgJ;P*Ku66TTXPlEY_!JT;b1Uz#E&Vm;fdS!uW5Wk z0K)|NaTmKDxq5WjPcpB)9B5U(6s2y`iL@v&2zChsumUp_A_M4?8ZZJ#qiG33>W#2a zl&XAYjtT?dzZp<)`3en@_a)W~?E-yI)|8V5-Zh9&*Ai|9oyt(pi`Ph%JKms>SWj}2 zj0ti#Cd}6^)X$B&6BowELFBetrfw*BOXM~a9D9-wDl)26`{476HsryHCs36FBuOJa zE0P}AL~JWg4+&ZEz=@(vI10K#8%Ie?QY`t@Dp14sh62_=z@SpmVy!@&c-NapMqE#f zBW9&UGPn>S$_)$_&!u1qDOl=91W{ZE7A}QAFs9ORLCqykYKmns(vV`07m#AfAjt^~ zbW=W2i6WA5FEU+%;HNmhpttSM#~_x7eyeos0%7Key(=$yP~CkG+05dYiW*rpIjmNU zK+3q6%tcecXhkl&&7Jdu!iS-w0mTM#5bL>zV0{Kq%+MOVna#^*dJ`tyS9+qI3h*-h z9oW2-3$jKfsd7ywV{i;NKo!YwueBzM)8q&#uV&y13M0U^>{4?lV0|E)RGPw}hAX)v zxl4O+a*hP)s|vV8(4JWWCce=6*r9Luiaywp4Y?1YGilK(K7@ZDu>d_cAvC07%H^Ce zKWf=}2=g$&i-=ML^IVu-6Amk6h}E)>B#Pll(7bXxqMT_!v&myBHJ_eG`eofWzV=GFJTSuY=HcpqIqO~SzgA;L}uYoW`6&zF?5K)I710KfOqD??5Fd56aFqoq-5mNGOx;7E+CRii@WZGKTlBoE+ z2m+X9m2I&kPjEa**FbN=sxgeC>jj7mz(NBLpa2lojM@?0IaOP%xx>Ml9@wYsNlld0 z=AvsV92RTX(m|>gR~;Q_To-H!LrAXl77+|U6_Wuq9d$qh%_n(&3mvc{=~&Ik96cVv zCqpKOh$}L+?gH3Z45jDFiArp24w{qH&=K<)k$~Nabj=Rta>qskkqAe zO#v-NnLUvyD1b`TcOjpkNn=iBMYaTl1(dg#H)vk(6qn`rRT=GIc31|QV zuoCqbL&1hEGet29Xf0@KI}m1d}KGWUKnF~aZ>|2UPL8108DXa;RP&*dm;Dwji}-NR>!)A>kgo9jHjGPxxR_@t)yJ(c zcZ*$8rXS%*D9n9QJ-Byg$QCxG+!1UmQ&_T!rzZUd^8$&xPkv$%Z_t1(| z5)P*l0k}ubG+sY7F@~CO18E#(k_|v%!9$cv*ktY^p@b|Vo(7ly-jm&h%FXzjJ!3nBz93XY`3;U}(;`FBl>O~{GX#W(#u=1Me|0W{;(S#QZmzU5u= zANN$+q*?O=MX;_BwS>2FYxOszg}k*6Ba89OsQ}N6V?`U3jHD3?V@8q9gu!x;h)XoA zf8&gr=puePsuA03rgJCN+ED=x!qkDWONq&$IU1LB(a0Ry$%~`2)Ai}oC!iQ0VD$T4 za0BcKJ6Q$J25#+&yXR4NsAa?QY$7C{E^%?^;(!aWNpAhb|0hh^9bDpCsZuwnQS9H z%uarqsP=*@3K3TjWjPYLmq-o|&ATwB&#=!k+;q839#Dt@Q_u(2WvNZ%`(p#PStV3K5~wVl@t@ zK)lE65mD6q&=exJYOr2OqES6PcZzYL#0k((%&jK!g-VBQm9FjYv~EbQQkadZ2)j|2 zm1ml7jkE~D4D0J)#5k||S*(z{vpI1s12$1m&;*Zc=5$ zbI8Z(On$-ykbpygORhA%SN-)joqXTA(4?Vxk=7GkvirOqYs=V^MF{(x;}6QlKHdmE3aM-a5NpmlWU#F(lS zndzoZF_krVy8Ly*n$4n#$ut&rIvv)jb;HjFR4z6AJolsbwbSo?>^*taPT2k7O%l1M zc}58R^vrYqrG1aj4mxN}?JYaAm~1BUJ%LTN4w&(!?hCXI>|%c$3T{~B7d9g@dNgFE z!hDPvP)>)I-jK-G{#S> 
z6=eNYqD6GT8MqO^`gSD=V9(t5)Q=yyGaO!*eR8_9DEY+-IWMCXnk$1P9@q>^aBP{W zHr2G#5i9W(=m|DT-@i#pXnM*CuSECkfB5kie*~z9fXuH^S2j>l@&%J<71|w4Ro^8k zajaE{+tH5OhSHhn?ReDLNkovPdXlb%x8}uHD7Ee%m%-$4jj#>4vKwK0hOSI}@ZPt5 zqRgJ)Q1ssQf<=dL-IodUkD^@%wQ7DLT8F<$7Adz5e}fNP+2&R97gTg?5AwQQrAMxN z80jjVmBY!Bua&>A6E0z=k0YO6pMTQbKLlhBY5Hw7L{)j3K2O!UQaL1XBpgM`WDZUc zq9f!{kk1LhGDWTwo)0tWcn*`&6d95v++ZAJP+0@*aiB~e32&`wCk;LjO|c< zXnRfKOsW|YL|5t=r))+%Pi4poNP?9xqzymgbEL*>e^e>qs*}W0%3`+z2-`E6C^R{K zW-NeNaI2~~7@)dYm_PxO+VQUWDBtmZ7 zynIv%PqxfdszFSn#KO5IvrdnP?u*n%0d%P^%hvErFR%GYAYCmCuKnfLXy%NVNufY}sA%TDk{11omEAI@V2!wQfoT6Go0NH7#J7%#TY9C|;dehVc z^!a#YL*9@?)c~%+s({1br<9*Pdts_R=}vj`?rA?glg)z`XuL&^T)*Y2yO-#f?Vht( z?UP59pomT2TU~MMdM{gDaX<5}xUD{uk}9<&R2_=vL2aSRxB5)jKmpaZ)n~HRXQGW} zx0G!4naG__TYV;5eI|vEUKp0DlWelsw&v|>7k#VG#6xpB9P_O{lf0?v=dsU(alywW zbozZ1i?&O|^uMONVPNlQQCebvih>j4V*PG3Uth}e^-{Xx3t^GmJ12#Va`}P}P(aqo zKre+x2i5O*Y76oY=G8ZSg~l=6s)2rt_3wCsyLej>Gjqjb?rFT3?S*UT z(IAmO&sw0ln(7O>ocv`z;%{GNfuA3~CKYvldZws256Vx^^6B4VpL=@?lPu&UrHk4U zTm{EBb9AA%I(_*1(~*y`qGA(j zo8Rw1o&4e6z4Dh2d+(dnnTFz8vd$lUvJtO+AeQ?B_bf^tVj878u0DjjOdh?V)yHj1 zYq7yUW*5Hv-p}0QNjt=ztb?SBVw)xIjAM0A_Ca?5%B-cE$?)33KFwsv=dk-0eFxi@ zaRV7zqJ;yHG{MowyITahF1G1+0Z^$cL3##$?L9C5&OLjdxP8#s!+RcZj|>_b<=)LB zlT(xDu1w(X>Cw5#E2Gn+Q*Uh48&@W$PuHg>uQX;R$Lpu+ld}zTh9fhx=VxwU{Hr!Q z{>sJr+?gx&b2H~>=j>)LUFSNXhORpTn}iB&5XUwgco7YWx{1Qhxzk=El*YX0D=u9( zgT9EKD`2{?UfgzKt`q8A1-(KWUD9xLYRI3aeuVQLM3mz`@>mVQ3zt3r1fJ!q@ zO25ae<;avb+p7TKKi13hTA9j4ZPJxcw28?AkI3KacP3c9R2ZW5d*0Spbm_p4N0H*S zH&JRmjowcMmPO}su3&isf_Y!Xrnv!_U!cD%61<8(r9;(+$^dPO!Oi-D?Q$}FSf>O<8;4*%mc0@>z?O&SS%GHS69Lkryst4lA}1S8O~l>-aLA8y>Z1 zp0EHE9QlHo$pJK;QH!i^CSQ;qDcK{pywA^c!eF4aq2K*#6@@tPcC<8ZG_Ir%VPkHS zYboUp=qW$MD~IMv+y_20{rG|QXMQY~z24+Cd8vWC_#k?vUn8rC`@lQR#f?s>#g#8i z-P&!-xUZn5-&*t)NSec?+Z*McNom0vXv)MbwPx*j5S?Y(s$c wbsId*7qSC1*KPwJgp~hmwcCeoP`g2c^w&oOy8kaXuBlSzo{epmV7#;X-;qZO$N&HU diff --git a/uima/corpus-to-json-pipeline/crDescriptions.bin b/uima/corpus-to-json-pipeline/crDescriptions.bin deleted file mode 100644 
index cbcf8a38864d9cd0c6c690c2c925b8164f863a2b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 37897 zcmeHQdyE}deV$$0@gt6(wpLRB-iSvq%YFz3Cz6!j##-(tmAloxS%L5u}rmK=V= zw6hc6@=A2eoL8=U)uj0tS5tZ)pV*pqRokEURgyV(Psho^80OVpRAW6q^u`jE#=WGi z#`ximmw9>w{BU~il4@rdU^^e=m@{6sXy&=0m-v~pL$4`P+1T0HCLZ0q90sRAFL`RY zI|7k8{7gmWj+cD<(OvP!Uh##Oj*{@#Ws81#$6c9o_}U0kCdP8&?52PF>a!j7|jX$$_{h5wIc z&Zr+@fqRP)$CJccnfB9c`7KY~`dfeLy>Ejv(sVYa{u|VCm#;5vI3v8i{jtQETQBJ<5}P)zLEiaw{%;Rvl{h{tL`H?`*o_1wF1ga~>RAgdV4{9Qut8#7Y~IL-_SjXYNqg z3o^e86O$a$&*5wL4~}6Ke~8)XQvs|3rsp62YSh_r-r4IVnLiKyHld)t7l1RJ^UhXG zHBP;`02XPBeIj!XRwRuwBW7yMQ`w?QV4$w3Xxxkk;onHxWAx*C{ydY!R3UR|o3UhK z=ysJ19R&uj?`ki`sfw(;#!%O?&1gojgDSy;H?-rh8%HY2a#fplt|ujpu{9seO%==w zg``*miR!A|g+_+p-8N4B4>W1D5FGlX1a=S^~ zR%xm_+g$U%`CdYGXvhU2$n^upazp0aTcW3d4w}|$+5uw~Z^#*8O)RhO z8o0X5%XjggeO;0ETln)B|G6D3nt*p~Xz$pd(`$Gj)k^B87gYoiQVl=s290rZaZ0XP zeFa%B^7gO2<3(S)bNeH=j5^!SLk?yVKZMmF*4aNbJ$3TpB>p`%erD?8_^I*fUurd5 z7pG1gYo40A*qWJ|XdZ1&&9*olsNw#Z+0!%E@chfx`~1ao%`?X@Hc!r+o;_nf_JVLy z1!@5>sJgtC1rI_?*pGv2K6uX07Mrk8iS}@K=Xv4LAYKCQ%91z;R5GdB0dN&XF>@vd zFi9WP3uvw;M@?l;&~4SvI`>t4Epj5 z>z`2R&HLa#@9&o%i>|cb%o?mU0m0|mbJ~jFM9u3y_qmU3`hi%wTf?bkfKEu7eQ8p4 zRn$>Y8~%d=0;9dwlJ|yHN|fBi)=<18j6mkxGq7Ge?Sk@l7WOR={MInP2!_EmfxiXn zhU34MVTX)Bf*xL#pcbxb%GzmZ%qrx3&AZMn-PZc$8L^r|u`<+x-9LC$gY??apFDCh z$09He&HlOciD{!crSXcDuoIN2efit>KfZJOn;#x^ww#AqwS5=k?Fo;jO^EyQ1Pum)MOuoN4U9Or_<`3j*iGT-uF~el3t$ECiw-yMUB`e! zU&Ww1L&kqF?=d1F!F1~huBz`~nCoSj)=HYG5KeL(6XFp1`I76oy7j3_X0Emj6=4BH zCda0~^{x4T`pEA>Ca0Yjg%w@SW8jkA27`~TW3p?H0~*}E%I8`t%d|~C@{B)a$G~T{ z;WOQp#9vsnkbFCoqaQVI3QJ~kQ+W1lbJF#{lh5H88*M70k+u+gH_>0f-#lQ~ty-`X(1$#W*meB6BE>~wR=o%ZJ36Dsl!y2n%! 
zdeKTY^8InQ2%8Xaco_ngwgPre-A>%@5z)EL$<)na-0(U!s>wqcL^OfYR~f^U{WoJG z#3+bxl)S4F*9+3v?I3r_fKY;-F7jI(oG&9Y*zKhf2Lao-sTgQ?|3uu%Vp!h;c%X!K zTrcXlUZ(?3Akme~Bz5QG#Jv-xWgr4 z)SedvE1Hm&x|lHoMXo<5Dp%HxyFEmX%EW_qDWREro{x}8yIq9JZago;kj3s~&b$Z| z2?hJQ+Jzm25KJ~(jm&wW4r#~jn(3J&KrKKRK-+54)1^{tbhL)#meA4D-+OnEee=n8 zCwhoupU9YtqGR@KVJ~?{-TfOm|z!*!4cXOkQ<3p2)TlIg=S|w zK>W%~;jZv>FY6+AdrB=yazvZWk&zK*Mb|sAp%6|c_NA9R{{4HN7wubBd~&|iXnPfQ zUO=jjBSR$aS&vBY*|Ja}eZMxs>Gt?x#grEoeD^vzq4_B*ycplP^MQwN`!Glk1({o+ zCu#NO=Y63hu5Asct2L35#A<41&PF7S0+@rz!5A(h1sZKHEi9@@Fn8v};FdGY@~}9o zkjsiOcq4kT~-}a^|PNZnp7B2!>q|k*?fS^_D{Zi_l=+arqI#X004u4@D0EblS6ukI4p-*Vw%j8z=vMssz-W0%4h_=z{e30 ztA?q_`cED?hpYoGbz%U%CFbUtu_p>7coV27FiWOlllmPUd10IsV7lYl%)gtW92}xb z0JvIxT|-m>qjE#4_9gJjDOn(l0WontWhnAwaP1+=`17 ztbwxo?$=X#<>7sTR`ooDJesx(K<)TgPVwQ?1^N8aGRri^=MuPguT7eD;tW(IK<^>v zl%#A60OgKaFib59u8?DlFi;{X2<+qc$sv{ z5ETINLqtlqaS4qIFwK-U&ozbowq7@-q_|`ue8UA+DUV_2N0SNeAgow|^m10AN)|9z zi{@QH+jg9TAWuqt19|WzO-U|`5{tF??M0$}up7aQN>yTtsd(5#P=H4bIEG#0d02Et zs0Y7k6s{rlW211t|I|a@_~xx|JeIpf6rA-fF+T4cak7z5BPHmwj0$oyzQ6>uCy@Uj zTFV(wA|fZ;aGUeEn^BnN5%?)x<;vHCflvZQKt&!iy6j^0)I5eZsD0)th5MF55D*ON zBh=!DW#}yFVPtS+*gO^zuPZdmzr@gr`l97hi*(M7eL}5$jHX~xsOw@tjHt+ab|!9y z@DB4{kh-*HXc~u#9QE5JVZfBnF|1b0i2o4$apte2REBR2-4WvBS`Gfs)`YEFnh27(fhg zfOD$tA;Tiy4ZUTVW&pBNcWqXQ5!Z8DWP#o#}>#~VSs5jDAek8ru#+ckGaEqyK{)a z@Si>&08qMpgcIr_8)a!LbfhKaK;kY)o-BdPQg%R&hI?eiRWPLS3hyI<&2A9>n4u9a zbp!vhrZ)^s1WyZKkhuFV9k}ABKv-8oZLAlO{Yty^)NqC@^SYb_r~-@?p{WNrOe``w zW3&dLo5r}PJ~=Z>g^BmyfLU;ONI=0uw0x23g>`{Nr)bJbL+>(FsBZ}1h6t(3H1(1d za^<W;{uU` zNOB37X_O>}SbGx&lDlnN7DvZ05Da0*TO+)BJZz)=BRf8pajs6Bpj`{lUC6J zcv#FItN}SSx~vcy#)*t0crn-f<;h$8YvfTYc=xg|664FIw32Q)IDFq9>*n)gxr#e$fx z;0y*UUyY$CKxCtrO82C%#k;H_sYRm+y70?6pGzYPRb z^1w1za;DHF3uCZQ^B@GIpVpNt8s}iGSRP;%Y%d06Kdm549ZHz&GKs5gr#$X;Rz%7S zM@T&L;1ZfR;a}vt4dkPRHMH^~3MNxzD+3x-iiD+rUX{s_Hrr-7F@a}g;#AQK;p&Tr zP-kt~oAWY+V`Am3vy!TZ73oC5Un)i;5IzqWuYGv|92P7o(gq+$^&=Y+b;Uhxt7+^5c9jb-y@oI1ic 
zszw~44Gt!Tlo+hd^uz7`Fr|dNjaz|6)3=ylnSnkjD&x>%z_6HGVh;g4@B-l%8Ku_5G!%k7Ws!EO!jhtmSH6v?5{NTmu3 z=>q6C2?M=;w6Ve8h;6L+472FGaqHf`%5^6Zn6Lc7cTm zMG%vCegKz~0vdBI2t94rtJU>=ePm6`oP_$`)_>m;OVj^q)bz`otY5?8OU_1{_j|WTnsTBrH@m+~X)j z;0NoYKnN7SXfwWY zj&Z#VBD%{s7-RRkn*Dk%7?;ysL>si30qr#cNH7hBNnO|UWd`&*^uJi(A81+sZ zu+Df$WIiCfm%*}#VLr;__$D5MI0{6Am4{B+Laj+%U~YLq7}=;uzI1waa%z?VD4ni> ziu5eGVKPw=RoaB1l2RjWGX`!@0Q((d@pF(~!gzus`HuOLpQS38*8mj|!5T?p#q^{u zs}(2+yeSM+FN;~8ZVP;a(FSe=g?F?NP79r+Lb9sFGGpKP7ynId}?;evIAZj z*aeg%rl_!5TQE8y6AY}IiTVR&CX#JcRtghg(h@ZRP#{ejS7V}>`AySo+%7-1xe`ay zJUxAs8Y*oguusx|A-`aqvZ>0oySpq5$xR9q9~EUc=CX3F8dzynV26t_E(Y6bbx0JV zcbgW|Ne2tP3m!3M1NCXI#dhj}G$)7L(14LKo^Y%B*+Q&v6W9mpqmV%d+Cu~afLgSu z2y!g6I)?9p)vQPxZJA)Ts*K@5uuC_(;44KOQ;r}S?n#8?I+>R;29oKJT4qVP;Q(#U7Y{~ogaT2d1G`qPiS|4S$S=Ls}|RqFlRnj=qEHzAsRd$v?widG~6bhEDGejR16 zA%I&yqD!iHq`*3I;{eujxBP^AeO57Z^uWVP$K7mX`*==B!J4RmQEGH9oUvhEK#*D* z^Qy6IKomDnuBY$G_`|}8gO(Kjy!`d`cS`W9sh%|}|9YPzi{BfTxFK=_@9*J1 zvNh*XbO~-=P{bnvFWv&}d0=p=HkUtS2cS-F6#LG)AOB=4S$SU~8%eh*((#!5{TD4i=?!lF46_H(5(kxK))6I(V$l#C5ziU>kn5safT&-lrZ%Vw>t@@f-Y7vkzdTwRC*lBf%D z8d1S#GH+Z&lSS>|CsK$T`8z?OlV>L;V8a4b+mIPA;wTB3ds6K5*b|RH%^q4{QfFqq z+8Pp>G6#nh@!>pES~L-s3?-6qOCCzXQeonofrnaMo>8%pi#4DYlIl{ZME{7T(YwO( z0LC)i_TC&0oDg3Htij#hd6>f)!8q$)4*0XY94)vQ?Jd|X$8k&zft%+fn`JVxk|RVf zY!}s5w!YXJ00R9U_81@>W0=Z5t#hD26T94s^$U4z@)L}ggo_Q83xb&?4vtLJKYWpG zoqA<$P8Q49?S>YJf)4L9@iZt}4%@1go4T%54Qi{@sq0!l_PW-_Hz8@7H<{{84WkH@ z1ksO#C{9x&x4IlDC%hN8lb`o%f#vd z6c}wV%QzN9GIWw7f?nn&A$*KwA(#t9GNmk@&fQdcC&z8#usu+WG8LaZ8OtG2skFVH zw0T^?p)^~u*)`J;U@}l729ip8PX^z{YU3bAUzKVsAi*B4=m&=!J*4B?Gx22=?US|? 
zQ{vL;_cE<*-krHT>HQp1K#&|5ET5d8VbgM;j^yw))a7dOte?H5%K=hIU=N|E%LlZ8 zraWsFnl~65KH`1TCrXBDaWE_AcS`v#I~ltQTRk8L#bnD;jk6+3C&O(RQ{DzY^2n*X&sN+YpgZ4ZJ#cwO zP#kzVJl9KfKVG>NlWWtP{>sEOUM*B@w6pkW$XrqgQKHXc*QG8TveQErAHJ-qm)XsWRxv|1)f}pD_f+*4IeZ9}T(yTiO z@>21SWS_pSW0&g4j7n}4_SvR`TVz+di$dw{fd&P6@SW_@s@vi8?hhNSTvIz7h(7YR za`x0X-+^Wkk{W#qeEyx~=WQF6?OfD+sk=Sb)9p!OzNbvgocK5Obv5fWM{0!rr#(#P zx_@)%x(9cZ7V=bY7iOyVr?yLhO#kgN(`_XJroGzCS(x^^{nqSUe0BkUvho9ER_1nV z)h0ygGpct`t$zm;`yR8ed)?3@l z8~O8I0(}I*HEnGt6aJvggaS8K&l)I*U!NAixbE41S_E19@d|6r!5?du1qxo*8zFzl zjgVx*C(2C7jogq;4w~-umKPj-q|2^SCPsGVXMM{HrQ}bCO36Tx*y!d+R`;*e{VV!F zxw?O)?qB(d_pi`oeRinH8j|;yd~n_Dt2e>@%ryH_1U_FDfg%*v+j?^xX%f_5tUPS? z`Cjc9r+k2s`YbIT8-^~)AJ17@WaU%Cxzk^%lY+y~;fX{f(LYv5WWuTC;!m8(LCd#` z)YiKR>tj9ZV?FC*J?(b)QkQ@}zO+8pvp&|7TSIU*0`rO7mdLh!W;^O*J)g<3p7m}* zHYefKnV*>eqrZo282ZXXg!+_BAXK}3#71J8I`wYCdN(0jRO;P?`3cPRZbI%lDN+FS1?L?c^$rf+qhalM;RpAu5A&-9`18zW8I~IF^wiso7tKjGbkBj2%KUFGy$ z%txOGk3AANbDw*PF6y&u?TbUY`!Z)|qcPStM;r4`>)MA`*dtFhKzy9dxI u{bgDlH1Vw7NGa6fA5+G8zE?T7*X*YJ5&zxe>V8W61@^1&3Y9;(-2FejcC3d0 diff --git a/uima/corpus-to-json-pipeline/desc/JCoRe JSON Writer.xml b/uima/corpus-to-json-pipeline/desc/JCoRe JSON Writer.xml index 0a8af9f5..1c4baee3 100644 --- a/uima/corpus-to-json-pipeline/desc/JCoRe JSON Writer.xml +++ b/uima/corpus-to-json-pipeline/desc/JCoRe JSON Writer.xml @@ -135,7 +135,7 @@ - /data/data_corpora/SIGIR2019/pubmed-for-pm-classifier-nn + /home/faessler/tmp/baseline2018files/baseline2018resultss diff --git a/uima/corpus-to-json-pipeline/desc/XMI Database Multiplier Reader.xml b/uima/corpus-to-json-pipeline/desc/XMI Database Multiplier Reader.xml index 6ccf4338..b1fb8d9a 100644 --- a/uima/corpus-to-json-pipeline/desc/XMI Database Multiplier Reader.xml +++ b/uima/corpus-to-json-pipeline/desc/XMI Database Multiplier Reader.xml @@ -321,7 +321,7 @@ - onefile_writer + dsyngsynbaselineids2018 diff --git a/uima/preprocessing/aeDescriptions.json b/uima/preprocessing/aeDescriptions.json index 014db11b..9cfa5565 100644 --- 
a/uima/preprocessing/aeDescriptions.json +++ b/uima/preprocessing/aeDescriptions.json @@ -1 +1 @@ -[{"uri":null,"location":"de.julielab.jcore.ae.jsbd.desc.jcore-jsbd-ae-biomedical-english","category":"ae","uimaDescPath":null,"metaDescription":{"description":"UIMA Wrapper for the JCoRe Sentence Boundary Detector (jcore-jsbd-ae) with a model trained on data from both the GENIA and PennBioIE corpus as well as additional material from MedLine abstracts.","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe Sentence Annotator, Biomedical English","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-jsbd-ae-biomedical-english","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/Users/faessler/.m2/repository/de/julielab/jcore-jsbd-ae-biomedical-english/2.4.0-SNAPSHOT/jcore-jsbd-ae-biomedical-english-2.4.0-SNAPSHOT.jar","packaging":"jar"}},"name":"JCoRe Sentence Annotator","active":false},{"uri":null,"location":"de.julielab.jcore.ae.jtbd.desc.jcore-jtbd-ae-biomedical-english","category":"ae","uimaDescPath":null,"metaDescription":{"description":"UIMA Wrapper for the JCoRe Token Boundary Detector with a model trained on a special biomedical corpus which consists of data from (manually annotated) material which we took from MedLine abstracts and a modified version of PennBioIE's underlying tokenization.\n","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe Token Annotator, Biomedical English","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-jtbd-ae-biomedical-english","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/Users/faessler/.m2/repository/de/julielab/jcore-jtbd-ae-biomedical-english/2.4.0-SNAPSHOT/jcore-jtbd-ae-biomedical-english-2.4.0-SNAPSHOT.jar","packaging":"jar"}},"name":"JCoRe Token 
Annotator","active":false},{"uri":null,"location":"de.julielab.jcore.ae.acronymtagger.desc.jcore-acronym-ae","category":"ae","uimaDescPath":null,"metaDescription":{"description":"Arconym Tagger\n based on the SCHWARTZ & HEARST Algorithm:\n Ariel S. Schwartz and Marti A. Hearst: A Simple Algorithm For Identifying Abbreviation Definitions in Biomedical\n Text. In: Pacific Symposium on Biocomputing, 2003.\n ","group":"morpho syntactic","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe Acronym Tagger","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-acronym-ae","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/Users/faessler/.m2/repository/de/julielab/jcore-acronym-ae/2.4.0-SNAPSHOT/jcore-acronym-ae-2.4.0-20190122.170350-16.jar","packaging":"jar"}},"name":"JCoRe AcronymAnnotator","active":false},{"uri":null,"location":"de.julielab.jcore.ae.opennlp.postag.desc.jcore-opennlp-postag-ae-biomedical-english","category":"ae","uimaDescPath":null,"metaDescription":{"description":"This project employs the OpenNLP wrapper (jcore-opennlp-postag-ae) with a model trained on the PennBioIE corpus.","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe OpenNLP POS Tagger, Biomedical English","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-opennlp-postag-ae-biomedical-english","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/Users/faessler/.m2/repository/de/julielab/jcore-opennlp-postag-ae-biomedical-english/2.4.0-SNAPSHOT/jcore-opennlp-postag-ae-biomedical-english-2.4.0-SNAPSHOT.jar","packaging":"jar"}},"name":"JCoRe OpenNLP POS Tagger","active":false},{"uri":null,"location":"de.julielab.jcore.ae.biolemmatizer.desc.jcore-biolemmatizer-ae","category":"ae","uimaDescPath":null,"metaDescription":{"description":"Lemmatization tool for morphological analysis of biomedical literature downloaded from 
SourceForge","group":"semantic","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe BioLemmatizer","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-biolemmatizer-ae","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/Users/faessler/.m2/repository/de/julielab/jcore-biolemmatizer-ae/2.4.0-SNAPSHOT/jcore-biolemmatizer-ae-2.4.0-SNAPSHOT.jar","packaging":"jar"}},"name":"BioLemmatizer","active":false},{"uri":null,"location":"de.julielab.jcore.ae.banner.desc.jcore-banner-ae-biomedical-english","category":"ae","uimaDescPath":null,"metaDescription":{"description":"The JCoRe BANNER Gene Tagger wrapper with a model for biomedical english.","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe BANNER AE for Biomedical English","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-banner-ae-biomedical-english","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/Users/faessler/.m2/repository/de/julielab/jcore-banner-ae-biomedical-english/2.4.0-SNAPSHOT/jcore-banner-ae-biomedical-english-2.4.0-20190114.184738-3.jar","packaging":"jar"}},"name":"JCoRe BANNER AE for Biomedical English","active":false},{"uri":null,"location":"de.julielab.jcore.ae.linnaeus.desc.jcore-linnaeus-species-ae-genera-species-proxies-dicts","category":"ae","uimaDescPath":null,"metaDescription":{"description":"This project is a resource for the JCoRe Linnaeus Annotator. The dictionaries contained herein are used for the recognition of concrete species names in text, e.g. \"human\", \"mouse\", \"n. furzeri\", \"c. elegans\" etc as well as species hints, i.e. indirect clues to a species like the word \"patient\" which most likely refers to a human. Additionally to such rather clear proxies, this project also includes a small dictionary containing maximum-frequency-proxies for genus expressions like \"Drosophila\" which will be mapped to \"D. melanogaster\". 
For the task of only finding concrete species names in text, there is the project jcore-linnaeus-species-ae-species-dict.","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe Linnaeus Species AE Genera and Species Proxies Dictionaries","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-linnaeus-species-ae-genera-species-proxies-dicts","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/Users/faessler/.m2/repository/de/julielab/jcore-linnaeus-species-ae-genera-species-proxies-dicts/2.4.0-SNAPSHOT/jcore-linnaeus-species-ae-genera-species-proxies-dicts-2.4.0-20190114.184655-3.jar","packaging":"jar"}},"name":"JCoRe LINNEAUS Species AE with Genera and Proxies","active":false},{"uri":null,"location":"de.julielab.jcore.ae.mutationfinder.desc.jcore-mutationfinder-ae","category":"ae","uimaDescPath":null,"metaDescription":{"description":"An analysis engine to recognize mentions of gene point mutations in document text. This is a wrapper\n around the original MutationFinder (http://mutationfinder.sourceforge.net/), published in the following paper:\n MutationFinder: A high-performance system for extracting point mutation mentions from text\n J. Gregory Caporaso, William A. Baumgartner Jr., David A. Randolph, K. 
Bretonnel Cohen, and Lawrence Hunter;\n Bioinformatics, 2007 23(14):1862-1865; doi:10.1093/bioinformatics/btm235;\n ","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe Mutation Finder AE","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-mutationfinder-ae","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/Users/faessler/.m2/repository/de/julielab/jcore-mutationfinder-ae/2.4.0-SNAPSHOT/jcore-mutationfinder-ae-2.4.0-20190122.170238-16.jar","packaging":"jar"}},"name":"JCoRe Mutation Annotator","active":false},{"uri":null,"location":"de.julielab.jcore.ae.lingscope.desc.jcore-lingscope-negation-ae","category":"ae","uimaDescPath":null,"metaDescription":{"description":"This component uses Lingscope with the baseline negation cue tagger and a CRF scope detector.\n ","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe Lingscope Negation AE","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-lingscope-negation-ae","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/Users/faessler/.m2/repository/de/julielab/jcore-lingscope-negation-ae/2.4.0-SNAPSHOT/jcore-lingscope-negation-ae-2.4.0-20190114.184635-2.jar","packaging":"jar"}},"name":"JCoRe Lingscope Negation AE","active":false},{"uri":null,"location":"de.julielab.jcore.ae.pmclassifier.desc.jcore-pmclassifier-ae","category":"ae","uimaDescPath":null,"metaDescription":{"description":"An AE that is a wrapper around the Precision Medicine Classifier delivered by the HPI coded in Python\n using auto-sklearn.\n ","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe Precision Medicine 
Classifier","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-pmclassifier-ae","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/Users/faessler/.m2/repository/de/julielab/jcore-pmclassifier-ae/2.4.0-SNAPSHOT/jcore-pmclassifier-ae-2.4.0-SNAPSHOT.jar","packaging":"jar"}},"name":"PM Classifier 2017","active":true},{"uri":null,"location":"de.julielab.jcore.ae.pmclassifier.desc.jcore-pmclassifier-ae","category":"ae","uimaDescPath":null,"metaDescription":{"description":"An AE that is a wrapper around the Precision Medicine Classifier delivered by the HPI coded in Python\n using auto-sklearn.\n ","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe Precision Medicine Classifier","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-pmclassifier-ae","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/Users/faessler/.m2/repository/de/julielab/jcore-pmclassifier-ae/2.4.0-SNAPSHOT/jcore-pmclassifier-ae-2.4.0-SNAPSHOT.jar","packaging":"jar"}},"name":"PM Classifier 2018","active":true}] \ No newline at end of file +[{"uri":null,"location":"de.julielab.jcore.ae.jsbd.desc.jcore-jsbd-ae-biomedical-english","category":"ae","uimaDescPath":null,"metaDescription":{"description":"UIMA Wrapper for the JCoRe Sentence Boundary Detector (jcore-jsbd-ae) with a model trained on data from both the GENIA and PennBioIE corpus as well as additional material from MedLine abstracts.","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe Sentence Annotator, Biomedical English","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-jsbd-ae-biomedical-english","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/home/faessler/.m2/repository/de/julielab/jcore-jsbd-ae-biomedical-english/2.4.0-SNAPSHOT/jcore-jsbd-ae-biomedical-english-2.4.0-SNAPSHOT.jar","packaging":"jar"}},"name":"JCoRe 
Sentence Annotator","active":true},{"uri":null,"location":"de.julielab.jcore.ae.jtbd.desc.jcore-jtbd-ae-biomedical-english","category":"ae","uimaDescPath":null,"metaDescription":{"description":"UIMA Wrapper for the JCoRe Token Boundary Detector with a model trained on a special biomedical corpus which consists of data from (manually annotated) material which we took from MedLine abstracts and a modified version of PennBioIE's underlying tokenization.\n","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe Token Annotator, Biomedical English","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-jtbd-ae-biomedical-english","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/home/faessler/.m2/repository/de/julielab/jcore-jtbd-ae-biomedical-english/2.4.0-SNAPSHOT/jcore-jtbd-ae-biomedical-english-2.4.0-SNAPSHOT.jar","packaging":"jar"}},"name":"JCoRe Token Annotator","active":true},{"uri":null,"location":"de.julielab.jcore.ae.acronymtagger.desc.jcore-acronym-ae","category":"ae","uimaDescPath":null,"metaDescription":{"description":"Arconym Tagger\n based on the SCHWARTZ & HEARST Algorithm:\n Ariel S. Schwartz and Marti A. Hearst: A Simple Algorithm For Identifying Abbreviation Definitions in Biomedical\n Text. 
In: Pacific Symposium on Biocomputing, 2003.\n ","group":"morpho syntactic","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe Acronym Tagger","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-acronym-ae","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/home/faessler/.m2/repository/de/julielab/jcore-acronym-ae/2.4.0-SNAPSHOT/jcore-acronym-ae-2.4.0-20190127.092613-36.jar","packaging":"jar"}},"name":"JCoRe AcronymAnnotator","active":false},{"uri":null,"location":"de.julielab.jcore.ae.opennlp.postag.desc.jcore-opennlp-postag-ae-biomedical-english","category":"ae","uimaDescPath":null,"metaDescription":{"description":"This project employs the OpenNLP wrapper (jcore-opennlp-postag-ae) with a model trained on the PennBioIE corpus.","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe OpenNLP POS Tagger, Biomedical English","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-opennlp-postag-ae-biomedical-english","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/home/faessler/.m2/repository/de/julielab/jcore-opennlp-postag-ae-biomedical-english/2.4.0-SNAPSHOT/jcore-opennlp-postag-ae-biomedical-english-2.4.0-SNAPSHOT.jar","packaging":"jar"}},"name":"JCoRe OpenNLP POS Tagger","active":true},{"uri":null,"location":"de.julielab.jcore.ae.biolemmatizer.desc.jcore-biolemmatizer-ae","category":"ae","uimaDescPath":null,"metaDescription":{"description":"Lemmatization tool for morphological analysis of biomedical literature downloaded from SourceForge","group":"semantic","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe 
BioLemmatizer","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-biolemmatizer-ae","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/home/faessler/.m2/repository/de/julielab/jcore-biolemmatizer-ae/2.4.0-SNAPSHOT/jcore-biolemmatizer-ae-2.4.0-SNAPSHOT.jar","packaging":"jar"}},"name":"BioLemmatizer","active":true},{"uri":null,"location":"de.julielab.jcore.ae.banner.desc.jcore-banner-ae-biomedical-english","category":"ae","uimaDescPath":null,"metaDescription":{"description":"The JCoRe BANNER Gene Tagger wrapper with a model for biomedical english.","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe BANNER AE for Biomedical English","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-banner-ae-biomedical-english","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/home/faessler/.m2/repository/de/julielab/jcore-banner-ae-biomedical-english/2.4.0-SNAPSHOT/jcore-banner-ae-biomedical-english-2.4.0-20190124.152530-4.jar","packaging":"jar"}},"name":"JCoRe BANNER AE for Biomedical English","active":false},{"uri":null,"location":"de.julielab.jcore.ae.linnaeus.desc.jcore-linnaeus-species-ae-genera-species-proxies-dicts","category":"ae","uimaDescPath":null,"metaDescription":{"description":"This project is a resource for the JCoRe Linnaeus Annotator. The dictionaries contained herein are used for the recognition of concrete species names in text, e.g. \"human\", \"mouse\", \"n. furzeri\", \"c. elegans\" etc as well as species hints, i.e. indirect clues to a species like the word \"patient\" which most likely refers to a human. Additionally to such rather clear proxies, this project also includes a small dictionary containing maximum-frequency-proxies for genus expressions like \"Drosophila\" which will be mapped to \"D. melanogaster\". 
For the task of only finding concrete species names in text, there is the project jcore-linnaeus-species-ae-species-dict.","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe Linnaeus Species AE Genera and Species Proxies Dictionaries","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-linnaeus-species-ae-genera-species-proxies-dicts","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/home/faessler/.m2/repository/de/julielab/jcore-linnaeus-species-ae-genera-species-proxies-dicts/2.4.0-SNAPSHOT/jcore-linnaeus-species-ae-genera-species-proxies-dicts-2.4.0-20190124.152429-4.jar","packaging":"jar"}},"name":"JCoRe LINNEAUS Species AE with Genera and Proxies","active":false},{"uri":null,"location":"de.julielab.jcore.ae.mutationfinder.desc.jcore-mutationfinder-ae","category":"ae","uimaDescPath":null,"metaDescription":{"description":"An analysis engine to recognize mentions of gene point mutations in document text. This is a wrapper\n around the original MutationFinder (http://mutationfinder.sourceforge.net/), published in the following paper:\n MutationFinder: A high-performance system for extracting point mutation mentions from text\n J. Gregory Caporaso, William A. Baumgartner Jr., David A. Randolph, K. 
Bretonnel Cohen, and Lawrence Hunter;\n Bioinformatics, 2007 23(14):1862-1865; doi:10.1093/bioinformatics/btm235;\n ","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe Mutation Finder AE","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-mutationfinder-ae","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/home/faessler/.m2/repository/de/julielab/jcore-mutationfinder-ae/2.4.0-SNAPSHOT/jcore-mutationfinder-ae-2.4.0-20190127.092504-36.jar","packaging":"jar"}},"name":"JCoRe Mutation Annotator","active":false},{"uri":null,"location":"de.julielab.jcore.ae.lingscope.desc.jcore-lingscope-negation-ae","category":"ae","uimaDescPath":null,"metaDescription":{"description":"This component uses Lingscope with the baseline negation cue tagger and a CRF scope detector.\n ","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe Lingscope Negation AE","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-lingscope-negation-ae","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/home/faessler/.m2/repository/de/julielab/jcore-lingscope-negation-ae/2.4.0-SNAPSHOT/jcore-lingscope-negation-ae-2.4.0-20190124.152409-3.jar","packaging":"jar"}},"name":"JCoRe Lingscope Negation AE","active":true},{"uri":null,"location":"de.julielab.jcore.ae.pmclassifier.desc.jcore-pmclassifier-ae","category":"ae","uimaDescPath":null,"metaDescription":{"description":"An AE that is a wrapper around the Precision Medicine Classifier delivered by the HPI coded in Python\n using auto-sklearn.\n ","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe Precision Medicine 
Classifier","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-pmclassifier-ae","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/home/faessler/.m2/repository/de/julielab/jcore-pmclassifier-ae/2.4.0-SNAPSHOT/jcore-pmclassifier-ae-2.4.0-SNAPSHOT.jar","packaging":"jar"}},"name":"PM Classifier 2017","active":false},{"uri":null,"location":"de.julielab.jcore.ae.pmclassifier.desc.jcore-pmclassifier-ae","category":"ae","uimaDescPath":null,"metaDescription":{"description":"An AE that is a wrapper around the Precision Medicine Classifier delivered by the HPI coded in Python\n using auto-sklearn.\n ","group":"general","module":null,"base":null,"exposable":true,"categories":["ae"],"name":"JCoRe Precision Medicine Classifier","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-pmclassifier-ae","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/home/faessler/.m2/repository/de/julielab/jcore-pmclassifier-ae/2.4.0-SNAPSHOT/jcore-pmclassifier-ae-2.4.0-SNAPSHOT.jar","packaging":"jar"}},"name":"PM Classifier 2018","active":false}] \ No newline at end of file diff --git a/uima/preprocessing/ccDescriptions.json b/uima/preprocessing/ccDescriptions.json index 08e52657..ff1a535b 100644 --- a/uima/preprocessing/ccDescriptions.json +++ b/uima/preprocessing/ccDescriptions.json @@ -1 +1 @@ -[{"uri":"file:/Users/faessler/Coding/git/ourtrecpm/uima/preprocessing/./desc/JCoRe%20XMI%20Database%20Writer.xml","location":"de.julielab.jcore.consumer.xmi.desc.jcore-xmi-db-writer","category":"consumer","uimaDescPath":"JCoRe XMI Database Writer.xml","metaDescription":{"description":"Writes CAS data in XMI format into a relational (PostgreSQL) database. It is possible to write the\n whole XMI for each document into the database. However, it is also possible to define a list of annotations that\n should be written into separate tables. 
The JCoRe XMI Database Reader can then be used to assemble XMI data from\n a given set of annotations.\n ","group":"general","module":null,"base":null,"exposable":true,"categories":["consumer"],"name":"JCoRe XMI Database Writer","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-xmi-db-writer","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/Users/faessler/.m2/repository/de/julielab/jcore-xmi-db-writer/2.4.0-SNAPSHOT/jcore-xmi-db-writer-2.4.0-20190122.170212-16.jar","packaging":"jar"}},"name":"JCoRe XMI Database Writer","active":true}] \ No newline at end of file +[{"uri":"file:/home/faessler/Coding/git/ourtrecpm/uima/preprocessing/./desc/JCoRe%20XMI%20DB%20Consumer.xml","location":"de.julielab.jcore.consumer.xmi.desc.jcore-xmi-db-writer","category":"consumer","uimaDescPath":"JCoRe XMI DB Consumer.xml","metaDescription":{"description":"Writes CAS data in XMI format into a relational (PostgreSQL) database. It is possible to write the\n whole XMI for each document into the database. However, it is also possible to define a list of annotations that\n should be written into separate tables. 
The JCoRe XMI Database Reader can then be used to assemble XMI data from\n a given set of annotations.\n ","group":"general","module":null,"base":null,"exposable":true,"categories":["consumer"],"name":"JCoRe XMI Database Writer","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-xmi-db-writer","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/home/faessler/.m2/repository/de/julielab/jcore-xmi-db-writer/2.4.0-SNAPSHOT/jcore-xmi-db-writer-2.4.0-SNAPSHOT.jar","packaging":"jar"}},"name":"JCoRe XMI DB Consumer","active":true},{"uri":"file:/home/faessler/Coding/git/ourtrecpm/uima/preprocessing/./desc/JCoRe%20Database%20Checkpoint%20Consumer.xml","location":"de.julielab.jcore.ae.checkpoint.desc.jcore-db-checkpoint-consumer","category":"consumer","uimaDescPath":"JCoRe Database Checkpoint Consumer.xml","metaDescription":{"description":"This is a JeDIS component. It can be used to set the 'last component' column in a subset table. This\n help to keep track of the pipeline status.\n ","group":"general","module":{"type":"GitHubRepository","name":"jcore-projects","version":"v2.4","updateable":true,"gitHubName":"JULIELab"},"base":null,"exposable":true,"categories":["consumer","ae"],"name":"JCoRe Database Checkpoint AE","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-db-checkpoint-ae","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/home/faessler/.m2/repository/de/julielab/jcore-db-checkpoint-ae/2.4.0-SNAPSHOT/jcore-db-checkpoint-ae-2.4.0-SNAPSHOT.jar","packaging":"jar"}},"name":"JCoRe Database Checkpoint Consumer","active":true}] \ No newline at end of file diff --git a/uima/preprocessing/cmDescriptions.json b/uima/preprocessing/cmDescriptions.json index bc1f74cb..6e4dfac2 100644 --- a/uima/preprocessing/cmDescriptions.json +++ b/uima/preprocessing/cmDescriptions.json @@ -1 +1 @@ 
-[{"uri":null,"location":"de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-multiplier","category":"multiplier","uimaDescPath":"JCoRe Abstract Database Multiplier.xml","metaDescription":{"description":"Reads CAS XMI data from a relational database (Postgres). Thus, the stored CASes may then be processed\n further.\n ","group":"general","module":null,"base":null,"exposable":true,"categories":["multiplier","reader"],"name":"JCoRe XMI Database Reader","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-xmi-db-reader","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/Users/faessler/.m2/repository/de/julielab/jcore-xmi-db-reader/2.4.0-SNAPSHOT/jcore-xmi-db-reader-2.4.0-20190122.170222-16.jar","packaging":"jar"}},"name":"JCoRe Abstract Database Multiplier","active":true}] \ No newline at end of file +[{"uri":null,"location":"de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-multiplier","category":"multiplier","uimaDescPath":"JCoRe Abstract Database Multiplier.xml","metaDescription":{"description":"Reads CAS XMI data from a relational database (Postgres). 
Thus, the stored CASes may then be processed\n further.\n ","group":"general","module":null,"base":null,"exposable":true,"categories":["multiplier","reader"],"name":"JCoRe XMI Database Reader","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-xmi-db-reader","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/home/faessler/.m2/repository/de/julielab/jcore-xmi-db-reader/2.4.0-SNAPSHOT/jcore-xmi-db-reader-2.4.0-SNAPSHOT.jar","packaging":"jar"}},"name":"JCoRe Abstract Database Multiplier","active":true}] \ No newline at end of file diff --git a/uima/preprocessing/crDescriptions.json b/uima/preprocessing/crDescriptions.json index 6a4d2f7a..c53b03a9 100644 --- a/uima/preprocessing/crDescriptions.json +++ b/uima/preprocessing/crDescriptions.json @@ -1 +1 @@ -{"uri":null,"location":"de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-multiplier-reader","category":"reader","uimaDescPath":"XMI Database Multiplier Reader.xml","metaDescription":{"description":"Reads CAS XMI data from a relational database (Postgres). Thus, the stored CASes may then be processed\n further.\n ","group":"general","module":null,"base":null,"exposable":true,"categories":["multiplier","reader"],"name":"JCoRe XMI Database Reader","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-xmi-db-reader","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/Users/faessler/.m2/repository/de/julielab/jcore-xmi-db-reader/2.4.0-SNAPSHOT/jcore-xmi-db-reader-2.4.0-20190122.170222-16.jar","packaging":"jar"}},"name":"XMI Database Multiplier Reader","active":true} \ No newline at end of file +{"uri":null,"location":"de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-multiplier-reader","category":"reader","uimaDescPath":"XMI Database Multiplier Reader.xml","metaDescription":{"description":"Reads CAS XMI data from a relational database (Postgres). 
Thus, the stored CASes may then be processed\n further.\n ","group":"general","module":null,"base":null,"exposable":true,"categories":["multiplier","reader"],"name":"JCoRe XMI Database Reader","pear":false,"base-project":null,"maven-artifact":{"groupId":"de.julielab","artifactId":"jcore-xmi-db-reader","version":"2.4.0-SNAPSHOT","classifier":null,"file":"/home/faessler/.m2/repository/de/julielab/jcore-xmi-db-reader/2.4.0-SNAPSHOT/jcore-xmi-db-reader-2.4.0-SNAPSHOT.jar","packaging":"jar"}},"name":"XMI Database Multiplier Reader","active":true} \ No newline at end of file diff --git a/uima/preprocessing/desc/CPE.xml b/uima/preprocessing/desc/CPE.xml index eb830406..7172609d 100644 --- a/uima/preprocessing/desc/CPE.xml +++ b/uima/preprocessing/desc/CPE.xml @@ -1,30 +1,55 @@ + + + + + + + - - - - + + + + + + + + - - - - - - - + + + + + + + + + + + + + + + + -1 + immediate + + + + diff --git a/uima/preprocessing/desc/JCoRe Abstract Database Multiplier.xml b/uima/preprocessing/desc/JCoRe Abstract Database Multiplier.xml index 5e01e68e..0ff9a93e 100644 --- a/uima/preprocessing/desc/JCoRe Abstract Database Multiplier.xml +++ b/uima/preprocessing/desc/JCoRe Abstract Database Multiplier.xml @@ -1,27 +1,52 @@ + org.apache.uima.java + true + de.julielab.jcore.reader.xmi.XmiDBMultiplier + + JCoRe Abstract Database Multiplier + A multiplier that receives document IDs to read from a database table from the DBMultiplierReader. The reader also delivers the path to the corpus storage system (CoStoSys) configuration and additional tables for joining with the main data table. This multiplier class is abstract and cannot be used directly.Extending classes must implement the next() method to actually read documents from the database and populate CASes with them. This component is a part of the Jena Document Information System, JeDIS. 
+ JULIE Lab Jena, Germany + JULIE Lab Jena, Germany + + + + + + + + + + + + true + true + true + + + diff --git a/uima/preprocessing/desc/JCoRe XMI Database Writer.xml b/uima/preprocessing/desc/JCoRe XMI Database Writer.xml deleted file mode 100644 index b7cf6f9a..00000000 --- a/uima/preprocessing/desc/JCoRe XMI Database Writer.xml +++ /dev/null @@ -1,182 +0,0 @@ - - - org.apache.uima.java - true - de.julielab.jcore.consumer.xmi.XMIDBWriter - - JCoRe XMI Database Writer - This component is capable of storing the standard UIMA serialization of documents in one or even multiple database tables. The UIMA serialization format is XMI, an XML format that expressed an annotation graph. This component either stores the whole annotation graph in XMI format in a database row, together with the document ID. Alternatively, it makes use of the jcore-xmi-splitter to segment the annotation graph with respect to a user specified list of annotation types. Then, the XMI data of each annotation type is extracted from the document XMI data and stored in a separate table. The tables are created automatically according to the primary key of the active table schema in the Corpus Storage System (CoStoSys) configuration file that is also given as a parameter. The jcore-xmi-db-reader is capable of reading this kind of distributed annotation graph and reassemble a valid XMI document which then cas be deserialized into a CAS. This component is part of the Jena Document Information System, JeDIS. - JULIE Lab Jena, Germany - - - UpdateMode - If set to false, the attempt to write new data into an XMI document or annotation table that already has data for the respective document, will result in an error. If set to true, there will first occur a check if there already is XMI data for the currently written document and, if so, the contents will be updated. It is important to keep in mind that the update also includes empty data. 
That is, if an annotation type is specified in 'AnnotationsToStore' for which the current does not have data, possibly existing data will just be deleted. - Boolean - false - true - - - DeleteObsoleteAnnotations - Only in effect if 'StoreBaseDocument' is set to 'true'. Then, already existing annotation tables are retrieved from an internal database table the is specifically maintained to list existing annotation tables. When storing the base document, the annotations in these tables are removed for the document if this parameter is set to 'true', except tables specified in 'AnnotationsToStore'. The idea is that when storing the base document, all existing annotations become obsolete since they refer to a base document that no longer exists. - Boolean - false - false - - - PerformGZIP - Determines if the XMI data should be stored compressed or uncompressed. Without compression, the data will be directly viewable in a database browser, whereas compressed data appears as opaque byte sequence. Compression is supposed to reduce traffic over the network and save storage space on the database server. - Boolean - false - true - - - IncreasedAttributeSize - Integer that defines the maximum attribute size for the XMIs. Standard (parser wise) is 65536 * 8. It may be necessary to rise this value for larger documents since the document text is stored as an attribute of an XMI element. - Integer - false - false - - - StoreEntireXmiData - Boolean parameter indicating if the whole document should be stored as one large XMI data block. In this case there must not be any annotations specified for selection and the 'StoreBaseDocument' parameter will have no effect. - Boolean - false - true - - - DocumentTable - String parameter indicating the name of the table where the XMI data will be stored (if StoreEntireXmiData is true) or where the base document will be stored (if StoreBaseDocument is true). If the name is schema qualified, i.e. 
contains a dot, the table name will be used as provided. If no schema is qualified, the active data postgres schema as configured in the CoStoSys configuration will be used to find or create the table. - String - false - true - - - StoreRecursively - Only in effect when storing annotations separately from the base document. If set to true, annotations that are referenced by other annotations, i.e. are (direct or indirect) features of other annotations, they will be stored in the same table as the referencing annotation. For example, POS tags may be store together with tokens this way. If, however, a referenced annotation type is itself to be stored, it will be segmented away and stored in its own table. - Boolean - false - true - - - StoreBaseDocument - Boolean parameter indicating if the base document should be stored as well when annotations are specified for selection. The base document is the part of the XMI file that includes the document text. If you want to store annotations right with the base document, specify those in the 'BaseDocumentAnnotationTypes' parameter. - Boolean - false - true - - - BaseDocumentAnnotationTypes - Array parameter that takes Java annotation type names. These names will be stored with the base document, if the 'StoreBaseDocument' parameter is set to true. - String - true - false - - - AnnotationStoragePostgresSchema - This optional parameter specifies the Postgres schema in which the XMI annotation storage tables are located by default. If omitted, the active data schema from the CoStoSys configuration is used. The tables derived from the annotation types specified with the 'AnnotationsToStore' parameter will be stored in this postgres schema. The default can be overwritten for individual types. See the description of the 'AnnotationsToStore' parameter. - String - false - false - - - ComponentDbName - Subset tables store the name of the last component that has sent data for a document. 
This parameter allows to specify a custom name for each CAS DB Consumer. Defaults to the implementation class name. - String - false - true - - - CostosysConfigFile - File path or classpath resource location of a Corpus Storage System (CoStoSys) configuration file. This file specifies the database to write the XMI data into and the data table schema. This schema must at least define the primary key columns that the storage tables should have for each document. The primary key is currently just the document ID. Thus, at the moment, primary keys can only consist of a single element when using this component. This is a shortcoming of this specific component and must be changed here, if necessary. - String - false - true - - - AnnotationsToStore - An array of qualified UIMA type names, for instance de.julielab.jcore.types.Sentence. Annotations of those types are segmented away from the serialized document annotation graph in XMI format for storage in separate tables. When the 'StoreRecursively' parameter is set to true, annotations are stored together with referenced annotations, if those are not specified in the list of additional tables themselves. The table names are directly derived from the annotation type names by converting dots to underlines and adding a postgres schema qualification according to the active data postgres schema defined in the CoStoSys configuration. If an annotation table should be stored or looked up in another postgres schema, prepend the type name with the string 'q:' and the schema name, e.g. 'q:myschema.de.julielab.jcore.types.Token. 
- String - true - false - - - - - DeleteObsoleteAnnotations - - false - - - - ComponentDbName - - XMIDBWriter - - - - AnnotationsToStore - - - de.julielab.jcore.types.AutoDescriptor - - - - - PerformGZIP - - false - - - - StoreBaseDocument - - false - - - - StoreEntireXmiData - - false - - - - StoreRecursively - - true - - - - UpdateMode - - true - - - - CostosysConfigFile - - ../costosys.xml - - - - DocumentTable - - _data_xmi.documents - - - - - - - - - - - - - - true - true - false - - - diff --git a/uima/preprocessing/desc/PM Classifier 2017.xml b/uima/preprocessing/desc/PM Classifier 2017.xml deleted file mode 100644 index dbda7854..00000000 --- a/uima/preprocessing/desc/PM Classifier 2017.xml +++ /dev/null @@ -1,1213 +0,0 @@ - - - org.apache.uima.java - true - de.julielab.jcore.ae.pmclassifier.PMClassificationAnnotator - - PM Classifier 2017 - Descriptor automatically generated by uimaFIT - unknown - de.julielab.jcore.ae.pmclassifier - - - PmModel - - String - false - true - - - - - PmModel - - malletPmClassifier2017.mod.gz - - - - - - - de.julielab.jcore.types.pubmed.Header - The special Header for PubMed (http://www.pubmed.org) - documents - de.julielab.jcore.types.Header - - - citationStatus - Indicates the status of citation of a PubMed document - de.julielab.jcore.types.CitationStatus - - - otherIDs - Other IDs (then the PubMed ID) may delivered by partners of the NLM to PubMed/Medline - abstracts. If available, this feature discloses such other IDs and the respective sources. 
- uima.cas.FSArray - de.julielab.jcore.types.pubmed.OtherID - true - - - - - de.julielab.jcore.types.pubmed.ManualDescriptor - The special type for PubMed documents - de.julielab.jcore.types.ManualDescriptor - - - meSHList - A collection of objects of type - uima.julielab.uima.MeSHHeading - uima.cas.FSArray - de.julielab.jcore.types.MeshHeading - true - - - chemicalList - A collection of objects of type - uima.julielab.uima.Chemical - uima.cas.FSArray - de.julielab.jcore.types.Chemical - true - - - dBInfoList - A collection of objects of type - uima.julielab.uima.DBInfo - uima.cas.FSArray - de.julielab.jcore.types.DBInfo - true - - - keywordList - A collection of objects of type - uima.julielab.uima.Keyword - uima.cas.FSArray - de.julielab.jcore.types.Keyword - true - - - geneSymbolList - GeneSymbolList in PubMed - uima.cas.StringArray - - - - - de.julielab.jcore.types.CitationStatus - A citation status (PubMed relevant) - uima.cas.String - - - Completed - completed - - - In-Process - in-process - - - PubMed-not-MEDLINE - pubmed-not-medline - - - In-Data-Review - in-data-review - - - Publisher - publisher - - - MEDLINE - medline - - - OLDMEDLINE - oldmedline - - - - - de.julielab.jcore.types.pubmed.OtherID - PubMed/Medline abstracts sometimes have other IDs besides their PMID from different sources. - This type discloses the respective ID and source. For details see - https://www.nlm.nih.gov/bsd/mms/medlineelements.html#oid - de.julielab.jcore.types.Annotation - - - id - The "other" ID of the document (e.g. Pubmed Central). - uima.cas.String - - - source - The source that assigned the ID found at the 'id' feature to this document. - uima.cas.String - - - - - de.julielab.jcore.types.DocumentAnnotation - The super-type for the document annotation types (Header, Descriptor) - de.julielab.jcore.types.Annotation - - - de.julielab.jcore.types.Header - The Header type stores the bibliographical document information. 
- de.julielab.jcore.types.DocumentAnnotation - - - docType - The type of the document (e.g. abstract, fulltext, randomized). - de.julielab.jcore.types.DocType - - - source - The source of the document (e.g. WWW, database, etc). - uima.cas.String - - - docId - The identifier of the document with respect to its source. E.g.: PMID in PubMed. In combination with the source, this is a unique identifier for a document. - uima.cas.String - - - copyright - Copyright information. - uima.cas.String - - - truncated - Indicates whether the document is truncated. - uima.cas.Boolean - - - authors - The authors of the document. - uima.cas.FSArray - de.julielab.jcore.types.AuthorInfo - true - - - title - The title of the document. - uima.cas.String - - - pubTypeList - The list of the publication types. - uima.cas.FSArray - de.julielab.jcore.types.PubType - - - language - The language of the document. - de.julielab.jcore.types.Language - - - doi - document object identifier - uima.cas.String - - - - - de.julielab.jcore.types.PubType - An abstract type which should be used to store information on the publication. See subtypes Journal and an accumulative type (OtherPub) - de.julielab.jcore.types.Annotation - - - name - The name of the publication type (e.g. journal, technical report, book). - uima.cas.String - - - pubDate - The date on which the document was published. - de.julielab.jcore.types.Date - - - - - de.julielab.jcore.types.Journal - This type contains attributes to describe a journal publication. - de.julielab.jcore.types.PubType - - - ISSN - The international standard serial number. - uima.cas.String - - - volume - The volume number of the journal in which the article was published. - uima.cas.String - - - title - Full journal title. - uima.cas.String - - - impactFactor - The impact factor of the journal at the time of publication. - uima.cas.String - - - shortTitle - the short title of the Journal (e.g. 
"Nicotine Tob Res" for "Nicotine & tobacco research : official journal of the Society for Research on Nicotine and Tobacco") - uima.cas.String - - - issue - Issue of Journal - uima.cas.String - - - pages - Pages of Journal - uima.cas.String - - - nlmId - the nlm id of the journal - uima.cas.String - - - - - de.julielab.jcore.types.Date - Type to store dates - de.julielab.jcore.types.Annotation - - - day - day of the month. - uima.cas.Integer - - - month - month of the year. - uima.cas.Integer - - - year - full year (e.g. 2006 and NOT 06). - uima.cas.Integer - - - - - de.julielab.jcore.types.OtherPub - Accumulative type for all other types of publications - de.julielab.jcore.types.PubType - - - de.julielab.jcore.types.Descriptor - An abstract type for the document descriptors, there are two subclasses: one for automatically acquired meta information and one for meta information that was added manually (such as e.g. MeSH term lists) - de.julielab.jcore.types.Annotation - - - de.julielab.jcore.types.AutoDescriptor - The descriptor type for automatically (i.e. algorithmically) acquired meta information. It can be refined and extended. - de.julielab.jcore.types.Descriptor - - - documentClasses - Here the document classification result will be stored, for example for storing infos if the document is age-related or not. - uima.cas.FSArray - de.julielab.jcore.types.DocumentClass - - - documentTopics - A list of document topics derived from a topic model. - uima.cas.FSArray - de.julielab.jcore.types.DocumentTopics - - - - - de.julielab.jcore.types.ManualDescriptor - The descriptor type for the manually added information. - de.julielab.jcore.types.Descriptor - - - metaInfo - meta information about thos document, for example who is the annotator and which semantic types are annotated in this document - uima.cas.String - - - - - de.julielab.jcore.types.MeshHeading - Medical Subject Headings, see NLM's MeSH for a detailed description. 
- de.julielab.jcore.types.Annotation - - - descriptorName - see MeSH - uima.cas.String - - - qualifierName - see MeSH - uima.cas.String - - - descriptorNameMajorTopic - see MeSH - uima.cas.Boolean - - - qualifierNameMajorTopic - see MeSH - uima.cas.Boolean - - - - - de.julielab.jcore.types.Chemical - A chemical type - de.julielab.jcore.types.Annotation - - - registryNumber - A unique 5 to 9 digit number in hyphenated format assigned by the Chemical Abstract Service to specify chemical substances, a zero is a valid number when an actual number cannot be located or is not yet available.. - uima.cas.String - - - nameOfSubstance - The name of the substance that the registry number or the E.C. number identifies. - uima.cas.String - - - - - de.julielab.jcore.types.DBInfo - References to other databases, e.g. SwissProt, INTERPRO e.g. - de.julielab.jcore.types.Annotation - - - name - The name of the DB referred to - uima.cas.String - - - acList - A list of accession numbers for this DB. - uima.cas.StringArray - - - - - de.julielab.jcore.types.Keyword - A term of a controlled keyword list to describe the content of the publication. - de.julielab.jcore.types.Annotation - - - name - The name of the keyword. - uima.cas.String - - - source - The keyword source (terminology). - uima.cas.String - - - - - de.julielab.jcore.types.DocType - The document type - uima.cas.String - - - abstract - abstract - - - fulltext - fulltext - - - randomized - randomized - - - article - used for Wikipedia articles - - - disambiguation-page - used for Wikipedia disambiguation-pages - - - - - de.julielab.jcore.types.Language - The language of the document. Possible values: based on ISO-639-1 language codes; we start with a first selection (which might be extended later). 
- uima.cas.String - - - de - German - - - en - English - - - es - Espagnol - - - fr - French - - - it - Italian - - - pt - Portuguese - - - other - Other languages - - - eng - English - - - ger - German - - - fre - Franz̦sisch - - - ita - Italienisch - - - - - de.julielab.jcore.types.DocumentClass - A document class specification for the CAS' document text. - uima.tcas.Annotation - - - classname - The name of the document class this CAS has been classified to. - uima.cas.String - - - confidence - Confidence value of the classification into this class. - uima.cas.Double - - - - - de.julielab.jcore.types.AuthorInfo - AuthorInfo Type annotates the text segments containing the information about an author and his/her affiliation information. - de.julielab.jcore.types.Annotation - - - foreName - An author̢۪s forename,C - uima.cas.String - - - affiliation - Affiliation of the author. - uima.cas.String - - - contact - Contact information (emails, phones, etc.). - uima.cas.String - - - lastName - The last name of the author. 
- uima.cas.String - - - initials - Initials - uima.cas.String - - - - - de.julielab.jcore.types.RelatedArticle - - de.julielab.jcore.types.Annotation - - - relatedArticle - - uima.cas.String - - - - - de.julielab.jcore.types.RelatedArticleList - - de.julielab.jcore.types.Annotation - - - relatedArticles - - uima.cas.FSArray - de.julielab.jcore.types.RelatedArticle - true - - - - - de.julielab.jcore.types.FullTextLink - - de.julielab.jcore.types.Annotation - - - url - - uima.cas.String - - - iconUrl - - uima.cas.String - - - - - de.julielab.jcore.types.FullTextLinkList - - de.julielab.jcore.types.Annotation - - - fullTextLinks - - uima.cas.FSArray - de.julielab.jcore.types.FullTextLink - true - - - - - de.julielab.jcore.types.DocumentTopics - Topics label documents with vectors weights for their semantically most prominent words - de.julielab.jcore.types.DocumentAnnotation - - - Weights - Vector of weights denoting the semantical descriptivity of one word for the respective topic - uima.cas.DoubleArray - - - IDs - IDs for topics determined by the modeling implementation - uima.cas.IntegerArray - - - ModelID - ID identifying the model holding the topics to be labeled - uima.cas.String - - - TopicWords - The top words for the respective topic - uima.cas.StringArray - - - ModelVersion - Version of the model holding the topics - uima.cas.String - - - - - de.julielab.jcore.types.Annotation - The super-type for all types. - uima.tcas.Annotation - - - confidence - The component that made the annotation may put its confidence/score calculated internally here, O - uima.cas.String - - - componentId - Indicates which NLP component has been used to derive the annotation, C - uima.cas.String - - - id - - uima.cas.String - - - - - de.julielab.jcore.types.DiscontinuousAnnotation - Chains annotations of the same type - de.julielab.jcore.types.Annotation - - - value - Annotations to be chained. 
- uima.cas.FSArray - de.julielab.jcore.types.Annotation - - - - - de.julielab.jcore.types.ResourceEntry - The reference to an external resource - de.julielab.jcore.types.Annotation - - - source - The name of the resource, C - uima.cas.String - - - entryId - The identifier of the entry, C - uima.cas.String - - - version - The version of the resource, C - uima.cas.String - - - - - de.julielab.jcore.types.AbstractSectionHeading - The heading of a section of a structured abstract as - used by MEDLINE and PubMed. - The TitleType feature value should always be 'abstractSection'. - de.julielab.jcore.types.Title - - - label - The author-given label to the abstract section this heading belongs to. - uima.cas.String - - - nlmCategory - The NLM category associated with the section label given by the authors (see feature - 'label'). - uima.cas.String - - - - - de.julielab.jcore.types.pubmed.InternalReference - Internal references with a special feature for PMC related reference types. It would be a - cleaner class hierarchy if this annotation type would be a subtype of - de.julielab.jcore.types.InternalReference. However, this won't work because this general purpose type - does not have a restriction on the possible reference types. For PubMed/PMC we want to restrict the - reference type to the ones valid for PubMed and PMC. But we cannot overwrite the base feature. Thus we - can't extend the base InternalReference type but create a new one. - de.julielab.jcore.types.Annotation - - - reftype - The reference type: literature, figure, footnote etc. - de.julielab.jcore.types.pubmed.ReferenceType - - - refid - The ID of the referenced object. - uima.cas.String - - - - - de.julielab.jcore.types.pubmed.ReferenceType - A list of possible reference types specific to the PubMed InternalReference type. Includes the - basic reference types plus some PMC specific ones. - uima.cas.String - - - affiliation - A reference to an affiliation. - - - appendix - Reference to the appendix. 
- - - authornotes - Reference to author notes. - - - bibliography - Reference to an item of the bibliography. - - - chemical - A reference to a chemical. - - - contributor - Reference to a contributor. - - - correspondingauthor - Reference to a corresponding author. - - - displayformula - A formular typeset for display. This mostly means that it is set on a line of its own - with some padding using larger fonts. - - - figure - Reference to a figure. - - - footnote - The number of a footnote. - - - keyword - Reference to a keyword. - - - list - Reference to a list or a list item. - - - other - A reference of another type than enumerated in this type. - - - plate - Reference to a plate. - - - scheme - Reference to a scheme. - - - section - Reference to a section. - - - statement - Reference to a statement. - - - supplementary - Reference to supplementary information. - - - table - Reference to a table. - - - tablefootnote - Reference to a table footnote. - - - textbox - Reference to a textbox or sidebar. - - - - - de.julielab.jcore.types.Paragraph - A paragraph is a self-contained unit of discourse in a written text dealing with a particular point or idea. - de.julielab.jcore.types.Zone - - - de.julielab.jcore.types.Section - a section is a part of the text that often has a heading, an id, a section type, figures, tables, citations and footnotes that occur in this section - de.julielab.jcore.types.Zone - - - sectionHeading - the title of the section - de.julielab.jcore.types.Title - - - sectionType - the type of the section (e.g. results) - uima.cas.String - - - textObjects - the text objects (figure, table, boxed text etc.) that are associated with a particular section - uima.cas.FSArray - de.julielab.jcore.types.TextObject - true - - - sectionId - the id of the section, for example as mentioned in the original file, or level of the section - uima.cas.String - - - depth - depth of the section, e.g. 0 -> section, 1 -> subsection, 2 -> sub-subsection, ... 
- uima.cas.Integer - - - label - The section label, if given. This might, for example, just be the section number. - uima.cas.String - - - - - de.julielab.jcore.types.Zone - A Zone is a distinct division of text. It is an abstract Type and provides a parent type for sub-types which represent various kinds of text zones. - de.julielab.jcore.types.Annotation - - - de.julielab.jcore.types.Title - Title annotates titles covering various text units, including the whole paper, sections and subsections. - de.julielab.jcore.types.Zone - - - titleType - The type of the title: -table: title of a table -figure: title of a figure -caption: title of a caption -footnote: title of a footnote - de.julielab.jcore.types.TitleType - - - - - de.julielab.jcore.types.Caption - the caption of figures, tables etc. - -feature captionTitle is a Title-Annotation of the title of the caption, if existent. - -feature captionType is an Enumeration, stating to what type of entity the caption belongs, e.g. figure or table - de.julielab.jcore.types.Zone - - - captionTitle - The title of a figure / table caption, if it exists - de.julielab.jcore.types.Title - - - captionType - The type of entity, this caption belongs to, e.g. figure or table - de.julielab.jcore.types.CaptionType - - - - - de.julielab.jcore.types.TextObject - Object, on our case, are annotations such as figures, tables, boxed text etc. - de.julielab.jcore.types.Zone - - - objectType - such as figure, table, boxed-text etc. - uima.cas.String - - - objectId - the id of the object as found in the text - uima.cas.String - - - objectLabel - the label of an object - uima.cas.String - - - objectCaption - the caption that comes with the object - de.julielab.jcore.types.Caption - - - objectTitle - The title annotation of the text object, if it exists. The title might correspond to the objectLabel (which is of type String and thus no annotation on its own). 
- de.julielab.jcore.types.Title - - - - - de.julielab.jcore.types.AbstractText - Annotation of the complete abstract. - de.julielab.jcore.types.Zone - - - structuredAbstractParts - A List of all parts of a structured abstract. Empty, if the abstract consists of only one part and has no titles. - uima.cas.FSArray - de.julielab.jcore.types.AbstractSection - - - - - de.julielab.jcore.types.Style - Text-markup information (italic, bold etc.) on any (e.g. character) level. Allows to keep the original style markup of the text, several style types might be set to same (or overlapping) range, when different styles are set to the same text region. - de.julielab.jcore.types.Annotation - - - styleName - the name of the style used. - de.julielab.jcore.types.StyleName - - - encoding - the encoding used. - de.julielab.jcore.types.Encoding - - - - - de.julielab.jcore.types.StyleName - - uima.cas.String - - - slanted - - - - italic - - - - bold - - - - underscore - - - - stricke-though - - - - - - de.julielab.jcore.types.Encoding - Encoding Information - uima.cas.String - - - UTF-8 - - - - UTF-16 - - - - - - de.julielab.jcore.types.List - used for annotation of lists - de.julielab.jcore.types.Zone - - - itemList - contains items of the level 1. The items of the level 1 could contain further items of next level and so on in order to represent an iterative structure of list items. - uima.cas.FSArray - de.julielab.jcore.types.ListItem - true - - - - - de.julielab.jcore.types.ListItem - item of a list - de.julielab.jcore.types.Zone - - - itemList - items of the next level (sub-items) - uima.cas.FSArray - de.julielab.jcore.types.ListItem - true - - - level - Level of indentation of the list item. - uima.cas.Integer - - - - - de.julielab.jcore.types.SectionTitle - Title of a text section in contrast to the title of the whole document. - de.julielab.jcore.types.Title - - - depth - depth of the section, e.g. 0 -> section, 1 -> subsection, 2 -> sub-subsection, ... 
- uima.cas.Integer - - - - - de.julielab.jcore.types.Figure - Annotation for all elements in the CAS that belong to figures, e.g. figure title, figure caption etc. - de.julielab.jcore.types.TextObject - - - de.julielab.jcore.types.Table - An annotation for CAS elements that belong to a table, e.g. table title, table caption etc. - de.julielab.jcore.types.TextObject - - - footnotes - An array collecting all footnotes, appearing in this table - uima.cas.FSArray - de.julielab.jcore.types.Footnote - - - - - de.julielab.jcore.types.Footnote - Footnotes of all kinds, i.e. footnotes found in running text or in tables etc. - de.julielab.jcore.types.Zone - - - footnoteTitle - - de.julielab.jcore.types.Title - - - - - de.julielab.jcore.types.AbstractSection - - de.julielab.jcore.types.Zone - - - abstractSectionHeading - The title of a part of a structured abstract, e.g. "Background", "Methods", "Results", ... - de.julielab.jcore.types.Title - - - - - de.julielab.jcore.types.Bibliography - The whole bibliography, beginning with a title like e.g. References or Literature and ending at the end of the last reference. - de.julielab.jcore.types.Zone - - - de.julielab.jcore.types.TitleType - The type of a title: -abstract: title of an abstract or abstract part -table: title of a table -figure: title of a figure -caption: title of a caption -footnote: title of a footnote -abstract: title of an abstract or abstract part -document: title of the document - uima.cas.String - - - table - - - - figure - - - - caption - - - - footnote - - - - abstractSection - Denotes the heading of a section of a structured abstract - - - document - Denotes the document title - - - other - - - - section - - - - - - de.julielab.jcore.types.CaptionType - The type of entity the Caption belongs to, e.g. 
a figure or a table - uima.cas.String - - - table - - - - figure - - - - other - - - - - - de.julielab.jcore.types.Segment - A segment is comparable to "de.julielab.jcore.types.Section" but for medical document (e.g. discharge summaries) we want a distinction between these types - de.julielab.jcore.types.Zone - - - value - SecTag reference?? - uima.cas.String - - - - - de.julielab.jcore.types.InternalReference - Used for document-internal references like literature references, footnotes etc. - de.julielab.jcore.types.Annotation - - - reftype - The reference type: literature, figure, footnote etc. - uima.cas.String - - - refid - The ID of the referenced object. - uima.cas.String - - - - - - - - - true - true - false - - - diff --git a/uima/preprocessing/desc/PM Classifier 2018.xml b/uima/preprocessing/desc/PM Classifier 2018.xml deleted file mode 100644 index db3fe06e..00000000 --- a/uima/preprocessing/desc/PM Classifier 2018.xml +++ /dev/null @@ -1,1213 +0,0 @@ - - - org.apache.uima.java - true - de.julielab.jcore.ae.pmclassifier.PMClassificationAnnotator - - PM Classifier 2018 - Descriptor automatically generated by uimaFIT - unknown - de.julielab.jcore.ae.pmclassifier - - - PmModel - - String - false - true - - - - - PmModel - - malletPmClassifier2018.mod.gz - - - - - - - de.julielab.jcore.types.pubmed.Header - The special Header for PubMed (http://www.pubmed.org) - documents - de.julielab.jcore.types.Header - - - citationStatus - Indicates the status of citation of a PubMed document - de.julielab.jcore.types.CitationStatus - - - otherIDs - Other IDs (then the PubMed ID) may delivered by partners of the NLM to PubMed/Medline - abstracts. If available, this feature discloses such other IDs and the respective sources. 
- uima.cas.FSArray - de.julielab.jcore.types.pubmed.OtherID - true - - - - - de.julielab.jcore.types.pubmed.ManualDescriptor - The special type for PubMed documents - de.julielab.jcore.types.ManualDescriptor - - - meSHList - A collection of objects of type - uima.julielab.uima.MeSHHeading - uima.cas.FSArray - de.julielab.jcore.types.MeshHeading - true - - - chemicalList - A collection of objects of type - uima.julielab.uima.Chemical - uima.cas.FSArray - de.julielab.jcore.types.Chemical - true - - - dBInfoList - A collection of objects of type - uima.julielab.uima.DBInfo - uima.cas.FSArray - de.julielab.jcore.types.DBInfo - true - - - keywordList - A collection of objects of type - uima.julielab.uima.Keyword - uima.cas.FSArray - de.julielab.jcore.types.Keyword - true - - - geneSymbolList - GeneSymbolList in PubMed - uima.cas.StringArray - - - - - de.julielab.jcore.types.CitationStatus - A citation status (PubMed relevant) - uima.cas.String - - - Completed - completed - - - In-Process - in-process - - - PubMed-not-MEDLINE - pubmed-not-medline - - - In-Data-Review - in-data-review - - - Publisher - publisher - - - MEDLINE - medline - - - OLDMEDLINE - oldmedline - - - - - de.julielab.jcore.types.pubmed.OtherID - PubMed/Medline abstracts sometimes have other IDs besides their PMID from different sources. - This type discloses the respective ID and source. For details see - https://www.nlm.nih.gov/bsd/mms/medlineelements.html#oid - de.julielab.jcore.types.Annotation - - - id - The "other" ID of the document (e.g. Pubmed Central). - uima.cas.String - - - source - The source that assigned the ID found at the 'id' feature to this document. - uima.cas.String - - - - - de.julielab.jcore.types.DocumentAnnotation - The super-type for the document annotation types (Header, Descriptor) - de.julielab.jcore.types.Annotation - - - de.julielab.jcore.types.Header - The Header type stores the bibliographical document information. 
- de.julielab.jcore.types.DocumentAnnotation - - - docType - The type of the document (e.g. abstract, fulltext, randomized). - de.julielab.jcore.types.DocType - - - source - The source of the document (e.g. WWW, database, etc). - uima.cas.String - - - docId - The identifier of the document with respect to its source. E.g.: PMID in PubMed. In combination with the source, this is a unique identifier for a document. - uima.cas.String - - - copyright - Copyright information. - uima.cas.String - - - truncated - Indicates whether the document is truncated. - uima.cas.Boolean - - - authors - The authors of the document. - uima.cas.FSArray - de.julielab.jcore.types.AuthorInfo - true - - - title - The title of the document. - uima.cas.String - - - pubTypeList - The list of the publication types. - uima.cas.FSArray - de.julielab.jcore.types.PubType - - - language - The language of the document. - de.julielab.jcore.types.Language - - - doi - document object identifier - uima.cas.String - - - - - de.julielab.jcore.types.PubType - An abstract type which should be used to store information on the publication. See subtypes Journal and an accumulative type (OtherPub) - de.julielab.jcore.types.Annotation - - - name - The name of the publication type (e.g. journal, technical report, book). - uima.cas.String - - - pubDate - The date on which the document was published. - de.julielab.jcore.types.Date - - - - - de.julielab.jcore.types.Journal - This type contains attributes to describe a journal publication. - de.julielab.jcore.types.PubType - - - ISSN - The international standard serial number. - uima.cas.String - - - volume - The volume number of the journal in which the article was published. - uima.cas.String - - - title - Full journal title. - uima.cas.String - - - impactFactor - The impact factor of the journal at the time of publication. - uima.cas.String - - - shortTitle - the short title of the Journal (e.g. 
"Nicotine Tob Res" for "Nicotine & tobacco research : official journal of the Society for Research on Nicotine and Tobacco") - uima.cas.String - - - issue - Issue of Journal - uima.cas.String - - - pages - Pages of Journal - uima.cas.String - - - nlmId - the nlm id of the journal - uima.cas.String - - - - - de.julielab.jcore.types.Date - Type to store dates - de.julielab.jcore.types.Annotation - - - day - day of the month. - uima.cas.Integer - - - month - month of the year. - uima.cas.Integer - - - year - full year (e.g. 2006 and NOT 06). - uima.cas.Integer - - - - - de.julielab.jcore.types.OtherPub - Accumulative type for all other types of publications - de.julielab.jcore.types.PubType - - - de.julielab.jcore.types.Descriptor - An abstract type for the document descriptors, there are two subclasses: one for automatically acquired meta information and one for meta information that was added manually (such as e.g. MeSH term lists) - de.julielab.jcore.types.Annotation - - - de.julielab.jcore.types.AutoDescriptor - The descriptor type for automatically (i.e. algorithmically) acquired meta information. It can be refined and extended. - de.julielab.jcore.types.Descriptor - - - documentClasses - Here the document classification result will be stored, for example for storing infos if the document is age-related or not. - uima.cas.FSArray - de.julielab.jcore.types.DocumentClass - - - documentTopics - A list of document topics derived from a topic model. - uima.cas.FSArray - de.julielab.jcore.types.DocumentTopics - - - - - de.julielab.jcore.types.ManualDescriptor - The descriptor type for the manually added information. - de.julielab.jcore.types.Descriptor - - - metaInfo - meta information about thos document, for example who is the annotator and which semantic types are annotated in this document - uima.cas.String - - - - - de.julielab.jcore.types.MeshHeading - Medical Subject Headings, see NLM's MeSH for a detailed description. 
- de.julielab.jcore.types.Annotation - - - descriptorName - see MeSH - uima.cas.String - - - qualifierName - see MeSH - uima.cas.String - - - descriptorNameMajorTopic - see MeSH - uima.cas.Boolean - - - qualifierNameMajorTopic - see MeSH - uima.cas.Boolean - - - - - de.julielab.jcore.types.Chemical - A chemical type - de.julielab.jcore.types.Annotation - - - registryNumber - A unique 5 to 9 digit number in hyphenated format assigned by the Chemical Abstract Service to specify chemical substances, a zero is a valid number when an actual number cannot be located or is not yet available.. - uima.cas.String - - - nameOfSubstance - The name of the substance that the registry number or the E.C. number identifies. - uima.cas.String - - - - - de.julielab.jcore.types.DBInfo - References to other databases, e.g. SwissProt, INTERPRO e.g. - de.julielab.jcore.types.Annotation - - - name - The name of the DB referred to - uima.cas.String - - - acList - A list of accession numbers for this DB. - uima.cas.StringArray - - - - - de.julielab.jcore.types.Keyword - A term of a controlled keyword list to describe the content of the publication. - de.julielab.jcore.types.Annotation - - - name - The name of the keyword. - uima.cas.String - - - source - The keyword source (terminology). - uima.cas.String - - - - - de.julielab.jcore.types.DocType - The document type - uima.cas.String - - - abstract - abstract - - - fulltext - fulltext - - - randomized - randomized - - - article - used for Wikipedia articles - - - disambiguation-page - used for Wikipedia disambiguation-pages - - - - - de.julielab.jcore.types.Language - The language of the document. Possible values: based on ISO-639-1 language codes; we start with a first selection (which might be extended later). 
- uima.cas.String - - - de - German - - - en - English - - - es - Espagnol - - - fr - French - - - it - Italian - - - pt - Portuguese - - - other - Other languages - - - eng - English - - - ger - German - - - fre - Franz̦sisch - - - ita - Italienisch - - - - - de.julielab.jcore.types.DocumentClass - A document class specification for the CAS' document text. - uima.tcas.Annotation - - - classname - The name of the document class this CAS has been classified to. - uima.cas.String - - - confidence - Confidence value of the classification into this class. - uima.cas.Double - - - - - de.julielab.jcore.types.AuthorInfo - AuthorInfo Type annotates the text segments containing the information about an author and his/her affiliation information. - de.julielab.jcore.types.Annotation - - - foreName - An author̢۪s forename,C - uima.cas.String - - - affiliation - Affiliation of the author. - uima.cas.String - - - contact - Contact information (emails, phones, etc.). - uima.cas.String - - - lastName - The last name of the author. 
- uima.cas.String - - - initials - Initials - uima.cas.String - - - - - de.julielab.jcore.types.RelatedArticle - - de.julielab.jcore.types.Annotation - - - relatedArticle - - uima.cas.String - - - - - de.julielab.jcore.types.RelatedArticleList - - de.julielab.jcore.types.Annotation - - - relatedArticles - - uima.cas.FSArray - de.julielab.jcore.types.RelatedArticle - true - - - - - de.julielab.jcore.types.FullTextLink - - de.julielab.jcore.types.Annotation - - - url - - uima.cas.String - - - iconUrl - - uima.cas.String - - - - - de.julielab.jcore.types.FullTextLinkList - - de.julielab.jcore.types.Annotation - - - fullTextLinks - - uima.cas.FSArray - de.julielab.jcore.types.FullTextLink - true - - - - - de.julielab.jcore.types.DocumentTopics - Topics label documents with vectors weights for their semantically most prominent words - de.julielab.jcore.types.DocumentAnnotation - - - Weights - Vector of weights denoting the semantical descriptivity of one word for the respective topic - uima.cas.DoubleArray - - - IDs - IDs for topics determined by the modeling implementation - uima.cas.IntegerArray - - - ModelID - ID identifying the model holding the topics to be labeled - uima.cas.String - - - TopicWords - The top words for the respective topic - uima.cas.StringArray - - - ModelVersion - Version of the model holding the topics - uima.cas.String - - - - - de.julielab.jcore.types.Annotation - The super-type for all types. - uima.tcas.Annotation - - - confidence - The component that made the annotation may put its confidence/score calculated internally here, O - uima.cas.String - - - componentId - Indicates which NLP component has been used to derive the annotation, C - uima.cas.String - - - id - - uima.cas.String - - - - - de.julielab.jcore.types.DiscontinuousAnnotation - Chains annotations of the same type - de.julielab.jcore.types.Annotation - - - value - Annotations to be chained. 
- uima.cas.FSArray - de.julielab.jcore.types.Annotation - - - - - de.julielab.jcore.types.ResourceEntry - The reference to an external resource - de.julielab.jcore.types.Annotation - - - source - The name of the resource, C - uima.cas.String - - - entryId - The identifier of the entry, C - uima.cas.String - - - version - The version of the resource, C - uima.cas.String - - - - - de.julielab.jcore.types.AbstractSectionHeading - The heading of a section of a structured abstract as - used by MEDLINE and PubMed. - The TitleType feature value should always be 'abstractSection'. - de.julielab.jcore.types.Title - - - label - The author-given label to the abstract section this heading belongs to. - uima.cas.String - - - nlmCategory - The NLM category associated with the section label given by the authors (see feature - 'label'). - uima.cas.String - - - - - de.julielab.jcore.types.pubmed.InternalReference - Internal references with a special feature for PMC related reference types. It would be a - cleaner class hierarchy if this annotation type would be a subtype of - de.julielab.jcore.types.InternalReference. However, this won't work because this general purpose type - does not have a restriction on the possible reference types. For PubMed/PMC we want to restrict the - reference type to the ones valid for PubMed and PMC. But we cannot overwrite the base feature. Thus we - can't extend the base InternalReference type but create a new one. - de.julielab.jcore.types.Annotation - - - reftype - The reference type: literature, figure, footnote etc. - de.julielab.jcore.types.pubmed.ReferenceType - - - refid - The ID of the referenced object. - uima.cas.String - - - - - de.julielab.jcore.types.pubmed.ReferenceType - A list of possible reference types specific to the PubMed InternalReference type. Includes the - basic reference types plus some PMC specific ones. - uima.cas.String - - - affiliation - A reference to an affiliation. - - - appendix - Reference to the appendix. 
- - - authornotes - Reference to author notes. - - - bibliography - Reference to an item of the bibliography. - - - chemical - A reference to a chemical. - - - contributor - Reference to a contributor. - - - correspondingauthor - Reference to a corresponding author. - - - displayformula - A formular typeset for display. This mostly means that it is set on a line of its own - with some padding using larger fonts. - - - figure - Reference to a figure. - - - footnote - The number of a footnote. - - - keyword - Reference to a keyword. - - - list - Reference to a list or a list item. - - - other - A reference of another type than enumerated in this type. - - - plate - Reference to a plate. - - - scheme - Reference to a scheme. - - - section - Reference to a section. - - - statement - Reference to a statement. - - - supplementary - Reference to supplementary information. - - - table - Reference to a table. - - - tablefootnote - Reference to a table footnote. - - - textbox - Reference to a textbox or sidebar. - - - - - de.julielab.jcore.types.Paragraph - A paragraph is a self-contained unit of discourse in a written text dealing with a particular point or idea. - de.julielab.jcore.types.Zone - - - de.julielab.jcore.types.Section - a section is a part of the text that often has a heading, an id, a section type, figures, tables, citations and footnotes that occur in this section - de.julielab.jcore.types.Zone - - - sectionHeading - the title of the section - de.julielab.jcore.types.Title - - - sectionType - the type of the section (e.g. results) - uima.cas.String - - - textObjects - the text objects (figure, table, boxed text etc.) that are associated with a particular section - uima.cas.FSArray - de.julielab.jcore.types.TextObject - true - - - sectionId - the id of the section, for example as mentioned in the original file, or level of the section - uima.cas.String - - - depth - depth of the section, e.g. 0 -> section, 1 -> subsection, 2 -> sub-subsection, ... 
- uima.cas.Integer - - - label - The section label, if given. This might, for example, just be the section number. - uima.cas.String - - - - - de.julielab.jcore.types.Zone - A Zone is a distinct division of text. It is an abstract Type and provides a parent type for sub-types which represent various kinds of text zones. - de.julielab.jcore.types.Annotation - - - de.julielab.jcore.types.Title - Title annotates titles covering various text units, including the whole paper, sections and subsections. - de.julielab.jcore.types.Zone - - - titleType - The type of the title: -table: title of a table -figure: title of a figure -caption: title of a caption -footnote: title of a footnote - de.julielab.jcore.types.TitleType - - - - - de.julielab.jcore.types.Caption - the caption of figures, tables etc. - -feature captionTitle is a Title-Annotation of the title of the caption, if existent. - -feature captionType is an Enumeration, stating to what type of entity the caption belongs, e.g. figure or table - de.julielab.jcore.types.Zone - - - captionTitle - The title of a figure / table caption, if it exists - de.julielab.jcore.types.Title - - - captionType - The type of entity, this caption belongs to, e.g. figure or table - de.julielab.jcore.types.CaptionType - - - - - de.julielab.jcore.types.TextObject - Object, on our case, are annotations such as figures, tables, boxed text etc. - de.julielab.jcore.types.Zone - - - objectType - such as figure, table, boxed-text etc. - uima.cas.String - - - objectId - the id of the object as found in the text - uima.cas.String - - - objectLabel - the label of an object - uima.cas.String - - - objectCaption - the caption that comes with the object - de.julielab.jcore.types.Caption - - - objectTitle - The title annotation of the text object, if it exists. The title might correspond to the objectLabel (which is of type String and thus no annotation on its own). 
- de.julielab.jcore.types.Title - - - - - de.julielab.jcore.types.AbstractText - Annotation of the complete abstract. - de.julielab.jcore.types.Zone - - - structuredAbstractParts - A List of all parts of a structured abstract. Empty, if the abstract consists of only one part and has no titles. - uima.cas.FSArray - de.julielab.jcore.types.AbstractSection - - - - - de.julielab.jcore.types.Style - Text-markup information (italic, bold etc.) on any (e.g. character) level. Allows to keep the original style markup of the text, several style types might be set to same (or overlapping) range, when different styles are set to the same text region. - de.julielab.jcore.types.Annotation - - - styleName - the name of the style used. - de.julielab.jcore.types.StyleName - - - encoding - the encoding used. - de.julielab.jcore.types.Encoding - - - - - de.julielab.jcore.types.StyleName - - uima.cas.String - - - slanted - - - - italic - - - - bold - - - - underscore - - - - stricke-though - - - - - - de.julielab.jcore.types.Encoding - Encoding Information - uima.cas.String - - - UTF-8 - - - - UTF-16 - - - - - - de.julielab.jcore.types.List - used for annotation of lists - de.julielab.jcore.types.Zone - - - itemList - contains items of the level 1. The items of the level 1 could contain further items of next level and so on in order to represent an iterative structure of list items. - uima.cas.FSArray - de.julielab.jcore.types.ListItem - true - - - - - de.julielab.jcore.types.ListItem - item of a list - de.julielab.jcore.types.Zone - - - itemList - items of the next level (sub-items) - uima.cas.FSArray - de.julielab.jcore.types.ListItem - true - - - level - Level of indentation of the list item. - uima.cas.Integer - - - - - de.julielab.jcore.types.SectionTitle - Title of a text section in contrast to the title of the whole document. - de.julielab.jcore.types.Title - - - depth - depth of the section, e.g. 0 -> section, 1 -> subsection, 2 -> sub-subsection, ... 
- uima.cas.Integer - - - - - de.julielab.jcore.types.Figure - Annotation for all elements in the CAS that belong to figures, e.g. figure title, figure caption etc. - de.julielab.jcore.types.TextObject - - - de.julielab.jcore.types.Table - An annotation for CAS elements that belong to a table, e.g. table title, table caption etc. - de.julielab.jcore.types.TextObject - - - footnotes - An array collecting all footnotes, appearing in this table - uima.cas.FSArray - de.julielab.jcore.types.Footnote - - - - - de.julielab.jcore.types.Footnote - Footnotes of all kinds, i.e. footnotes found in running text or in tables etc. - de.julielab.jcore.types.Zone - - - footnoteTitle - - de.julielab.jcore.types.Title - - - - - de.julielab.jcore.types.AbstractSection - - de.julielab.jcore.types.Zone - - - abstractSectionHeading - The title of a part of a structured abstract, e.g. "Background", "Methods", "Results", ... - de.julielab.jcore.types.Title - - - - - de.julielab.jcore.types.Bibliography - The whole bibliography, beginning with a title like e.g. References or Literature and ending at the end of the last reference. - de.julielab.jcore.types.Zone - - - de.julielab.jcore.types.TitleType - The type of a title: -abstract: title of an abstract or abstract part -table: title of a table -figure: title of a figure -caption: title of a caption -footnote: title of a footnote -abstract: title of an abstract or abstract part -document: title of the document - uima.cas.String - - - table - - - - figure - - - - caption - - - - footnote - - - - abstractSection - Denotes the heading of a section of a structured abstract - - - document - Denotes the document title - - - other - - - - section - - - - - - de.julielab.jcore.types.CaptionType - The type of entity the Caption belongs to, e.g. 
a figure or a table - uima.cas.String - - - table - - - - figure - - - - other - - - - - - de.julielab.jcore.types.Segment - A segment is comparable to "de.julielab.jcore.types.Section" but for medical document (e.g. discharge summaries) we want a distinction between these types - de.julielab.jcore.types.Zone - - - value - SecTag reference?? - uima.cas.String - - - - - de.julielab.jcore.types.InternalReference - Used for document-internal references like literature references, footnotes etc. - de.julielab.jcore.types.Annotation - - - reftype - The reference type: literature, figure, footnote etc. - uima.cas.String - - - refid - The ID of the referenced object. - uima.cas.String - - - - - - - - - true - true - false - - - diff --git a/uima/preprocessing/desc/XMI Database Multiplier Reader.xml b/uima/preprocessing/desc/XMI Database Multiplier Reader.xml index 73a99b2a..a47a1bd9 100644 --- a/uima/preprocessing/desc/XMI Database Multiplier Reader.xml +++ b/uima/preprocessing/desc/XMI Database Multiplier Reader.xml @@ -1,198 +1,394 @@ + org.apache.uima.java + de.julielab.jcore.reader.xmi.XmiDBMultiplierReader + + XMI Database Multiplier Reader + This is an extension of the DBMultiplierReader to handle JeDIS XMI annotation module data. + + + ReadsBaseDocument + Indicates if this reader reads segmented annotation data. If set to false, the XMI data is expected to represent complete annotated documents. If it is set to true, a segmented annotation graph is expected and the table given with the 'Table' parameter will contain the document text together with some basic annotations. What exactly is stored in which manner is determined by the jcore-xmi-db-consumer used to write the data into the database. + Boolean + false + true + + + StoreMaxXmiId + This parameter is required to be set to true, if this reader is contained in a pipeline that also contains a jcore-xmi-db-writer andthe writer will segment the CAS annotation graph and store only parts of it. 
Then, it is important to keep track of the free XMI element IDs that may be assigned to new annotation elements to avoid ID clashes when assembling an XMI document from separately stored annotation graph segments. + Boolean + false + false + + + IncreasedAttributeSize + Maximum XML attribute size in bytes. Since the CAS document text is stored as an XMI attribute, it might happen for large documents that there is an error because the maximum attribute size is exceeded. This parameter allows specifying the maximum attribute size in order to avoid such errors. Should only be set if required. + Integer + false + false + + + XercesAttributeBufferSize + Initial XML parser buffer size in bytes. For large documents, it can happen that XMI parsing is extremely slow. By employing monitoring tools like the jconsole or (j)visualvm, the hot spots of work can be identified. If one of those is the XML attribute buffer resizing, this parameter should be set to a size that makes buffer resizing unnecessary. + Integer + false + false + + + SendCasToLast + UIMA DUCC relevant parameter when using a CAS multiplier. When set to true, the worker CAS from the collection reader is forwarded to the last component in the pipeline. This can be used to send information about the progress to the CAS consumer in order to have it perform batch operations. For this purpose, a feature structure of type WorkItem from the DUCC library is added to the worker CAS. This feature structure has information about the current progress. + Boolean + false + false + + + ResetTable + If set to true and the parameter 'Table' is set to a subset table, the subset table will be reset at the initialization of the reader to be ready for processing of the whole subset. Do not use when multiple readers read the same subset table.
+ Boolean + false + false + + + FetchIdsProactively + If set to true and when reading from a subset table, batches of document IDs will be retrieved in a background thread while the previous batch is already in process. This is meant to minimize waiting time for the database. Deactivate this feature if you encounter issues with database connections. + Boolean + false + true + + + AdditionalTables + An array of table names or a string in the form of a qualified Java class, i.e. a dot-separated path. In the latter case, an existing table is searched for by converting the dots to underscores. A specific Postgres schema can be specified by prepending the Java-style path with a schema name followed by a colon, e.g. 'myschema:de.julielab.jcore.types.Token'. By default, the table names will be resolved against the active data postgres schema configured in the CoStoSys configuration file. If a name is already schema qualified, i.e. contains a dot or a colon, the active data schema will be ignored for this table. When reading documents from the document data table, the additional tables will be joined onto the data table using the primary keys of the queried documents. Using the table schema for the additional documents defined by the 'AdditionalTableSchema' parameter, the columns that are marked as 'retrieve=true' in the table schema, are returned together with the main document data. This mechanism is most prominently used to retrieve annotation table data together with the original document text in XMI format for the JeDIS system. + String + true + false + + + AdditionalTableSchemas + The table schemas that correspond to the additional tables given with the 'AdditionalTables' parameter. If only one schema name is given, that schema must apply to all additional tables. + String + true + false + + + AdditionalTablesPostgresSchema + This optional parameter specifies the Postgres schema in which the additional tables to read are searched by default.
If omitted, the active data schema from the CoStoSys configuration is assumed. The default can be overwritten for individual types. For details, see the description of the 'AdditionalTables' parameter. + String + false + false + + + BatchSize + + Integer + false + false + + + DBDriver + Currently unused because the Hikari JDBC library should recognize the correct driver. However, there seem to be cases where this doesn't work (HSQLDB). So we keep the parameter for later. When this issue comes up, the driver would have to be set manually. This isn't done right now. + String + false + false + + + Table + The data or subset database table to read from. The name will be resolved against the active Postgres schema defined in the CoStoSys configuration file. However, if the name contains a schema qualification (i.e. 'schemaname.tablename'), the configuration file will be ignored at this point. + String + false + true + + + SelectionOrder + WARNING: Potential SQL injection vulnerability. Do not let unknown users interact with your database with this component. An SQL ORDER clause specifying in which order the documents in the target database table should be processed. Only the clause itself must be specified, the ORDER keyword is automatically added. + String + false + false + + + WhereCondition + WARNING: Potential SQL injection vulnerability. Do not let unknown users interact with your database with this component. Only used when reading data tables directly. No effect when the 'tableName' parameter specifies a subset table. The parameter value should be an SQL WHERE clause restricting the documents to be read. Only the clause itself must be specified, the WHERE keyword is added automatically. + String + false + false + + + Limit + + Integer + false + false + + + CostosysConfigFile + File path or classpath resource location to the CoStoSys XML configuration.
This configuration must specify the table schema of the table referred to by the 'Table' parameter as active table schema. The active table schema is always the schema of the data table that is either queried directly for documents or, if 'tableName' points to a subset table, indirectly through the subset table. Make also sure that the active database connection in the configuration points to the correct database. + String + false + true + + + + + SendCasToLast + + false + + + + ResetTable + + false + + + + FetchIdsProactively + + true + + + + BatchSize + + 100 + + + + SelectionOrder + + + + + + Table + + preprocessing_all + + + + CostosysConfigFile + + ../costosys.xml + + + + ReadsBaseDocument + + true + + + + StoreMaxXmiId + + true + + + + + + + + + + + + true + false + true + + + diff --git a/uima/preprocessing/desc/aggregateAnalysisEngine.xml b/uima/preprocessing/desc/aggregateAnalysisEngine.xml index 9d1f062a..1876aefb 100644 --- a/uima/preprocessing/desc/aggregateAnalysisEngine.xml +++ b/uima/preprocessing/desc/aggregateAnalysisEngine.xml @@ -1,31 +1,84 @@ + org.apache.uima.java + false + - - + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + JCoRe Pipeline AAE + This AAE descriptor directly contains the analysis engines added through the JCoRe pipeline builder. The AAE serves to bundle all the components together. 
+ + + + - PM Classifier 2017 - PM Classifier 2018 + + JCoRe Sentence Annotator + + JCoRe Token Annotator + + JCoRe OpenNLP POS Tagger + + BioLemmatizer + + JCoRe Lingscope Negation AE + + + + + true + true + false + + + diff --git a/uima/preprocessing/desc/cpeAAE.xml b/uima/preprocessing/desc/cpeAAE.xml index 97b7f2b1..409fcc15 100644 --- a/uima/preprocessing/desc/cpeAAE.xml +++ b/uima/preprocessing/desc/cpeAAE.xml @@ -1,34 +1,66 @@ + org.apache.uima.java + false + + + + + + + - - + + + + + + + + + + + + + JCoRe Abstract Database Multiplier + JCoRe Pipeline AAE - JCoRe XMI Database Writer + + JCoRe Consumer AAE + + + + + true + true + false + + + diff --git a/uima/trec2018-pubmed-indexer/desc/JCoRe ElasticSearch Consumer.xml b/uima/trec2018-pubmed-indexer/desc/JCoRe ElasticSearch Consumer.xml index a06c6f2a..698faf38 100644 --- a/uima/trec2018-pubmed-indexer/desc/JCoRe ElasticSearch Consumer.xml +++ b/uima/trec2018-pubmed-indexer/desc/JCoRe ElasticSearch Consumer.xml @@ -173,7 +173,7 @@ - sigir19_v1.0_pmnn + sigir19_v1.0 diff --git a/uima/trec2018-pubmed-indexer/desc/XMI Database Multiplier Reader.xml b/uima/trec2018-pubmed-indexer/desc/XMI Database Multiplier Reader.xml index 0abc4c29..306d9cd5 100644 --- a/uima/trec2018-pubmed-indexer/desc/XMI Database Multiplier Reader.xml +++ b/uima/trec2018-pubmed-indexer/desc/XMI Database Multiplier Reader.xml @@ -345,7 +345,7 @@ - indexing_all + indexing_all2 @@ -376,6 +376,10 @@ de.julielab.jcore.types.Organism de.julielab.jcore.types.AutoDescriptor + + de.julielab.jcore.types.Scope + + de.julielab.jcore.types.PointMutation diff --git a/uima/trec2018-pubmed-indexer/descAll/JCoRe ElasticSearch Consumer.xml b/uima/trec2018-pubmed-indexer/descAll/JCoRe ElasticSearch Consumer.xml index a06c6f2a..698faf38 100644 --- a/uima/trec2018-pubmed-indexer/descAll/JCoRe ElasticSearch Consumer.xml +++ b/uima/trec2018-pubmed-indexer/descAll/JCoRe ElasticSearch Consumer.xml @@ -173,7 +173,7 @@ - sigir19_v1.0_pmnn + sigir19_v1.0 diff --git 
a/uima/trec2018-pubmed-indexer/descAll/XMI Database Multiplier Reader.xml b/uima/trec2018-pubmed-indexer/descAll/XMI Database Multiplier Reader.xml index 0abc4c29..306d9cd5 100644 --- a/uima/trec2018-pubmed-indexer/descAll/XMI Database Multiplier Reader.xml +++ b/uima/trec2018-pubmed-indexer/descAll/XMI Database Multiplier Reader.xml @@ -345,7 +345,7 @@ - indexing_all + indexing_all2 @@ -376,6 +376,10 @@ de.julielab.jcore.types.Organism de.julielab.jcore.types.AutoDescriptor + + de.julielab.jcore.types.Scope + + de.julielab.jcore.types.PointMutation diff --git a/uima/trec2018-pubmed-indexer/src/main/java/de/julielab/jcore/trec2018/pubmed/Trec2018FieldGenerator.java b/uima/trec2018-pubmed-indexer/src/main/java/de/julielab/jcore/trec2018/pubmed/Trec2018FieldGenerator.java index 218e31cb..658edf20 100644 --- a/uima/trec2018-pubmed-indexer/src/main/java/de/julielab/jcore/trec2018/pubmed/Trec2018FieldGenerator.java +++ b/uima/trec2018-pubmed-indexer/src/main/java/de/julielab/jcore/trec2018/pubmed/Trec2018FieldGenerator.java @@ -42,9 +42,29 @@ public Document addFields(JCas jCas, Document document) throws CASException, Fie addDocumentClasses(jCas, document); addGsInfo(jCas, document); addPublicationType(jCas, document); + addNegationScopes(jCas, document); + addMutations(jCas, document); return document; } + private void addNegationScopes(JCas jCas, Document document) { + Collection negationScopes = JCasUtil.select(jCas, Scope.class); + ArrayFieldValue fieldValue = new ArrayFieldValue(); + for (Scope scope : negationScopes) { + fieldValue.add(new RawToken(scope.getCoveredText())); + } + document.addField("negationPhrases", fieldValue); + } + + private void addMutations(JCas jCas, Document document) { + Collection mutations = JCasUtil.select(jCas, PointMutation.class); + ArrayFieldValue fieldValue = new ArrayFieldValue(); + for (PointMutation mutation : mutations) { + fieldValue.add(new RawToken(mutation.getSpecificType())); + } + document.addField("mutations", 
fieldValue); + } + private void addPublicationType(JCas jCas, Document document) { Header header = JCasUtil.selectSingle(jCas, Header.class); FSArray pubTypeList = header.getPubTypeList();