Skip to content

Commit

Permalink
This is the state in which the project was at SIGIR submission time.
Browse files Browse the repository at this point in the history
  • Loading branch information
khituras committed Jan 31, 2019
1 parent 9565c25 commit 85a3732
Show file tree
Hide file tree
Showing 22 changed files with 438 additions and 102 deletions.
61 changes: 44 additions & 17 deletions notebooks/sigir19/.ipynb_checkpoints/StatVis-checkpoint.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions scripts/runAllBoostsOptimization.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

whats="genedis fields posneg additional extra pmclass mutation drug"
#whats="genedis fields posneg additional extra pmclass mutation drug"
whats="pmclass"

mvn compile

Expand Down
5 changes: 3 additions & 2 deletions scripts/runAllPmClassExperimentsLiterature.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@
mvn compile
# For best_fields, the slop has no effect, but an argument value is still expected
sbatch --exclude=h5,h6 scripts/runPmClassExperimentsLiterature.sh best_fields OR 2017
sbatch --exclude=h5,h6 scripts/runPmClassExperimentsLiterature.sh best_fields AND 2017
#sbatch --exclude=h5,h6 scripts/runPmClassExperimentsLiterature.sh best_fields AND 2017

# Here, the boolean operator has no effect
sbatch --exclude=h5,h6 scripts/runPmClassExperimentsLiterature.sh phrase OR 2017


# For best_fields, the slop has no effect, but an argument value is still expected
sbatch --exclude=h5,h6 scripts/runPmClassExperimentsLiterature.sh best_fields OR 2018
sbatch --exclude=h5,h6 scripts/runPmClassExperimentsLiterature.sh best_fields AND 2018
#sbatch --exclude=h5,h6 scripts/runPmClassExperimentsLiterature.sh best_fields AND 2018

# Here, the boolean operator has no effect
# Also, we leave phrases out because we really need to reduce the number of experimental combinations
sbatch --exclude=h5,h6 scripts/runPmClassExperimentsLiterature.sh phrase OR 2018
3 changes: 1 addition & 2 deletions scripts/runBoostOptimizer.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/bin/bash
#SBATCH --mem 10G
#SBATCH --mem 40G
#SBATCH --cpus-per-task 4
#SBATCH -J boostopt


mvn exec:java -Dexec.mainClass=at.medunigraz.imi.bst.trec.SigirPubmedExperimenterBoostOptimizer -Dexec.args="$1 $2"
2 changes: 1 addition & 1 deletion scripts/runPmClassExperimentsLiterature.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash
#SBATCH --cpus-per-task 2
#SBATCH --mem 10G
#SBATCH --mem 40G
#SBATCH -J pmclassexp

mvn exec:java -Dexec.mainClass=at.medunigraz.imi.bst.trec.SigirPubmedExperimenterPmClass -Dexec.args="$1 $2 $3"
2 changes: 1 addition & 1 deletion scripts/runTermBoostExperimentsLiterature.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash
#SBATCH --cpus-per-task 5
#SBATCH --mem 10G
#SBATCH --mem 30G
#SBATCH -J termboostexp

mvn exec:java -Dexec.mainClass=at.medunigraz.imi.bst.trec.SigirPubmedTermBoostExperimenterDefaultBoosting -Dexec.args="$1 $2 $3"
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
Expand Down Expand Up @@ -52,6 +53,7 @@ public static Map<String, Document> readDocuments(File documentJsonZip) throws D
}

public static void addPMLabels(File gsTable, Map<String, Document> docsById) throws DataReadingException {
int pmAndNotPMCounter = 0;
try (CSVParser csvRecords = CSVFormat.TDF.withFirstRecordAsHeader().parse(FileUtilities.getReaderFromFile(gsTable))) {
Iterator<CSVRecord> it = csvRecords.iterator();
while (it.hasNext()) {
Expand All @@ -65,12 +67,16 @@ public static void addPMLabels(File gsTable, Map<String, Document> docsById) thr
LOG.warn("Null document for doc ID " + trecDocId + ". Record: " + record);
continue;
}
if (doc.getPmLabel() != null && doc.getPmLabel().equals("Not PM") && pmRelDesc.equalsIgnoreCase("PM"))
++pmAndNotPMCounter;
if (doc.getPmLabel() != null && doc.getPmLabel().equals("PM") && pmRelDesc.equalsIgnoreCase("Not PM"))
++pmAndNotPMCounter;
// "Once PM, always PM"
if (doc.getPmLabel() == null || doc.getPmLabel().equalsIgnoreCase("Not PM")) {
doc.setPMLabel(pmRelDesc);
}
}

LOG.info("Encountered {} documents labeled as PM and also as Not PM for different queries", pmAndNotPMCounter);
} catch (IOException e1) {
throw new DataReadingException(e1);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ public static void performCrossVal(PMClassifier classifier, String jsongsdocs, S

int corrAllover = 0;
Set<Document> onceRight = new HashSet<>();
List<Double> foldResults = new ArrayList<>();
for (int fold = 0; fold < numFolds; fold++) {
int currentFold = fold;
Map<String, Document> train = IntStream.range(0, numFolds).filter(i -> i != currentFold).mapToObj(partitions::get).flatMap(Collection::stream).collect(toMap(Document::getId, d -> d));
Expand All @@ -84,12 +85,15 @@ public static void performCrossVal(PMClassifier classifier, String jsongsdocs, S
LOG.info("Evaluation for fold " + fold + ":");
LOG.info("Total: " + test.size());
LOG.info("Correct: " + corr);
LOG.info("That is " + (corr / (double) test.size()) * 100 + "%");
final double acc = (corr / (double) test.size()) * 100;
LOG.info("That is " + acc + "%");
foldResults.add(acc);


corrAllover += corr;
}

LOG.info("All fold results: {}", foldResults);
LOG.info("Allover eval:");
LOG.info("Total: " + documents.size());
LOG.info("Correct: " + corrAllover);
Expand Down
104 changes: 98 additions & 6 deletions src/main/java/at/medunigraz/imi/bst/trec/SigirParameters.java
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
package at.medunigraz.imi.bst.trec;

import java.text.DecimalFormat;
import java.util.HashMap;
import java.util.Map;

public class SigirParameters {

public final static String BEST_FIELDS = "best_fields";
public final static String MOST_FIELDS = "most_fields";
public final static String PHRASE = "phrase";

public static final Map<String, String> TREC_2018_HPIPUBNONE = new HashMap<>();

static {
TREC_2018_HPIPUBNONE.put("dis_boost", "1.5");
TREC_2018_HPIPUBNONE.put("dis_topic_boost", "1");
Expand Down Expand Up @@ -46,11 +47,11 @@ public class SigirParameters {
TREC_2018_HPIPUBNONE.put("gene_syn_multi_match_type", PHRASE);
TREC_2018_HPIPUBNONE.put("gene_desc_multi_match_type", PHRASE);
TREC_2018_HPIPUBNONE.put("gene_hyper_multi_match_type", BEST_FIELDS);
TREC_2018_HPIPUBNONE.put("cancer_multi_match_type", BEST_FIELDS);
TREC_2018_HPIPUBNONE.put("dna_multi_match_type", BEST_FIELDS);
TREC_2018_HPIPUBNONE.put("neg_boost_multi_match_type", BEST_FIELDS);
TREC_2018_HPIPUBNONE.put("pos_boost_multi_match_type", BEST_FIELDS);
TREC_2018_HPIPUBNONE.put("dgi_multi_match_type", BEST_FIELDS);
TREC_2018_HPIPUBNONE.put("cancer_multi_match_type", PHRASE);
TREC_2018_HPIPUBNONE.put("dna_multi_match_type", PHRASE);
TREC_2018_HPIPUBNONE.put("neg_boost_multi_match_type", PHRASE);
TREC_2018_HPIPUBNONE.put("pos_boost_multi_match_type", PHRASE);
TREC_2018_HPIPUBNONE.put("dgi_multi_match_type", PHRASE);

TREC_2018_HPIPUBNONE.put("dis_operator", "or");
TREC_2018_HPIPUBNONE.put("dis_prefterm_operator", "or");
Expand Down Expand Up @@ -123,6 +124,97 @@ public class SigirParameters {
LITERATURE_ES_DEFAULTS.put("phrase_slop", "10");
}

/**
 * Parameter set {@code SIGIR19_BEST_2018}: starts from the entries of
 * {@link #LITERATURE_ES_DEFAULTS} and overrides the boosts, multi-match types,
 * operators, phrase slop and PM classifier field listed in the initializer below.
 * <p>
 * The map is initialised from a <em>copy</em> of {@code LITERATURE_ES_DEFAULTS}.
 * Assigning the defaults map directly would make both constants refer to the same
 * instance, so every {@code put} below would silently rewrite the shared defaults.
 */
public static final Map<String, String> SIGIR19_BEST_2018 = new HashMap<>(LITERATURE_ES_DEFAULTS);
static {
    // Boost factors.
    SIGIR19_BEST_2018.put("dis_boost", "2");
    SIGIR19_BEST_2018.put("dis_topic_boost", "1");
    SIGIR19_BEST_2018.put("dis_prefterm_boost", "1");
    SIGIR19_BEST_2018.put("dis_syn_boost", "1");
    SIGIR19_BEST_2018.put("dis_hyper_boost", "1");
    SIGIR19_BEST_2018.put("gene_boost", "1.5");
    SIGIR19_BEST_2018.put("gene_topic_boost", "1");
    SIGIR19_BEST_2018.put("gene_syn_boost", "1");
    SIGIR19_BEST_2018.put("gene_desc_boost", "1");
    SIGIR19_BEST_2018.put("gene_hyper_boost", "1");
    // Field boosts carry the '^' prefix themselves; an empty string sets no explicit value.
    SIGIR19_BEST_2018.put("title_boost", "^2.5");
    SIGIR19_BEST_2018.put("abstract_boost", "^2");
    SIGIR19_BEST_2018.put("keyword_boost", "");
    SIGIR19_BEST_2018.put("meshTags_boost", "");
    SIGIR19_BEST_2018.put("genes_field_boost", "");
    SIGIR19_BEST_2018.put("pos_words_boost", ".5");
    SIGIR19_BEST_2018.put("neg_words_boost", "0");
    SIGIR19_BEST_2018.put("cancer_boost", "0.5");
    SIGIR19_BEST_2018.put("chemo_boost", "1.5");
    SIGIR19_BEST_2018.put("dna_boost", "0.5");
    SIGIR19_BEST_2018.put("extra_boost", "1");
    SIGIR19_BEST_2018.put("pm_boost", "1");
    SIGIR19_BEST_2018.put("non_mel_boost", "1");
    SIGIR19_BEST_2018.put("pm_gs_boost", "1");
    SIGIR19_BEST_2018.put("dgi_boost", "0.5");
    SIGIR19_BEST_2018.put("mut_boost", "1");

    // Multi-match types per query clause.
    SIGIR19_BEST_2018.put("dis_multi_match_type", PHRASE);
    SIGIR19_BEST_2018.put("dis_prefterm_multi_match_type", PHRASE);
    SIGIR19_BEST_2018.put("dis_syn_multi_match_type", PHRASE);
    SIGIR19_BEST_2018.put("dis_hyper_multi_match_type", PHRASE);
    SIGIR19_BEST_2018.put("gene_multi_match_type", BEST_FIELDS);
    SIGIR19_BEST_2018.put("gene_syn_multi_match_type", PHRASE);
    SIGIR19_BEST_2018.put("gene_desc_multi_match_type", BEST_FIELDS);
    SIGIR19_BEST_2018.put("gene_hyper_multi_match_type", PHRASE);
    SIGIR19_BEST_2018.put("cancer_multi_match_type", PHRASE);
    SIGIR19_BEST_2018.put("dna_multi_match_type", PHRASE);
    SIGIR19_BEST_2018.put("neg_boost_multi_match_type", PHRASE);
    SIGIR19_BEST_2018.put("pos_boost_multi_match_type", PHRASE);
    SIGIR19_BEST_2018.put("dgi_multi_match_type", PHRASE);

    // Boolean operators per query clause.
    SIGIR19_BEST_2018.put("dis_operator", "or");
    SIGIR19_BEST_2018.put("dis_prefterm_operator", "or");
    SIGIR19_BEST_2018.put("dis_syn_operator", "or");
    SIGIR19_BEST_2018.put("dis_hyper_operator", "or");
    SIGIR19_BEST_2018.put("gene_operator", "or");
    SIGIR19_BEST_2018.put("gene_syn_operator", "or");
    SIGIR19_BEST_2018.put("gene_hyper_operator", "or");
    SIGIR19_BEST_2018.put("gene_desc_operator", "or");
    SIGIR19_BEST_2018.put("cancer_operator", "or");
    SIGIR19_BEST_2018.put("dna_operator", "or");

    SIGIR19_BEST_2018.put("phrase_slop", "10");

    // Index field holding the PM classifier output used by this parameter set.
    SIGIR19_BEST_2018.put("pm_class_field", "pmclass2018lstmgru.keyword");
}

/**
 * Parameter set {@code SIGIR19_HPIPUBNONE_WEIGHTS}: starts from the entries of
 * {@link #TREC_2018_HPIPUBNONE} and overrides the multi-match types, operators,
 * phrase slop and PM classifier field listed in the initializer below.
 * <p>
 * The map is initialised from a <em>copy</em> of {@code TREC_2018_HPIPUBNONE}, so the
 * overrides do not leak back into that constant. All overrides are written to this
 * map; writing them to {@code SIGIR19_BEST_2018} would overwrite that parameter set
 * and leave this one without any of its overrides.
 */
public static final Map<String, String> SIGIR19_HPIPUBNONE_WEIGHTS = new HashMap<>(TREC_2018_HPIPUBNONE);
static {
    // Multi-match types per query clause.
    SIGIR19_HPIPUBNONE_WEIGHTS.put("dis_multi_match_type", PHRASE);
    SIGIR19_HPIPUBNONE_WEIGHTS.put("dis_prefterm_multi_match_type", PHRASE);
    SIGIR19_HPIPUBNONE_WEIGHTS.put("dis_syn_multi_match_type", PHRASE);
    SIGIR19_HPIPUBNONE_WEIGHTS.put("dis_hyper_multi_match_type", PHRASE);
    SIGIR19_HPIPUBNONE_WEIGHTS.put("gene_multi_match_type", BEST_FIELDS);
    SIGIR19_HPIPUBNONE_WEIGHTS.put("gene_syn_multi_match_type", PHRASE);
    SIGIR19_HPIPUBNONE_WEIGHTS.put("gene_desc_multi_match_type", BEST_FIELDS);
    SIGIR19_HPIPUBNONE_WEIGHTS.put("gene_hyper_multi_match_type", PHRASE);
    SIGIR19_HPIPUBNONE_WEIGHTS.put("cancer_multi_match_type", BEST_FIELDS);
    SIGIR19_HPIPUBNONE_WEIGHTS.put("dna_multi_match_type", BEST_FIELDS);
    SIGIR19_HPIPUBNONE_WEIGHTS.put("neg_boost_multi_match_type", BEST_FIELDS);
    SIGIR19_HPIPUBNONE_WEIGHTS.put("pos_boost_multi_match_type", BEST_FIELDS);
    SIGIR19_HPIPUBNONE_WEIGHTS.put("dgi_multi_match_type", BEST_FIELDS);

    // Boolean operators per query clause.
    SIGIR19_HPIPUBNONE_WEIGHTS.put("dis_operator", "or");
    SIGIR19_HPIPUBNONE_WEIGHTS.put("dis_prefterm_operator", "or");
    SIGIR19_HPIPUBNONE_WEIGHTS.put("dis_syn_operator", "or");
    SIGIR19_HPIPUBNONE_WEIGHTS.put("dis_hyper_operator", "or");
    SIGIR19_HPIPUBNONE_WEIGHTS.put("gene_operator", "or");
    SIGIR19_HPIPUBNONE_WEIGHTS.put("gene_syn_operator", "or");
    SIGIR19_HPIPUBNONE_WEIGHTS.put("gene_hyper_operator", "or");
    SIGIR19_HPIPUBNONE_WEIGHTS.put("gene_desc_operator", "or");
    SIGIR19_HPIPUBNONE_WEIGHTS.put("cancer_operator", "or");
    SIGIR19_HPIPUBNONE_WEIGHTS.put("dna_operator", "or");

    SIGIR19_HPIPUBNONE_WEIGHTS.put("phrase_slop", "10");

    // Index field holding the PM classifier output used by this parameter set.
    SIGIR19_HPIPUBNONE_WEIGHTS.put("pm_class_field", "pmclass2018lstmgru.keyword");
}

private SigirParameters() {
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,13 @@ public static void main(String[] args) {
if (what.equals("genedis")) {
List<Map<String, String>> parameters = new ArrayList<>();
List<String> suffixes = new ArrayList<>();
for (double genb = 1; genb < 3; genb += .5) {
for (double descb = 1; descb < 3; descb += .5) {
for (double gsynb = 1; gsynb < 3; gsynb += .5) {
for (double disb = 1; disb < 3; disb += .5) {
for (double dsynb = 1; dsynb < 3; gsynb += .5) {
int num = 0;
for (double genb = 1; genb < 2.5; genb += .5) {
for (double descb = 1; descb < 2.5; descb += .5) {
for (double gsynb = 1; gsynb < 2.5; gsynb += .5) {
for (double disb = 1; disb < 2.5; disb += .5) {
for (double dsynb = 1; dsynb < 2.5; dsynb += .5) {
System.out.println(num++);
Map<String, String> paramcombination = new HashMap<>(templateProperties);
paramcombination.put("gene_boost", String.valueOf(genb));
paramcombination.put("gene_desc_boost", String.valueOf(descb));
Expand All @@ -64,10 +66,10 @@ public static void main(String[] args) {
List<Map<String, String>> parameters = new ArrayList<>();
List<String> suffixes = new ArrayList<>();
for (double titb = 1; titb < 3; titb += .5) {
for (double abstrb = 1; abstrb < 3; abstrb += .5) {
for (double kwb = 1; kwb < 3; kwb += .5) {
for (double meshb = 1; meshb < 3; meshb += .5) {
for (double genesb = 1; genesb < 3; genesb += .5) {
for (double abstrb = 1; abstrb < 2.5; abstrb += .5) {
for (double kwb = 1; kwb < 2.5; kwb += .5) {
for (double meshb = 1; meshb < 2.5; meshb += .5) {
for (double genesb = 1; genesb < 2.5; genesb += .5) {
Map<String, String> paramcombination = new HashMap<>(templateProperties);
paramcombination.put("title_boost", "^" + titb);
paramcombination.put("abstract_boost", "^" + abstrb);
Expand Down Expand Up @@ -136,14 +138,31 @@ public static void main(String[] args) {
suffixes.add(suffix);
}
runExperimentsWithParameters("/templates/sigir19_experiments_biomed/mutations.json", parameters, suffixes, year, what, goldStandard, target);
}else if (what.equals("drug")) {
List<Map<String, String>> parameters = new ArrayList<>();
List<String> suffixes = new ArrayList<>();
for (double extrab = .5; extrab <= 3; extrab += .5) {
Map<String, String> paramcombination = new HashMap<>(templateProperties);
paramcombination.put("dgi_boost", String.valueOf(extrab));
String suffix = "--dgi" + df.format(extrab);
parameters.add(paramcombination);
suffixes.add(suffix);
}
runExperimentsWithParameters("/templates/sigir19_experiments_biomed/dgi.json", parameters, suffixes, year, what, goldStandard, target);
} else if (what.equals("pmclass")) {
List<Map<String, String>> parameters = new ArrayList<>();
List<String> suffixes = new ArrayList<>();
final List<String> pmfields = Arrays.asList("pmclass2017lstm.keyword",
"pmclass2017lstmatt.keyword",
// final List<String> pmfields = Arrays.asList("pmclass2017lstm.keyword",
// "pmclass2017lstmatt.keyword",
// "pmclass2017lstmgru.keyword",
// "pmclass2018lstm.keyword",
// "pmclass2018lstmatt.keyword",
// "pmclass2018lstmgru.keyword",
// "pmclass2017.keyword",
// "pmclass2018.keyword");
// We decided for the paper to only use the LogReg and GRU approaches, thus we don't need to optimize the others
final List<String> pmfields = Arrays.asList(
"pmclass2017lstmgru.keyword",
"pmclass2018lstm.keyword",
"pmclass2018lstmatt.keyword",
"pmclass2018lstmgru.keyword",
"pmclass2017.keyword",
"pmclass2018.keyword");
Expand All @@ -152,12 +171,12 @@ public static void main(String[] args) {
Map<String, String> paramcombination = new HashMap<>(templateProperties);
paramcombination.put("pm_boost", String.valueOf(extrab));
paramcombination.put("pm_class_field", pmfield);
String suffix = "--pm" + df.format(extrab) + "-pmf:";
String suffix = "--pm" + df.format(extrab) + "-pmf:"+pmfield;
parameters.add(paramcombination);
suffixes.add(suffix);
}
}
runExperimentsWithParameters("/templates/sigir19_pmclass_biomed", parameters, suffixes, year, what, goldStandard, target);
runExperimentsWithParameters("/templates/sigir19_pmclass_biomed", parameters, suffixes, year, what+"boostopt", goldStandard, target);

} else throw new IllegalStateException("Unknown mode " + what);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,9 @@ public static void main(String[] args) {
templateProperties.put("phrase_slop", "10");


final List<String> pmfields = Arrays.asList("pmclass2017lstm.keyword",
"pmclass2017lstmatt.keyword",
// We decided for the paper to only use the LogReg and GRU approaches, thus we don't need to optimize the others
final List<String> pmfields = Arrays.asList(
"pmclass2017lstmgru.keyword",
"pmclass2018lstm.keyword",
"pmclass2018lstmatt.keyword",
"pmclass2018lstmgru.keyword",
"pmclass2017.keyword",
"pmclass2018.keyword");
Expand Down

This file was deleted.

Loading

0 comments on commit 85a3732

Please sign in to comment.