Skip to content

Commit

Permalink
Lazily build ClinicalTrial objects (fixes bst-mug#20)
Browse files Browse the repository at this point in the history
This reduces memory consumption by not trying to keep all clinical trials in memory.
  • Loading branch information
michelole committed Jul 17, 2018
1 parent 675101e commit 853a225
Showing 1 changed file with 21 additions and 14 deletions.
35 changes: 21 additions & 14 deletions src/main/java/at/medunigraz/imi/bst/clinicaltrial/Indexing.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,18 @@
import org.apache.commons.lang3.StringEscapeUtils;
import org.elasticsearch.action.bulk.BulkProcessor;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.xcontent.XContentBuilder;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

Expand All @@ -28,26 +30,31 @@ public static void main(String[] args) throws Exception {
}

/**
 * Indexes every clinical trial file found under the given folder.
 *
 * <p>The folder tree is walked lazily: each regular file is parsed into a
 * {@code ClinicalTrial} and immediately queued on the bulk processor, so the
 * full set of trials is never held in memory at once.
 *
 * @param dataFolderWithFiles root folder whose regular files are parsed as clinical trials
 * @return the elapsed indexing time in milliseconds
 * @throws Exception if the folder cannot be walked, a trial cannot be converted to JSON,
 *                   or the wait for the bulk processor is interrupted
 */
static long indexAllClinicalTrials(String dataFolderWithFiles) throws Exception {
    System.out.println("STARTING INDEXING");

    long startTime = System.currentTimeMillis();

    BulkProcessor bulkProcessor = buildBuildProcessor();

    // Files.walk holds open directory handles until the stream is closed,
    // so it must be scoped with try-with-resources.
    try (Stream<Path> files = Files.walk(Paths.get(dataFolderWithFiles))) {
        files.filter(Files::isRegularFile).forEach(file -> {
            try {
                ClinicalTrial trial = getClinicalTrialFromFile(file.toString());
                System.out.println("ADDING: " + trial.id);

                bulkProcessor.add(new IndexRequest(TrecConfig.ELASTIC_CT_INDEX, TrecConfig.ELASTIC_CT_TYPE, trial.id)
                        .source(buildJson(trial)));
            } catch (IOException e) {
                // forEach cannot propagate checked exceptions; keep the cause attached.
                throw new RuntimeException(e);
            }
        });
    }

    // awaitClose returns false when the timeout elapses before all pending requests finish.
    boolean allRequestsCompleted = bulkProcessor.awaitClose(10, TimeUnit.MINUTES);
    if (!allRequestsCompleted) {
        System.out.println("WARNING: BULK PROCESSOR DID NOT FINISH WITHIN 10 MINUTES");
    }

    long indexingDuration = System.currentTimeMillis() - startTime;

    System.out.println("INDEXING TIME BULK: " + indexingDuration/1000 + " secs");

    return indexingDuration;
}
Expand All @@ -69,15 +76,15 @@ public void afterBulk(long executionId,
BulkRequest request,
BulkResponse response) {
if (response.hasFailures()) {
System.out.println("Failures!!!!");
throw new RuntimeException(response.buildFailureMessage());
}
}

/**
 * Listener callback that receives the {@code Throwable} raised for a bulk request:
 * prints it to standard output, then rethrows it wrapped in a {@code RuntimeException}.
 */
@Override
public void afterBulk(long executionId,
                      BulkRequest request,
                      Throwable failure) {
    String report = "Bulk failed and raised " + failure;
    System.out.println(report);
    throw new RuntimeException(failure);
}
})
// Let's stay with the defaults for a while
Expand Down Expand Up @@ -134,7 +141,7 @@ public static List<ClinicalTrial> getClinicalTrialsFromFolder(String dataFolderW
return(clinicalTrials);
}

public static ClinicalTrial getClinicalTrialFromFile(String xmlTrialFileName) throws IOException {
public static ClinicalTrial getClinicalTrialFromFile(String xmlTrialFileName) {

return(ClinicalTrial.fromXml(xmlTrialFileName));
}
Expand Down

0 comments on commit 853a225

Please sign in to comment.