-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added literature_update.sh and reset_missing_documents.sh. Check the comments in the scripts for details.
- Loading branch information
Showing
3 changed files
with
149 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
#!/bin/bash
# Script that runs in an infinite loop and updates the GePI interaction index.
# CAUTION: This script is written for the specific situation in the JulieLab. You should adapt it for your environment.
# Takes one parameter:
# 1: Path to a file that defines the following environment variables:
#    - GEPI_REPO_DIR: Directory to the GePI Maven project root (i.e. the gepi/ directory within the gepi repository)
#    - GEPI_PREPROCESSING_PM: Path to the JCoRe Pipeline performing the NLP preprocessing of PubMed for GePI (e.g. the one at gepi-preprocessing/pubmed/preprocessing of this repository)
#    - GEPI_PREPROCESSING_PMC: Path to the JCoRe Pipeline performing the NLP preprocessing of PMC for GePI (e.g. the one at gepi-preprocessing/pmc/preprocessing of this repository)
#    - GEPI_INDEXING_PM: Path to the JCoRe Pipeline performing the ElasticSearch indexing of PubMed for GePI (e.g. the one at gepi-indexing/gepi-indexing-pubmed)
#    - GEPI_INDEXING_PMC: Path to the JCoRe Pipeline performing the ElasticSearch indexing of PMC for GePI (e.g. the one at gepi-indexing/gepi-indexing-pmc)
set -e
# Stop via CTRL-Z followed by "kill %%"
# Load the environment variables described above from the file given as first argument.
source "$1"
# Used as an (intentionally word-split) command prefix to log resource usage of the long-running steps.
TIME_CMD="/usr/bin/time -v"

# Minimum time between the start of two update runs: a day.
SECONDS_BETWEEN_UPDATES=86400
# For security reasons, our GePI ElasticSearch does not accept remote
# connections. Thus, we need to tunnel to it for indexing.
# (Re-)creates SSH tunnels to the ES HTTP port (9200 -> local 9201) and the
# ES transport port (9300 -> local 9301). Any already-running tunnel with the
# same command line is terminated first so we always end up with exactly one
# fresh tunnel per port.
function tunnel_to_es() {
    local tunnel_cmd
    for tunnel_cmd in \
        'ssh -i ~/.ssh/id_rsa -4 -f -N -L 9201:localhost:9200 gepi-vm' \
        'ssh -i ~/.ssh/id_rsa -4 -f -N -L 9301:localhost:9300 gepi-vm'; do
        # pkill -f matches the full command line and handles multiple matching
        # PIDs (the old pgrep+kill combination broke when pgrep returned more
        # than one PID). "|| true" keeps set -e happy when no tunnel exists.
        pkill -f "${tunnel_cmd}" || true
        # Intentionally unquoted: the command string must be word-split into
        # the ssh executable and its arguments.
        ${tunnel_cmd}
    done
}
|
||
# Do the update again and again. There will be at least $SECONDS_BETWEEN_UPDATES between two runs.
while true
do
    START_TIME=$(date +%s)

    # Import new documents from the downloaded XML into the JeDIS Postgres database.
    cd "$HOME/bin"
    echo "[LitUpdate] Importing new PubMed XML documents into the database $(date)"
    # NOTE(review): PubMed uses -im while PMC below uses -ip — presumably the
    # Medline vs. PMC import modes of CoStoSys; confirm against the CoStoSys CLI docs.
    $TIME_CMD java -jar costosys.jar -dbc "$GEPI_PREPROCESSING_PM/config/costosys.xml" -im "$GEPI_PREPROCESSING_PM/../pubmedImport.xml"
    echo "[LitUpdate] Finished importing new PubMed XML documents into the database $(date)"
    echo "[LitUpdate] Importing new PMC XML documents into the database $(date)"
    $TIME_CMD java -jar costosys.jar -dbc "$GEPI_PREPROCESSING_PMC/config/costosys.xml" -ip "$GEPI_PREPROCESSING_PMC/../pmcImport.xml"
    echo "[LitUpdate] Finished importing new PMC XML documents into the database $(date)"

    # Run the NLP processing
    echo "[LitUpdate] Running PubMed preprocessing $(date)"
    cd "$GEPI_PREPROCESSING_PM"
    $TIME_CMD ./run.sh
    echo "[LitUpdate] Finished running PubMed preprocessing $(date)"
    echo "[LitUpdate] Running PMC preprocessing $(date)"
    cd "$GEPI_PREPROCESSING_PMC"
    $TIME_CMD ./run.sh
    echo "[LitUpdate] Finished running PMC preprocessing $(date)"

    # Run the indexing
    # Reset documents that have been stuck in "in_process" for some reason (e.g. broken ES tunnel in last processing)
    java -jar "$HOME/bin/costosys.jar" -dbc "$GEPI_PREPROCESSING_PM/config/costosys.xml" -re gepi._documents_mirror -np
    # Open tunnel to ES
    tunnel_to_es
    echo "[LitUpdate] Running PubMed indexing $(date)"
    cd "$GEPI_INDEXING_PM"
    $TIME_CMD ./run.sh
    echo "[LitUpdate] Finished running PubMed indexing $(date)"
    echo "[LitUpdate] Running PMC indexing $(date)"
    # Reset documents that have been stuck in "in_process" for some reason (e.g. broken ES tunnel in last processing)
    java -jar "$HOME/bin/costosys.jar" -dbc "$GEPI_PREPROCESSING_PMC/config/costosys.xml" -re gepi._documents_mirror -np
    # Reset the tunnel or re-create it if it collapsed before
    tunnel_to_es
    cd "$GEPI_INDEXING_PMC"
    $TIME_CMD ./run.sh
    echo "[LitUpdate] Finished running PMC indexing $(date)"

    # Sleep so that two runs start at least $SECONDS_BETWEEN_UPDATES apart.
    END_TIME=$(date +%s)
    ELAPSED_TIME=$((END_TIME - START_TIME))
    echo "[LitUpdate] Updated PubMed and PMC literature from XML to index in $ELAPSED_TIME seconds. $(date)"
    if [ "$ELAPSED_TIME" -lt "$SECONDS_BETWEEN_UPDATES" ]; then
        SLEEP_TIME=$((SECONDS_BETWEEN_UPDATES - ELAPSED_TIME))
        echo "[LitUpdate] Sleeping for $SLEEP_TIME seconds before starting next update. $(date)"
        sleep "$SLEEP_TIME"
    else
        echo "[LitUpdate] Update took longer than the time between update runs. Starting with a new update. $(date)"
    fi
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/bin/bash
# This script checks if there are documents in the JeDIS Postgres database which are missing from the
# GePI ElasticSearch interaction index. Such documents should be reset for new processing in the JeDIS database.
# Takes one parameter:
# 1. Path to a file that defines the following environment variables:
#    - DBNAME_PUBMED: Name of the Postgres database where the PubMed JeDIS data for GePI is located.
#    - USER_PUBMED: Username for the PubMed JeDIS (Postgres) database.
#    - PASSWORD_PUBMED: Password for the PubMed JeDIS (Postgres) database.
#    - HOST_PUBMED: Host of the PubMed JeDIS (Postgres) database.
#    - PORT_PUBMED: Port of the PubMed JeDIS (Postgres) database.
#    - DBNAME_PMC: Name of the Postgres database where the PMC JeDIS data for GePI is located.
#    - USER_PMC: Username for the PMC JeDIS (Postgres) database.
#    - PASSWORD_PMC: Password for the PMC JeDIS (Postgres) database.
#    - HOST_PMC: Host of the PMC JeDIS (Postgres) database.
#    - PORT_PMC: Port of the PMC JeDIS (Postgres) database.
#    - ES_INDEX: Name of the GePI ElasticSearch interaction index
#    - (Optional) ES_URL: The URL to ElasticSearch. Defaults to http://localhost:9201
# The parameter is optional and defaults to ~/.gepi-validation (the previously
# hard-coded location, kept for backwards compatibility).
source "${1:-$HOME/.gepi-validation}"
export PGPASSWORD=$PASSWORD_PUBMED
echo "Writing PubMed IDs with EventMentions in the JeDIS database to pmid_pg.txt"
psql -qtA -h "$HOST_PUBMED" -p "$PORT_PUBMED" -U "$USER_PUBMED" "$DBNAME_PUBMED" -c "SELECT pmid FROM _data_xmi.documents WHERE gnormplusbiosem\$de_julielab_jcore_types_eventmention IS NOT NULL" > pmid_pg.txt
export PGPASSWORD=$PASSWORD_PMC
echo "Writing PMC IDs with EventMentions in the JeDIS database to pmcid_pg.txt"
psql -qtA -h "$HOST_PMC" -p "$PORT_PMC" -U "$USER_PMC" "$DBNAME_PMC" -c "SELECT pmcid FROM _data_xmi.documents WHERE gnormplusbiosem\$de_julielab_jcore_types_eventmention IS NOT NULL" > pmcid_pg.txt
|
||
# This script pulls the document IDs from ElasticSearch and Postgres in an effort to make sure that every
# document in the JeDIS database (Postgres) arrived in ElasticSearch.
# Default only applies when ES_URL is unset or empty (same semantics as the previous -z test).
ES_URL="${ES_URL:-http://localhost:9201}"
# Ask ES for one terms aggregation per ID field; the oversized "size" pulls all buckets at once.
# The header is passed directly to curl instead of through an unquoted helper
# variable that relied on fragile word-splitting.
curl -XPOST "$ES_URL/$ES_INDEX/_search" -H Content-Type:application/json -d '{
  "query": {
    "match_all": {}
  },
  "aggs": {
    "pmids": {
      "terms": {
        "field": "pmid",
        "size": 10000000
      }
    },
    "pmcids": {
      "terms": {
        "field": "pmcid",
        "size": 10000000
      }
    }
  }
}' > es_docid_aggregation.json
# NOTE(review): extracting IDs from JSON with grep is brittle (depends on key
# ordering and quoting); jq would be more robust but would add a dependency.
# The first pattern requires a digit right after 'key":"', so purely numeric
# PMIDs and 'PMC...'-prefixed PMC IDs do not cross-match.
grep -oE 'key":"[0-9]+' es_docid_aggregation.json | grep -oE '[0-9]+' > pmid_es.txt
grep -oE 'key":"PMC[0-9]+' es_docid_aggregation.json | grep -oE 'PMC[0-9]+' > pmcid_es.txt
|
||
echo "PubMed: Got $(wc -l < pmid_pg.txt) IDs from Postgres and $(wc -l < pmid_es.txt) from ElasticSearch"
echo "PMC: Got $(wc -l < pmcid_pg.txt) IDs from Postgres and $(wc -l < pmcid_es.txt) from ElasticSearch"

# Determine the IDs present in Postgres but absent from ElasticSearch.
# $1: ID file prefix ("pmid" or "pmcid"); reads ${1}_pg.txt and ${1}_es.txt
# and writes the IDs missing from ElasticSearch to ${1}_missing.txt.
# BUGFIX: the previous 'cat es pg | sort | uniq' computed the UNION of both
# ID sets, not the set difference. comm -13 prints only the lines unique to
# the second (Postgres) input; sort -u provides the sorted, deduplicated
# inputs that comm requires.
compute_missing_ids() {
  local prefix=$1
  comm -13 <(sort -u "${prefix}_es.txt") <(sort -u "${prefix}_pg.txt") > "${prefix}_missing.txt"
}

compute_missing_ids pmid
compute_missing_ids pmcid

echo "Missing PubMed: Got $(wc -l < pmid_missing.txt) doc IDs that are in Postgres but missing from ElasticSearch"
echo "Missing PMC: Got $(wc -l < pmcid_missing.txt) doc IDs that are in Postgres but missing from ElasticSearch"
|