From 7af800aacc22af1669aaa4a160606fde092d3a13 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 6 Sep 2024 16:12:33 +0200 Subject: [PATCH] Add maintenance scripts. literature_update.sh and reset_missing_documents.sh. Check the comments in the scripts for details. --- gepi/.gitignore | 1 - gepi/literature_update.sh | 87 +++++++++++++++++++++++++++++++++ gepi/reset_missing_documents.sh | 62 +++++++++++++++++++++++ 3 files changed, 149 insertions(+), 1 deletion(-) create mode 100755 gepi/literature_update.sh create mode 100644 gepi/reset_missing_documents.sh diff --git a/gepi/.gitignore b/gepi/.gitignore index 84eea1e5..9396a40d 100644 --- a/gepi/.gitignore +++ b/gepi/.gitignore @@ -13,4 +13,3 @@ gepi-indexing-pipeline/lib gepi-indexing-pipeline/resources gepi-indexing-pipeline/config/jcore-pipeline-config.jar missing_configuration.txt -literature_update.sh diff --git a/gepi/literature_update.sh b/gepi/literature_update.sh new file mode 100755 index 00000000..454f3fcd --- /dev/null +++ b/gepi/literature_update.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# Script that runs in an infinite loop and updates the GePI interaction index. +# CAUTION: This script is written for the specific situation in the JulieLab. You should adapt it for your environment. +# Takes one parameter: +# 1: Path to a file that defines the following environment variables: +# - GEPI_REPO_DIR: Directory to the GePI Maven project root (i.e. the gepi/ directory within the gepi repository) +# - GEPI_PREPROCESSING_PM: Path to the JCoRe Pipeline performing the NLP preprocessing of PubMed for GePI (e.g. the one at gepi-preprocessing/pubmed/preprocessing of this repository) +# - GEPI_PREPROCESSING_PMC: Path to the JCoRe Pipeline performing the NLP preprocessing of PMC for GePI (e.g. the one at gepi-preprocessing/pmc/preprocessing of this repository) +# - GEPI_INDEXING_PM: Path to the JCoRe Pipeline performing the ElasticSearch indexing of PubMed for GePI (e.g. 
the one at gepi-indexing/gepi-indexing-pubmed)
# - GEPI_INDEXING_PMC: Path to the JCoRe Pipeline performing the ElasticSearch indexing of PMC for GePI (e.g. the one at gepi-indexing/gepi-indexing-pmc)
set -e
# Stop via CTRL-Z followed by "kill %%"
# Load the environment variable definitions described in the header comment.
source "$1"
# Prints wall-clock time and peak memory usage for each pipeline step.
# NOTE: used unquoted below on purpose so it word-splits into command + flag.
TIME_CMD="/usr/bin/time -v"

# Minimum number of seconds between the starts of two update runs: a day.
SECONDS_BETWEEN_UPDATES=86400

# For security reasons, our GePI ElasticSearch does not accept remote
# connections. Thus, we need to tunnel to it for indexing.
# (Re-)creates the local port forwards 9201 -> gepi-vm:9200 (HTTP) and
# 9301 -> gepi-vm:9300 (transport). A previously running tunnel with the
# identical command line is killed first, so a collapsed tunnel gets
# replaced instead of duplicated.
function tunnel_to_es() {
    local tunnel_cmds=(
        'ssh -i ~/.ssh/id_rsa -4 -f -N -L 9201:localhost:9200 gepi-vm'
        'ssh -i ~/.ssh/id_rsa -4 -f -N -L 9301:localhost:9300 gepi-vm'
    )
    local cmd pid
    for cmd in "${tunnel_cmds[@]}"; do
        # pgrep returns non-zero when no such process exists; that is fine here.
        pid=$(pgrep -f "${cmd}" || true)
        if [ -n "$pid" ]; then
            # Unquoted on purpose: pgrep may report several PIDs, one per line.
            kill $pid
        fi
        # Unquoted on purpose: the command string must word-split to execute.
        ${cmd}
    done
}

# Do the update again and again. There will be at least $SECONDS_BETWEEN_UPDATES between two runs.
while true
do
    START_TIME=$(date +%s)

    cd "$HOME/bin"

    # Import the newest PubMed and PMC XML documents into the JeDIS database.
    echo "[LitUpdate] Importing new PubMed XML documents into the database $(date)"
    $TIME_CMD java -jar costosys.jar -dbc "$GEPI_PREPROCESSING_PM/config/costosys.xml" -im "$GEPI_PREPROCESSING_PM/../pubmedImport.xml"
    echo "[LitUpdate] Finished importing new PubMed XML documents into the database $(date)"
    echo "[LitUpdate] Importing new PMC XML documents into the database $(date)"
    $TIME_CMD java -jar costosys.jar -dbc "$GEPI_PREPROCESSING_PMC/config/costosys.xml" -ip "$GEPI_PREPROCESSING_PMC/../pmcImport.xml"
    echo "[LitUpdate] Finished importing new PMC XML documents into the database $(date)"

    # Run the NLP processing
    echo "[LitUpdate] Running PubMed preprocessing $(date)"
    cd "$GEPI_PREPROCESSING_PM"
    $TIME_CMD ./run.sh
    echo "[LitUpdate] Finished running PubMed preprocessing $(date)"
    echo "[LitUpdate] Running PMC preprocessing $(date)"
    cd "$GEPI_PREPROCESSING_PMC"
    $TIME_CMD ./run.sh
    echo "[LitUpdate] Finished running PMC preprocessing $(date)"

    # Run the indexing
    # Reset documents that have been stuck in "in_process" for some reason (e.g. broken ES tunnel in last processing)
    java -jar "$HOME/bin/costosys.jar" -dbc "$GEPI_PREPROCESSING_PM/config/costosys.xml" -re gepi._documents_mirror -np
    # Open tunnel to ES
    tunnel_to_es
    echo "[LitUpdate] Running PubMed indexing $(date)"
    cd "$GEPI_INDEXING_PM"
    $TIME_CMD ./run.sh
    echo "[LitUpdate] Finished running PubMed indexing $(date)"
    echo "[LitUpdate] Running PMC indexing $(date)"
    # Reset documents that have been stuck in "in_process" for some reason (e.g. broken ES tunnel in last processing)
    java -jar "$HOME/bin/costosys.jar" -dbc "$GEPI_PREPROCESSING_PMC/config/costosys.xml" -re gepi._documents_mirror -np
    # Reset the tunnel or re-create it if it collapsed before
    tunnel_to_es
    cd "$GEPI_INDEXING_PMC"
    $TIME_CMD ./run.sh
    echo "[LitUpdate] Finished running PMC indexing $(date)"

    END_TIME=$(date +%s)
    ELAPSED_TIME=$((END_TIME - START_TIME))
    echo "[LitUpdate] Updated PubMed and PMC literature from XML to index in $ELAPSED_TIME seconds. $(date)"
    if [ "$ELAPSED_TIME" -lt "$SECONDS_BETWEEN_UPDATES" ]; then
        SLEEP_TIME=$((SECONDS_BETWEEN_UPDATES - ELAPSED_TIME))
        echo "[LitUpdate] Sleeping for $SLEEP_TIME seconds before starting next update. $(date)"
        sleep "$SLEEP_TIME"
    else
        echo "[LitUpdate] Update took longer than the time between update runs. Starting with a new update. $(date)"
    fi
done
diff --git a/gepi/reset_missing_documents.sh b/gepi/reset_missing_documents.sh
new file mode 100644
index 00000000..968c0812
--- /dev/null
+++ b/gepi/reset_missing_documents.sh
@@ -0,0 +1,62 @@
#!/bin/bash
# This script checks if there are documents in the JeDIS Postgres database which are missing from the
# GePI ElasticSearch interaction index. Such documents should be reset for new processing in the JeDIS database.
# Takes one parameter:
# 1. Path to a file that defines the following environment variables:
# - DBNAME_PUBMED: Name of the Postgres database where the PubMed JeDIS data for GePI is located.
# - USER_PUBMED: Username for the PubMed JeDIS (Postgres) database.
# - PASSWORD_PUBMED: Password for the PubMed JeDIS (Postgres) database.
# - HOST_PUBMED: Host of the PubMed JeDIS (Postgres) database.
# - PORT_PUBMED: Port of the PubMed JeDIS (Postgres) database.
# - DBNAME_PMC: Name of the Postgres database where the PMC JeDIS data for GePI is located.
# - USER_PMC: Username for the PMC JeDIS (Postgres) database.
# - PASSWORD_PMC: Password for the PMC JeDIS (Postgres) database.
# - HOST_PMC: Host of the PMC JeDIS (Postgres) database.
# - PORT_PMC: Port of the PMC JeDIS (Postgres) database.
# - ES_INDEX: Name of the GePI ElasticSearch interaction index
# - (Optional) ES_URL: The URL to ElasticSearch. Defaults to http://localhost:9201

# Use the documented first parameter when given; fall back to the previously
# hard-coded location so existing invocations without arguments keep working.
source "${1:-$HOME/.gepi-validation}"

# Dump the IDs of all documents that carry EventMention annotations from the
# JeDIS (Postgres) databases.
export PGPASSWORD=$PASSWORD_PUBMED
echo "Writing PubMed IDs with EventMentions in the JeDIS database to pmid_pg.txt"
psql -qtA -h "$HOST_PUBMED" -p "$PORT_PUBMED" -U "$USER_PUBMED" "$DBNAME_PUBMED" -c "SELECT pmid FROM _data_xmi.documents WHERE gnormplusbiosem\$de_julielab_jcore_types_eventmention IS NOT NULL" > pmid_pg.txt
export PGPASSWORD=$PASSWORD_PMC
echo "Writing PMC IDs with EventMentions in the JeDIS database to pmcid_pg.txt"
psql -qtA -h "$HOST_PMC" -p "$PORT_PMC" -U "$USER_PMC" "$DBNAME_PMC" -c "SELECT pmcid FROM _data_xmi.documents WHERE gnormplusbiosem\$de_julielab_jcore_types_eventmention IS NOT NULL" > pmcid_pg.txt

# This script pulls the document IDs from ElasticSearch and Postgres in an effort to make sure that every
# document in the JeDIS database (Postgres) arrived in ElasticSearch.
if [ -z "$ES_URL" ]; then
    ES_URL="http://localhost:9201"
fi
# Fetch all PMIDs/PMCIDs present in the interaction index via terms aggregations.
curl -XPOST "$ES_URL/$ES_INDEX/_search" -H 'Content-Type:application/json' -d '{
  "query": {
    "match_all": {}
  },
  "aggs": {
    "pmids": {
      "terms": {
        "field": "pmid",
        "size": 10000000
      }
    },
    "pmcids": {
      "terms": {
        "field": "pmcid",
        "size": 10000000
      }
    }
  }
}' > es_docid_aggregation.json
# Extract the aggregation bucket keys; numeric keys are PMIDs, PMC-prefixed keys are PMCIDs.
grep -oE 'key":"[0-9]+' es_docid_aggregation.json | grep -oE '[0-9]+' > pmid_es.txt
grep -oE 'key":"PMC[0-9]+' es_docid_aggregation.json | grep -oE 'PMC[0-9]+' > pmcid_es.txt

echo "PubMed: Got $(wc -l < pmid_pg.txt) IDs from Postgres and $(wc -l < pmid_es.txt) from ElasticSearch"
echo "PMC: Got $(wc -l < pmcid_pg.txt) IDs from Postgres and $(wc -l < pmcid_es.txt) from ElasticSearch"

# The missing documents are the IDs that exist in Postgres but NOT in
# ElasticSearch: comm -13 prints only the lines unique to the second (sorted)
# input. (The previous "cat | sort | uniq" produced the UNION of both lists,
# which also contained every successfully indexed document.)
comm -13 <(sort -u pmid_es.txt) <(sort -u pmid_pg.txt) > pmid_missing.txt
comm -13 <(sort -u pmcid_es.txt) <(sort -u pmcid_pg.txt) > pmcid_missing.txt

echo "Missing PubMed: Got $(wc -l < pmid_missing.txt) unique doc IDs; assuming those are missing from ElasticSearch"
echo "Missing PMC: Got $(wc -l < pmcid_missing.txt) unique doc IDs; assuming those are missing from ElasticSearch"