-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added literature_update.sh and reset_missing_documents.sh. Check the comments in the scripts for details.
- Loading branch information
Showing
3 changed files
with
149 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
#!/bin/bash
# Script that runs in an infinite loop and updates the GePI interaction index.
# CAUTION: This script is written for the specific situation in the JulieLab. You should adapt it for your environment.
# Takes one parameter:
# 1: Path to a file that defines the following environment variables:
#    - GEPI_REPO_DIR: Directory to the GePI Maven project root (i.e. the gepi/ directory within the gepi repository)
#    - GEPI_PREPROCESSING_PM: Path to the JCoRe Pipeline performing the NLP preprocessing of PubMed for GePI (e.g. the one at gepi-preprocessing/pubmed/preprocessing of this repository)
#    - GEPI_PREPROCESSING_PMC: Path to the JCoRe Pipeline performing the NLP preprocessing of PMC for GePI (e.g. the one at gepi-preprocessing/pmc/preprocessing of this repository)
#    - GEPI_INDEXING_PM: Path to the JCoRe Pipeline performing the ElasticSearch indexing of PubMed for GePI (e.g. the one at gepi-indexing/gepi-indexing-pubmed)
#    - GEPI_INDEXING_PMC: Path to the JCoRe Pipeline performing the ElasticSearch indexing of PMC for GePI (e.g. the one at gepi-indexing/gepi-indexing-pmc)
set -e
# Stop via CTRL-Z followed by "kill %%"
# Load the environment variables described above from the file given as first argument.
source "$1"
# Used as an (intentionally word-split) command prefix to log resource usage of the long-running steps.
TIME_CMD="/usr/bin/time -v"

# Minimum time between the start of two update runs: a day.
SECONDS_BETWEEN_UPDATES=86400
# For security reasons, our GePI ElasticSearch does not accept remote
# connections. Thus, we need to tunnel to it for indexing.
# (Re-)creates SSH tunnels to the ES HTTP port (9200 -> local 9201) and the
# ES transport port (9300 -> local 9301). Any already-running tunnel with the
# same command line is terminated first so we always end up with exactly one
# fresh tunnel per port.
function tunnel_to_es() {
    local tunnel_cmd
    for tunnel_cmd in \
        'ssh -i ~/.ssh/id_rsa -4 -f -N -L 9201:localhost:9200 gepi-vm' \
        'ssh -i ~/.ssh/id_rsa -4 -f -N -L 9301:localhost:9300 gepi-vm'; do
        # pkill -f matches the full command line and handles multiple matching
        # PIDs (the old pgrep+kill combination broke when pgrep returned more
        # than one PID). "|| true" keeps set -e happy when no tunnel exists.
        pkill -f "${tunnel_cmd}" || true
        # Intentionally unquoted: the command string must be word-split into
        # the ssh executable and its arguments.
        ${tunnel_cmd}
    done
}
|
||
# Do the update again and again. There will be at least $SECONDS_BETWEEN_UPDATES between two runs.
while true
do
    START_TIME=$(date +%s)

    # Import new documents from the downloaded XML into the JeDIS Postgres database.
    cd "$HOME/bin"
    echo "[LitUpdate] Importing new PubMed XML documents into the database $(date)"
    # NOTE(review): PubMed uses -im while PMC below uses -ip — presumably the
    # Medline vs. PMC import modes of CoStoSys; confirm against the CoStoSys CLI docs.
    $TIME_CMD java -jar costosys.jar -dbc "$GEPI_PREPROCESSING_PM/config/costosys.xml" -im "$GEPI_PREPROCESSING_PM/../pubmedImport.xml"
    echo "[LitUpdate] Finished importing new PubMed XML documents into the database $(date)"
    echo "[LitUpdate] Importing new PMC XML documents into the database $(date)"
    $TIME_CMD java -jar costosys.jar -dbc "$GEPI_PREPROCESSING_PMC/config/costosys.xml" -ip "$GEPI_PREPROCESSING_PMC/../pmcImport.xml"
    echo "[LitUpdate] Finished importing new PMC XML documents into the database $(date)"

    # Run the NLP processing
    echo "[LitUpdate] Running PubMed preprocessing $(date)"
    cd "$GEPI_PREPROCESSING_PM"
    $TIME_CMD ./run.sh
    echo "[LitUpdate] Finished running PubMed preprocessing $(date)"
    echo "[LitUpdate] Running PMC preprocessing $(date)"
    cd "$GEPI_PREPROCESSING_PMC"
    $TIME_CMD ./run.sh
    echo "[LitUpdate] Finished running PMC preprocessing $(date)"

    # Run the indexing
    # Reset documents that have been stuck in "in_process" for some reason (e.g. broken ES tunnel in last processing)
    java -jar "$HOME/bin/costosys.jar" -dbc "$GEPI_PREPROCESSING_PM/config/costosys.xml" -re gepi._documents_mirror -np
    # Open tunnel to ES
    tunnel_to_es
    echo "[LitUpdate] Running PubMed indexing $(date)"
    cd "$GEPI_INDEXING_PM"
    $TIME_CMD ./run.sh
    echo "[LitUpdate] Finished running PubMed indexing $(date)"
    echo "[LitUpdate] Running PMC indexing $(date)"
    # Reset documents that have been stuck in "in_process" for some reason (e.g. broken ES tunnel in last processing)
    java -jar "$HOME/bin/costosys.jar" -dbc "$GEPI_PREPROCESSING_PMC/config/costosys.xml" -re gepi._documents_mirror -np
    # Reset the tunnel or re-create it if it collapsed before
    tunnel_to_es
    cd "$GEPI_INDEXING_PMC"
    $TIME_CMD ./run.sh
    echo "[LitUpdate] Finished running PMC indexing $(date)"

    # Sleep so that two runs start at least $SECONDS_BETWEEN_UPDATES apart.
    END_TIME=$(date +%s)
    ELAPSED_TIME=$((END_TIME - START_TIME))
    echo "[LitUpdate] Updated PubMed and PMC literature from XML to index in $ELAPSED_TIME seconds. $(date)"
    if [ "$ELAPSED_TIME" -lt "$SECONDS_BETWEEN_UPDATES" ]; then
        SLEEP_TIME=$((SECONDS_BETWEEN_UPDATES - ELAPSED_TIME))
        echo "[LitUpdate] Sleeping for $SLEEP_TIME seconds before starting next update. $(date)"
        sleep "$SLEEP_TIME"
    else
        echo "[LitUpdate] Update took longer than the time between update runs. Starting with a new update. $(date)"
    fi
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/bin/bash
# This script checks if there are documents in the JeDIS Postgres database which are missing from the
# GePI ElasticSearch interaction index. Such documents should be reset for new processing in the JeDIS database.
# Takes one parameter:
# 1. Path to a file that defines the following environment variables:
#    - DBNAME_PUBMED: Name of the Postgres database where the PubMed JeDIS data for GePI is located.
#    - USER_PUBMED: Username for the PubMed JeDIS (Postgres) database.
#    - PASSWORD_PUBMED: Password for the PubMed JeDIS (Postgres) database.
#    - HOST_PUBMED: Host of the PubMed JeDIS (Postgres) database.
#    - PORT_PUBMED: Port of the PubMed JeDIS (Postgres) database.
#    - DBNAME_PMC: Name of the Postgres database where the PMC JeDIS data for GePI is located.
#    - USER_PMC: Username for the PMC JeDIS (Postgres) database.
#    - PASSWORD_PMC: Password for the PMC JeDIS (Postgres) database.
#    - HOST_PMC: Host of the PMC JeDIS (Postgres) database.
#    - PORT_PMC: Port of the PMC JeDIS (Postgres) database.
#    - ES_INDEX: Name of the GePI ElasticSearch interaction index
#    - (Optional) ES_URL: The URL to ElasticSearch. Defaults to http://localhost:9201
# The parameter is optional and defaults to ~/.gepi-validation (the previously
# hard-coded location, kept for backwards compatibility).
source "${1:-$HOME/.gepi-validation}"
export PGPASSWORD=$PASSWORD_PUBMED
echo "Writing PubMed IDs with EventMentions in the JeDIS database to pmid_pg.txt"
psql -qtA -h "$HOST_PUBMED" -p "$PORT_PUBMED" -U "$USER_PUBMED" "$DBNAME_PUBMED" -c "SELECT pmid FROM _data_xmi.documents WHERE gnormplusbiosem\$de_julielab_jcore_types_eventmention IS NOT NULL" > pmid_pg.txt
export PGPASSWORD=$PASSWORD_PMC
echo "Writing PMC IDs with EventMentions in the JeDIS database to pmcid_pg.txt"
psql -qtA -h "$HOST_PMC" -p "$PORT_PMC" -U "$USER_PMC" "$DBNAME_PMC" -c "SELECT pmcid FROM _data_xmi.documents WHERE gnormplusbiosem\$de_julielab_jcore_types_eventmention IS NOT NULL" > pmcid_pg.txt
|
||
# This script pulls the document IDs from ElasticSearch and Postgres in an effort to make sure that every
# document in the JeDIS database (Postgres) arrived in ElasticSearch.
# Default only applies when ES_URL is unset or empty (same semantics as the previous -z test).
ES_URL="${ES_URL:-http://localhost:9201}"
# Ask ES for one terms aggregation per ID field; the oversized "size" pulls all buckets at once.
# The header is passed directly to curl instead of through an unquoted helper
# variable that relied on fragile word-splitting.
curl -XPOST "$ES_URL/$ES_INDEX/_search" -H Content-Type:application/json -d '{
  "query": {
    "match_all": {}
  },
  "aggs": {
    "pmids": {
      "terms": {
        "field": "pmid",
        "size": 10000000
      }
    },
    "pmcids": {
      "terms": {
        "field": "pmcid",
        "size": 10000000
      }
    }
  }
}' > es_docid_aggregation.json
# NOTE(review): extracting IDs from JSON with grep is brittle (depends on key
# ordering and quoting); jq would be more robust but would add a dependency.
# The first pattern requires a digit right after 'key":"', so purely numeric
# PMIDs and 'PMC...'-prefixed PMC IDs do not cross-match.
grep -oE 'key":"[0-9]+' es_docid_aggregation.json | grep -oE '[0-9]+' > pmid_es.txt
grep -oE 'key":"PMC[0-9]+' es_docid_aggregation.json | grep -oE 'PMC[0-9]+' > pmcid_es.txt
|
||
echo "PubMed: Got $(wc -l < pmid_pg.txt) IDs from Postgres and $(wc -l < pmid_es.txt) from ElasticSearch"
echo "PMC: Got $(wc -l < pmcid_pg.txt) IDs from Postgres and $(wc -l < pmcid_es.txt) from ElasticSearch"

# Determine the IDs present in Postgres but absent from ElasticSearch.
# $1: ID file prefix ("pmid" or "pmcid"); reads ${1}_pg.txt and ${1}_es.txt
# and writes the IDs missing from ElasticSearch to ${1}_missing.txt.
# BUGFIX: the previous 'cat es pg | sort | uniq' computed the UNION of both
# ID sets, not the set difference. comm -13 prints only the lines unique to
# the second (Postgres) input; sort -u provides the sorted, deduplicated
# inputs that comm requires.
compute_missing_ids() {
  local prefix=$1
  comm -13 <(sort -u "${prefix}_es.txt") <(sort -u "${prefix}_pg.txt") > "${prefix}_missing.txt"
}

compute_missing_ids pmid
compute_missing_ids pmcid

echo "Missing PubMed: Got $(wc -l < pmid_missing.txt) doc IDs that are in Postgres but missing from ElasticSearch"
echo "Missing PMC: Got $(wc -l < pmcid_missing.txt) doc IDs that are in Postgres but missing from ElasticSearch"
|