From c39bf5a23718cc9164c7c5cb84cb3be9ae056548 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 20 May 2024 15:27:28 +0200 Subject: [PATCH] Add a new Exception type to indicate inconsistent data. In rare cases, the text format includes line breaks or other elements that break GNormPlus processing. This results in unrecoverable errors. The new Exception type indicates such cases so that the calling code can react accordingly. --- pom.xml | 2 +- src/GNormPluslib/GNR.java | 46 +++++++++++++++++-- .../InconsistentDataException.java | 32 +++++++++++++ src/GNormPluslib/SR.java | 7 ++- 4 files changed, 81 insertions(+), 6 deletions(-) create mode 100644 src/GNormPluslib/InconsistentDataException.java diff --git a/pom.xml b/pom.xml index 566685b..1808fda 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ de.julielab julielab-gnormplus jar - 1.0.1 + 1.0.2 JULIE Lab GNormPlus https://www.ncbi.nlm.nih.gov/research/bionlp/Tools/gnormplus/ diff --git a/src/GNormPluslib/GNR.java b/src/GNormPluslib/GNR.java index 8408f5e..376e4df 100644 --- a/src/GNormPluslib/GNR.java +++ b/src/GNormPluslib/GNR.java @@ -7,10 +7,13 @@ import java.io.*; import java.util.*; +import java.util.concurrent.Exchanger; +import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.stream.XMLStreamException; +import com.ctc.wstx.io.WstxInputSource; import org.tartarus.snowball.SnowballStemmer; import org.tartarus.snowball.ext.englishStemmer; @@ -170,8 +173,45 @@ public void LoadInputFile(String Filename,String FilenameAbb,String TrainTest) t cmd ="./Ab3P "+FilenameAbb+" "+FilenameAbb+".out"; //cmd ="java -jar bioadi.jar "+FilenameAbb+" > "+FilenameAbb+".out"; } - - Process process = runtime.exec(cmd); + + // We let the command run in its own thread. Then we can use process.waitFor() to set a timeout. + // We do this because in rare cases, the Ab3P program seems to run forever. + final String finalCmd = cmd; + final Process process = runtime.exec(finalCmd); + Thread t = new Thread("GNP Ab3P Runner") { + @Override + public void run() { + super.run(); + try { + System.out.println("Starting to find abbreviations with command " + finalCmd); + InputStream is = process.getInputStream(); + InputStreamReader isr = new InputStreamReader(is, "UTF-8"); + BufferedReader br = new BufferedReader(isr); + String line=""; + while ( (line = br.readLine()) != null) + { + fr.write(line); + fr.newLine(); + fr.flush(); + } + is.close(); + isr.close(); + br.close(); + fr.close(); + + } catch (IOException e) { + System.err.println("Error in Thread to run cmd " + finalCmd); + e.printStackTrace(); + } + } + }; + t.start(); + try { + process.waitFor(10, TimeUnit.MINUTES); + } catch (InterruptedException e) { + System.err.println("Command " + finalCmd + " was interrupted because it took too long."); + } + /*Process process = runtime.exec(cmd); InputStream is = process.getInputStream(); InputStreamReader isr = new InputStreamReader(is, "UTF-8"); BufferedReader br = new BufferedReader(isr); @@ -185,7 +225,7 @@ public void LoadInputFile(String Filename,String FilenameAbb,String TrainTest) t is.close(); isr.close(); br.close(); - fr.close(); + fr.close();*/ //Abb output -> Hash BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameAbb+".out"), "UTF-8")); line=""; diff --git a/src/GNormPluslib/InconsistentDataException.java b/src/GNormPluslib/InconsistentDataException.java new file mode 100644 index 0000000..6489ef5 --- /dev/null +++ b/src/GNormPluslib/InconsistentDataException.java @@ -0,0 +1,32 @@ +package GNormPluslib; + +public class InconsistentDataException extends RuntimeException { + private String docId; + + public String getDocId() { + return docId; + } + + public void setDocId(String docId) { + this.docId = docId; + } + + public InconsistentDataException() { + } + + public InconsistentDataException(String message) { + super(message); + } + + public InconsistentDataException(String message, Throwable cause) { + super(message, cause); + } + + public InconsistentDataException(Throwable cause) { + super(cause); + } + + public InconsistentDataException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/src/GNormPluslib/SR.java b/src/GNormPluslib/SR.java index c90896b..259c2d9 100644 --- a/src/GNormPluslib/SR.java +++ b/src/GNormPluslib/SR.java @@ -816,8 +816,11 @@ public void SpeciesAssignment(String Filename,String FilenameBioC) throws IOExce String G_mentions = anno[2]; String G_type = anno[3]; String G_mention_list[]=G_mentions.split("\\|"); - if (G_mention_list.length == 0) - throw new IllegalStateException("There is no gene mention but at least one was expected in document with ID " + data.getBioCDocobj().PMIDs.get(i) + " in paragraph with offset " + data.getBioCDocobj().PassageOffsets.get(i).get(j) + " and length " + PassageContext.length() + " beginning with " + PassageContext.substring(0, Math.min(PassageContext.length(), 80))); + if (G_mention_list.length == 0) { + InconsistentDataException e = new InconsistentDataException("There is no gene mention but at least one was expected in document with ID " + data.getBioCDocobj().PMIDs.get(i) + " in paragraph with offset " + data.getBioCDocobj().PassageOffsets.get(i).get(j) + " and length " + PassageContext.length() + " beginning with " + PassageContext.substring(0, Math.min(PassageContext.length(), 80))); + e.setDocId(data.getBioCDocobj().PMIDs.get(i)); + throw e; + } String G_mention=G_mention_list[0]; // only use the first term to detect species ; should be updated after SimConcept /** 1. prefix */