Release version 1.0.0.

JULIELab · Nov 16, 2022 · 9f309b4 · 9f309b4
1 parent 4dfd8ee
commit 9f309b4
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 21 deletions.
diff --git a/README.md b/README.md
@@ -10,7 +10,7 @@ GNormPlus has been developed at the NLM and is described in [1] and can be downl
 
 ## Code changes to output FamilyNames
 
-There are two places in the code marked with a comment containing "Erik Faessler". The added conditions lead to the output of the FamilyName entities. Those entities do not receive an ID from NCBI Gene.
+There are a few places in the code marked with a comment containing "Erik Faessler". Two of those changes lead to the output of the FamilyName entities. Those entities do not receive an ID from NCBI Gene.
 
 ## Refactoring to allow multi-threaded processing
 

diff --git a/convertBioCDocClass.py b/convertBioCDocClass.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+"""
+This script was used to change the code of the BioCDoc class.
+"""
 import re
 
 def normalize(s):

diff --git a/pom.xml b/pom.xml
@@ -5,7 +5,7 @@
     <groupId>de.julielab</groupId>
     <artifactId>julielab-gnormplus</artifactId>
     <packaging>jar</packaging>
-    <version>1.0.0-SNAPSHOT</version>
+    <version>1.0.0</version>
     <name>JULIE Lab GNormPlus</name>
     <url>https://www.ncbi.nlm.nih.gov/research/bionlp/Tools/gnormplus/</url>
 
@@ -16,7 +16,6 @@
     </parent>
 
 
-
     <dependencies>
         <dependency>
             <groupId>com.pengyifan.bioc</groupId>
@@ -60,4 +59,18 @@
             </plugin>
         </plugins>
     </build>
+    <developers>
+        <developer>
+            <name>Erik Faessler</name>
+            <url>https://julielab.de/Staff/Faessler/</url>
+            <email>[email protected]</email>
+            <organization>JULIE Lab Jena, Germany</organization>
+            <organizationUrl>https://julielab.de/</organizationUrl>
+        </developer>
+    </developers>
+    <scm>
+        <url>https://github.com/JULIELab/gnormplus</url>
+        <connection>scm:git:https://github.com/JULIELab/gnormplus</connection>
+        <developerConnection>scm:git:https://github.com/JULIELab/gnormplus</developerConnection>
+    </scm>
 </project>
diff --git a/src/GNormPluslib/BioCDoc.java b/src/GNormPluslib/BioCDoc.java
@@ -821,6 +821,8 @@ public void BioCOutput(String input, String output, ArrayList<ArrayList<ArrayLis
 								Anno[4] = Anno[5];
 							}
 						}
+						if (Anno.length < 4)
+							throw new IllegalStateException("Document with ID " + PMID + " has annotation \"" + Arrays.toString(Anno) + "\" which is too short: A minimal length of 4 is expected.");
 						String type = Anno[3];
 						if (type.equals("GeneID")) {
 							type = "Gene";

diff --git a/src/GNormPluslib/SR.java b/src/GNormPluslib/SR.java
@@ -68,24 +68,28 @@ public void SpeciesRecognition(String Filename, String FilenameBioC, String Stra
 	        		// For anti-serum filtering
 	        		String ForwardSTR="";
 	        		String BackwardSTR="";
-	        		if(start>21)
-	        		{
-	        			ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(start-21,last);
-	        		}
-	        		else
-	        		{
-	        			ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(0,last);
-	        		}
-	        		if(PassageContext.length()>last+21)
-	        		{
-	        			BackwardSTR = PassageContext.substring(start,last+21);
-	        		}
-	        		else
-	        		{
-	        			BackwardSTR = PassageContext.substring(start,PassageContext.length());
-	        		}
-
-	        		String mention = anno[2];
+					try {
+						if(start>21)
+						{
+							ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(start-21,last);
+						}
+						else
+						{
+							ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(0,last);
+						}
+						if(PassageContext.length()>last+21)
+						{
+							BackwardSTR = PassageContext.substring(start,last+21);
+						}
+						else
+						{
+							BackwardSTR = PassageContext.substring(start,PassageContext.length());
+						}
+					} catch (Exception e) {
+						throw new RuntimeException("Exception in document " + Pmid + " in paragraph with offset " + data.getBioCDocobj().PassageOffsets.get(i).get(j) + " and length " + PassageContext.length() + " beginning with " + PassageContext.substring(0, Math.min(PassageContext.length(), 80)), e);
+					}
+
+					String mention = anno[2];
 	        		String id = anno[3];
 	        		String mention_tmp=mention.toLowerCase();
 	        		mention_tmp = mention_tmp.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
@@ -166,6 +170,10 @@ else if(!id.equals("NA"))
 					String anno[]=locations.get(k).split("\t");
 					int start= Integer.parseInt(anno[0]);
 	        		int last= Integer.parseInt(anno[1]);
+					if (last > PassageContext.length()) {
+						// Erik Faessler: We had offset issues with texts that contain non-ASCII characters
+						continue;
+					}
 	        		String mention = anno[2];
 	        		String id = anno[3];
 	        		if(data.getBioCDocobj().Annotations.size()>i && data.getBioCDocobj().Annotations.get(i).size()>j)
@@ -804,6 +812,8 @@ public void SpeciesAssignment(String Filename,String FilenameBioC) throws IOExce
 		        		String G_mentions = anno[2];
 		        		String G_type = anno[3];
 		        		String G_mention_list[]=G_mentions.split("\\|");
+						if (G_mention_list.length == 0)
+							throw new IllegalStateException("There is no gene mention but at least one was expected in document with ID " +  data.getBioCDocobj().PMIDs.get(i) + " in paragraph with offset " + data.getBioCDocobj().PassageOffsets.get(i).get(j) + " and length " + PassageContext.length() + " beginning with " + PassageContext.substring(0, Math.min(PassageContext.length(), 80)));
 		        		String G_mention=G_mention_list[0]; // only use the first term to detect species ; should be updated after SimConcept
 
 		        		/** 1. prefix */