diff --git a/jcore-gnormplus-ae/pom.xml b/jcore-gnormplus-ae/pom.xml
index 162af3314..e45c0009f 100644
--- a/jcore-gnormplus-ae/pom.xml
+++ b/jcore-gnormplus-ae/pom.xml
@@ -13,13 +13,13 @@
2.6.1
- 2.6.9
+ 2.6.10
de.julielab
julielab-gnormplus
- 1.0.1
+ [1.0.2,1.1)
de.julielab
diff --git a/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/ae/gnp/GNormPlusProcessing.java b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/ae/gnp/GNormPlusProcessing.java
index 2c131183b..4db547d10 100644
--- a/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/ae/gnp/GNormPlusProcessing.java
+++ b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/ae/gnp/GNormPlusProcessing.java
@@ -61,10 +61,17 @@ public static Path processWithGNormPlus(BioCCollection bioCCollection, String ou
w.writeCollection(bioCCollection);
}
GNormPlus.processFile(filePath.toString(), filePath.getFileName().toString(), outputFilePath.toString(), System.currentTimeMillis(), "Test");
- Files.delete(filePath);
} catch (IOException | XMLStreamException e) {
log.error("Could not process document {}", collectionId);
throw new AnalysisEngineProcessException(e);
+ } finally {
+ try {
+ if (Files.exists(filePath))
+ Files.delete(filePath);
+ } catch (IOException e) {
+ log.error("Could not delete temporary GNormPlus File {}", filePath);
+ throw new AnalysisEngineProcessException(e);
+ }
}
return outputFilePath;
}
diff --git a/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusMultiplierLogic.java b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusMultiplierLogic.java
index 0cff65f4d..f78007834 100644
--- a/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusMultiplierLogic.java
+++ b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusMultiplierLogic.java
@@ -1,5 +1,6 @@
package de.julielab.jcore.multiplier.gnp;
+import GNormPluslib.InconsistentDataException;
import com.pengyifan.bioc.BioCCollection;
import com.pengyifan.bioc.BioCDocument;
import de.julielab.jcore.ae.gnp.GNormPlusProcessing;
@@ -81,6 +82,7 @@ public AbstractCas next() throws AnalysisEngineProcessException {
// This allows batch-processing within GNP which reduces file writes and reads (GNP internally
// writes a lot of temporary files that contain all the documents given to it in one single batch file).
cachedCasData.clear();
+
while (baseMultiplierHasNext.get()) {
final JCas jCas = baseMultiplierNext.get();
boolean isDocumentHashUnchanged = false;
@@ -110,8 +112,9 @@ public AbstractCas next() throws AnalysisEngineProcessException {
// now process the whole batch with GNP
if (gnormPlusInputCollection.getDocmentCount() > 0) {
log.trace("Processing {} documents with GNormPlus.", gnormPlusInputCollection.getDocmentCount());
- final Path outputFilePath = GNormPlusProcessing.processWithGNormPlus(gnormPlusInputCollection, outputDirectory);
+ Path outputFilePath = null;
try {
+ outputFilePath = GNormPlusProcessing.processWithGNormPlus(gnormPlusInputCollection, outputDirectory);
bioCCasPopulator = new BioCCasPopulator(outputFilePath, Class.forName(outputGeneTypeName).getConstructor(JCas.class));
// delete the GNP output if we don't want to keep it
if (outputDirectory.isBlank()) {
@@ -123,6 +126,11 @@ public AbstractCas next() throws AnalysisEngineProcessException {
} catch (ClassNotFoundException | NoSuchMethodException e) {
log.error("Could not obtain UIMA gene annotation type constructor for class {}", outputGeneTypeName);
throw new AnalysisEngineProcessException(e);
+ } catch (InconsistentDataException e) {
+ log.warn("GNormPlus encountered a data issue it cannot recover from: {} - no gene annotations will be created for this document batch.", e.getMessage());
+ // We set the populator to null as a signal that there are no annotations to be read. This is
+ // used further down to skip entity population of cached CASes before returning the CASes.
+ bioCCasPopulator = null;
}
}
}
@@ -144,7 +152,8 @@ public AbstractCas next() throws AnalysisEngineProcessException {
}
// If the document is unchanged and we skip unchanged documents, we do not have a GNormPlus result for this
// document, skip.
- if (!(isDocumentHashUnchanged && skipUnchangedDocuments)) {
+ // Also skip if the casPopulator is null. This can happen above when there is an error in GNormPlus.
+ if (!(isDocumentHashUnchanged && skipUnchangedDocuments) && bioCCasPopulator != null) {
bioCCasPopulator.populateWithNextDocument(jCas, true);
bioCCasPopulator.clearDocument(currentBiocResultCollectionIndex++);
}
diff --git a/jcore-pmc-db-reader/pom.xml b/jcore-pmc-db-reader/pom.xml
index b8320041e..313bbfd19 100644
--- a/jcore-pmc-db-reader/pom.xml
+++ b/jcore-pmc-db-reader/pom.xml
@@ -30,7 +30,7 @@
de.julielab
jcore-db-reader
- 2.6.2
+ 2.6.3
de.julielab
diff --git a/jcore-xmi-db-reader/pom.xml b/jcore-xmi-db-reader/pom.xml
index 3f4917efd..4686387bd 100644
--- a/jcore-xmi-db-reader/pom.xml
+++ b/jcore-xmi-db-reader/pom.xml
@@ -13,7 +13,7 @@
Reads CAS XMI data from a relational database (Postgres). Thus, the stored CASes may then be processed
further.
-
+ 2.6.4-SNAPSHOT
de.julielab
diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java
index 1b9c9c080..3bd07b2d9 100644
--- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java
+++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java
@@ -162,11 +162,11 @@ public void populateCas(byte[][] data, JCas jCas) throws CasPopulationException
JCoReTools.deserializeXmi(jCas.getCas(), new ByteArrayInputStream(xmiByteData), xercesAttributeBufferSize);
} catch (SAXException e) {
String docData = new String(xmiByteData, StandardCharsets.UTF_8);
- if (!docData.contains("xmi:XMI xmlns:xmi=\"http://www.omg.org/XMI\""))
+ if (!docData.contains("xmi:XMI") || !docData.contains("xmlns:xmi=\"http://www.omg.org/XMI\""))
throw new CollectionException(new IllegalArgumentException("The document that has been received from the database does not " +
"appear to contain XMI data. The beginning of the document data is: " +
StringUtils.abbreviate(docData, 200), e));
- log.error("SAXException while deserializing CAS XMI data from a segmented and re-assemblied XMI " +
+ log.error("SAXException while deserializing CAS XMI data from a segmented and re-assembled XMI " +
"document. Beginning of data was: {}", StringUtils.abbreviate(docData, 200));
throw new CollectionException(e);
}
diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java
index da16d4ef0..0e29e6956 100644
--- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java
+++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java
@@ -14,6 +14,7 @@
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.AbstractCas;
+import org.apache.uima.cas.impl.XCASParsingException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
@@ -49,7 +50,7 @@ public class XmiDBMultiplier extends DBMultiplier implements Initializable {
public void initialize(UimaContext aContext) throws ResourceInitializationException {
super.initialize(aContext);
logFinalXmi = Optional.ofNullable((Boolean) aContext.getConfigParameterValue(PARAM_LOG_FINAL_XMI)).orElse(false);
- truncationSize = Optional.ofNullable((Integer)aContext.getConfigParameterValue(PARAM_TRUNCATE_AT_SIZE)).orElse(0);
+ truncationSize = Optional.ofNullable((Integer) aContext.getConfigParameterValue(PARAM_TRUNCATE_AT_SIZE)).orElse(0);
}
@Override
@@ -87,9 +88,12 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException {
@Override
public AbstractCas next() throws AnalysisEngineProcessException {
- JCas jCas = getEmptyJCas();
- try {
- if (documentDataIterator.hasNext()) {
+ JCas jCas = null;
+ // We use a loop here because further down we catch particular exceptions that should cause the
+ // current document to be skipped over.
+ while (jCas == null && documentDataIterator.hasNext()) {
+ try {
+ jCas = getEmptyJCas();
log.trace("Returning next CAS");
try {
initializer.initializeAnnotationTableNames(jCas);
@@ -97,16 +101,35 @@ public AbstractCas next() throws AnalysisEngineProcessException {
throw new AnalysisEngineProcessException(e);
}
populateCas(jCas);
+ if (log.isTraceEnabled()) {
+ log.trace("Outgoing multiplier jCas instance: {}", jCas);
+ log.trace("Returning CAS containing document {}", JCoReTools.getDocId(jCas));
+ }
+ } catch (Throwable throwable) {
+ jCas.release();
+ // We skip XMI exceptions on the assumption that they stem from a corrupted JeDIS XMI
+ // annotation store. Because we cannot know which documents in the complete dataset are
+ // corrupted, we just skip the corrupted ones; they remain in the in_process state, so we can
+ // collect them after processing.
+ Throwable cause = throwable.getCause();
+ while (cause != null && cause.getCause() != null)
+ cause = cause.getCause();
+ if (cause != null && cause instanceof XCASParsingException) {
+ log.warn("XCASParsingException occurred. That means that the JeDIS XMI modules could not be assembled into a complete, valid XMI document. The CAS is skipped and the next document is read. After processing, the skipped documents will still have 'in_process' state in the CoStoSys subset. This information can be used to repair/redo the preprocessing for the corrupted documents.");
+ // setting jCas to null, so we repeat the loop at the top and continue to the next document
+ jCas = null;
+ } else if (cause != null && cause instanceof IllegalArgumentException && cause.getMessage() != null && cause.getMessage().startsWith("Detected XMI ID clash")){
+ log.warn("XMI ID clash in assembled XMI detected. The XMI elements in the CoStoSys storage have non-unique XMI IDs, the data is corrupt. The CAS is skipped and the next document is read. After processing, the skipped documents will still have 'in_process' state in the CoStoSys subset. This information can be used to repair/redo the preprocessing for the corrupted documents.");
+ // setting jCas to null, so we repeat the loop at the top and continue to the next document
+ jCas = null;
+ }
+ else {
+ log.error("Error while reading document from the database. Releasing the CAS. ", throwable);
+ throw new AnalysisEngineProcessException(throwable);
+ }
}
- } catch (Throwable throwable) {
- log.error("Error while reading document from the database. Releasing the CAS. ", throwable);
- jCas.release();
- throw new AnalysisEngineProcessException(throwable);
- }
- if (log.isTraceEnabled()) {
- log.trace("Outgoing multiplier jCas instance: {}", jCas);
- log.trace("Returning CAS containing document {}", JCoReTools.getDocId(jCas));
}
+
return jCas;
}
@@ -129,7 +152,7 @@ private void populateCas(JCas jCas) throws AnalysisEngineProcessException {
}
boolean truncate = false;
if (truncationSize > 0) {
- if(data[pkSize].length > truncationSize)
+ if (data[pkSize].length > truncationSize)
truncate = true;
}
if (data != null && !truncate)