diff --git a/jcore-gnormplus-ae/pom.xml b/jcore-gnormplus-ae/pom.xml index 162af3314..e45c0009f 100644 --- a/jcore-gnormplus-ae/pom.xml +++ b/jcore-gnormplus-ae/pom.xml @@ -13,13 +13,13 @@ 2.6.1 - 2.6.9 + 2.6.10 de.julielab julielab-gnormplus - 1.0.1 + [1.0.2,1.1) de.julielab diff --git a/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/ae/gnp/GNormPlusProcessing.java b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/ae/gnp/GNormPlusProcessing.java index 2c131183b..4db547d10 100644 --- a/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/ae/gnp/GNormPlusProcessing.java +++ b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/ae/gnp/GNormPlusProcessing.java @@ -61,10 +61,17 @@ public static Path processWithGNormPlus(BioCCollection bioCCollection, String ou w.writeCollection(bioCCollection); } GNormPlus.processFile(filePath.toString(), filePath.getFileName().toString(), outputFilePath.toString(), System.currentTimeMillis(), "Test"); - Files.delete(filePath); } catch (IOException | XMLStreamException e) { log.error("Could not process document {}", collectionId); throw new AnalysisEngineProcessException(e); + } finally { + try { + if (Files.exists(filePath)) + Files.delete(filePath); + } catch (IOException e) { + log.error("Could not delete temporary GNormPlus File {}", filePath); + throw new AnalysisEngineProcessException(e); + } } return outputFilePath; } diff --git a/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusMultiplierLogic.java b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusMultiplierLogic.java index 0cff65f4d..f78007834 100644 --- a/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusMultiplierLogic.java +++ b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusMultiplierLogic.java @@ -1,5 +1,6 @@ package de.julielab.jcore.multiplier.gnp; +import GNormPluslib.InconsistentDataException; import com.pengyifan.bioc.BioCCollection; import 
com.pengyifan.bioc.BioCDocument; import de.julielab.jcore.ae.gnp.GNormPlusProcessing; @@ -81,6 +82,7 @@ public AbstractCas next() throws AnalysisEngineProcessException { // This allows batch-processing within GNP which reduces file writes and reads (GNP internally // writes a lot of temporary files that contain all the documents given to it in one single batch file). cachedCasData.clear(); + while (baseMultiplierHasNext.get()) { final JCas jCas = baseMultiplierNext.get(); boolean isDocumentHashUnchanged = false; @@ -110,8 +112,9 @@ public AbstractCas next() throws AnalysisEngineProcessException { // now process the whole batch with GNP if (gnormPlusInputCollection.getDocmentCount() > 0) { log.trace("Processing {} documents with GNormPlus.", gnormPlusInputCollection.getDocmentCount()); - final Path outputFilePath = GNormPlusProcessing.processWithGNormPlus(gnormPlusInputCollection, outputDirectory); + Path outputFilePath = null; try { + outputFilePath = GNormPlusProcessing.processWithGNormPlus(gnormPlusInputCollection, outputDirectory); bioCCasPopulator = new BioCCasPopulator(outputFilePath, Class.forName(outputGeneTypeName).getConstructor(JCas.class)); // delete the GNP output if we don't want to keep it if (outputDirectory.isBlank()) { @@ -123,6 +126,11 @@ public AbstractCas next() throws AnalysisEngineProcessException { } catch (ClassNotFoundException | NoSuchMethodException e) { log.error("Could not obtain UIMA gene annotation type constructor for class {}", outputGeneTypeName); throw new AnalysisEngineProcessException(e); + } catch (InconsistentDataException e) { + log.warn("GNormPlus encountered a data issue it cannot recover from: {} - no gene annotations will be created for this document batch.", e.getMessage()); + // We set the populator to null as a signal that there are no annotations to be read. This is + // used further down to skip entity population of cached CASes before returning the CASes. 
+ bioCCasPopulator = null; } } } @@ -144,7 +152,8 @@ public AbstractCas next() throws AnalysisEngineProcessException { } // If the document is unchanged and we skip unchanged documents, we do not have a GNormPlus result for this // document, skip. - if (!(isDocumentHashUnchanged && skipUnchangedDocuments)) { + // Also skip if the casPopulator is null. This can happen above when there is an error in GNormPlus. + if (!(isDocumentHashUnchanged && skipUnchangedDocuments) && bioCCasPopulator != null) { bioCCasPopulator.populateWithNextDocument(jCas, true); bioCCasPopulator.clearDocument(currentBiocResultCollectionIndex++); } diff --git a/jcore-pmc-db-reader/pom.xml b/jcore-pmc-db-reader/pom.xml index b8320041e..313bbfd19 100644 --- a/jcore-pmc-db-reader/pom.xml +++ b/jcore-pmc-db-reader/pom.xml @@ -30,7 +30,7 @@ de.julielab jcore-db-reader - 2.6.2 + 2.6.3 de.julielab diff --git a/jcore-xmi-db-reader/pom.xml b/jcore-xmi-db-reader/pom.xml index 3f4917efd..4686387bd 100644 --- a/jcore-xmi-db-reader/pom.xml +++ b/jcore-xmi-db-reader/pom.xml @@ -13,7 +13,7 @@ Reads CAS XMI data from a relational database (Postgres). Thus, the stored CASes may then be processed further. 
- + 2.6.4-SNAPSHOT de.julielab diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java index 1b9c9c080..3bd07b2d9 100644 --- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java +++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java @@ -162,11 +162,11 @@ public void populateCas(byte[][] data, JCas jCas) throws CasPopulationException JCoReTools.deserializeXmi(jCas.getCas(), new ByteArrayInputStream(xmiByteData), xercesAttributeBufferSize); } catch (SAXException e) { String docData = new String(xmiByteData, StandardCharsets.UTF_8); - if (!docData.contains("xmi:XMI xmlns:xmi=\"http://www.omg.org/XMI\"")) + if (!docData.contains("xmi:XMI") || !docData.contains("xmlns:xmi=\"http://www.omg.org/XMI\"")) throw new CollectionException(new IllegalArgumentException("The document that has been received from the database does not " + "appear to contain XMI data. The beginning of the document data is: " + StringUtils.abbreviate(docData, 200), e)); - log.error("SAXException while deserializing CAS XMI data from a segmented and re-assemblied XMI " + + log.error("SAXException while deserializing CAS XMI data from a segmented and re-assembled XMI " + "document. 
Beginning of data was: {}", StringUtils.abbreviate(docData, 200)); throw new CollectionException(e); } diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java index da16d4ef0..0e29e6956 100644 --- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java +++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java @@ -14,6 +14,7 @@ import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.AbstractCas; +import org.apache.uima.cas.impl.XCASParsingException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; @@ -49,7 +50,7 @@ public class XmiDBMultiplier extends DBMultiplier implements Initializable { public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); logFinalXmi = Optional.ofNullable((Boolean) aContext.getConfigParameterValue(PARAM_LOG_FINAL_XMI)).orElse(false); - truncationSize = Optional.ofNullable((Integer)aContext.getConfigParameterValue(PARAM_TRUNCATE_AT_SIZE)).orElse(0); + truncationSize = Optional.ofNullable((Integer) aContext.getConfigParameterValue(PARAM_TRUNCATE_AT_SIZE)).orElse(0); } @Override @@ -87,9 +88,12 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { @Override public AbstractCas next() throws AnalysisEngineProcessException { - JCas jCas = getEmptyJCas(); - try { - if (documentDataIterator.hasNext()) { + JCas jCas = null; + // we use a loop here because further down we catch a particular exception that should cause the current + // document to be skipped over + while (jCas == null && documentDataIterator.hasNext()) { + try { + jCas = getEmptyJCas(); log.trace("Returning next CAS"); try { 
initializer.initializeAnnotationTableNames(jCas); @@ -97,16 +101,35 @@ public AbstractCas next() throws AnalysisEngineProcessException { throw new AnalysisEngineProcessException(e); } populateCas(jCas); + if (log.isTraceEnabled()) { + log.trace("Outgoing multiplier jCas instance: {}", jCas); + log.trace("Returning CAS containing document {}", JCoReTools.getDocId(jCas)); + } + } catch (Throwable throwable) { + jCas.release(); + // We want to skip XMI exceptions on the assumption that they stem from a corrupted JeDIS XMI + // annotation store. Because we can't know in advance which documents in the complete dataset are + // corrupted, we just skip the corrupted ones so that they remain in the in_process state and we can + // collect them after processing. + Throwable cause = throwable.getCause(); + while (cause != null && cause.getCause() != null) + cause = cause.getCause(); + if (cause != null && cause instanceof XCASParsingException) { + log.warn("XCASParsingException occurred. That means that the JeDIS XMI modules could not be assembled into a complete, valid XMI document. The CAS is skipped and the next document is read. After processing, the skipped documents will still have 'in_process' state in the CoStoSys subset. This information can be used to repair/redo the preprocessing for the corrupted documents."); + // setting jCas to null, so we repeat the loop at the top and continue to the next document + jCas = null; + } else if (cause != null && cause instanceof IllegalArgumentException && cause.getMessage() != null && cause.getMessage().startsWith("Detected XMI ID clash")){ + log.warn("XMI ID clash in assembled XMI detected. The XMI elements in the CoStoSys storage have non-unique XMI IDs, the data is corrupt. The CAS is skipped and the next document is read. After processing, the skipped documents will still have 'in_process' state in the CoStoSys subset. 
This information can be used to repair/redo the preprocessing for the corrupted documents."); + // setting jCas to null, so we repeat the loop at the top and continue to the next document + jCas = null; + } + else { + log.error("Error while reading document from the database. Releasing the CAS. ", throwable); + throw new AnalysisEngineProcessException(throwable); + } } - } catch (Throwable throwable) { - log.error("Error while reading document from the database. Releasing the CAS. ", throwable); - jCas.release(); - throw new AnalysisEngineProcessException(throwable); - } - if (log.isTraceEnabled()) { - log.trace("Outgoing multiplier jCas instance: {}", jCas); - log.trace("Returning CAS containing document {}", JCoReTools.getDocId(jCas)); } + return jCas; } @@ -129,7 +152,7 @@ private void populateCas(JCas jCas) throws AnalysisEngineProcessException { } boolean truncate = false; if (truncationSize > 0) { - if(data[pkSize].length > truncationSize) + if (data[pkSize].length > truncationSize) truncate = true; } if (data != null && !truncate)