From 6c5e8c8bf9d6d1f650c8df81d769d610599fc77f Mon Sep 17 00:00:00 2001 From: khituras Date: Sat, 30 May 2020 11:09:52 +0200 Subject: [PATCH 001/269] Cord19 reader: Adding html element to `TabFigRef`. Seems to be new in Cord19. --- .../jcore/reader/cord19/jsonformat/TabFigRef.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/jsonformat/TabFigRef.java b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/jsonformat/TabFigRef.java index bfe873c48..0e7794322 100644 --- a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/jsonformat/TabFigRef.java +++ b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/jsonformat/TabFigRef.java @@ -19,6 +19,15 @@ public class TabFigRef { private String text; private String type; private String latex; + private String html; + + public String getHtml() { + return html; + } + + public void setHtml(String html) { + this.html = html; + } public String getLatex() { return latex; From 2d227284d91cbc7cff473e59f44ffdd506b04708 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 8 Jun 2020 17:07:58 +0200 Subject: [PATCH 002/269] Added first untested version of the Neo4j to relations consumer. 
--- .../reader/cord19/jsonformat/TabFigRef.java | 1 + jcore-neo4j-relations-consumer/LICENSE | 26 ++ jcore-neo4j-relations-consumer/README.md | 34 ++ jcore-neo4j-relations-consumer/pom.xml | 58 ++++ .../Neo4jRelationsConsumer.java | 254 ++++++++++++++ .../consumer/neo4jrelations/desc/PLACEHOLDER | 1 + .../desc/jcore-neo4j-relations-consumer.xml | 21 ++ ...Neo4jRelationsConsumerIntegrationTest.java | 31 ++ .../Neo4jRelationsConsumerTest.java | 28 ++ pom.xml | 327 ++++++++++++------ 10 files changed, 672 insertions(+), 109 deletions(-) create mode 100644 jcore-neo4j-relations-consumer/LICENSE create mode 100644 jcore-neo4j-relations-consumer/README.md create mode 100644 jcore-neo4j-relations-consumer/pom.xml create mode 100644 jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java create mode 100644 jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/PLACEHOLDER create mode 100644 jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml create mode 100644 jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java create mode 100644 jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java diff --git a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/jsonformat/TabFigRef.java b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/jsonformat/TabFigRef.java index 0e7794322..d35bc534e 100644 --- a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/jsonformat/TabFigRef.java +++ b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/jsonformat/TabFigRef.java @@ -22,6 +22,7 @@ public class TabFigRef { private String html; public String getHtml() { + return html; } diff --git 
a/jcore-neo4j-relations-consumer/LICENSE b/jcore-neo4j-relations-consumer/LICENSE new file mode 100644 index 000000000..fbbd41e05 --- /dev/null +++ b/jcore-neo4j-relations-consumer/LICENSE @@ -0,0 +1,26 @@ +BSD 2-Clause License + +Copyright (c) 2017, JULIE Lab +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/jcore-neo4j-relations-consumer/README.md b/jcore-neo4j-relations-consumer/README.md new file mode 100644 index 000000000..7b8a2a0a9 --- /dev/null +++ b/jcore-neo4j-relations-consumer/README.md @@ -0,0 +1,34 @@ +# JCoRe Neo4j Relations Consumer + +**Descriptor Path**: +``` +de.julielab.jcore.consumer.neo4jrelations.desc.jcore-neo4j-relations-consumer +``` + +Writes EventMentions to Neo4j. + + + +**1. 
Parameters** + +| Parameter Name | Parameter Type | Mandatory | Multivalued | Description | +|----------------|----------------|-----------|-------------|-------------| +| param1 | UIMA-Type | Boolean | Boolean | Description | +| param2 | UIMA-Type | Boolean | Boolean | Description | + +**2. Predefined Settings** + +| Parameter Name | Parameter Syntax | Example | +|----------------|------------------|---------| +| param1 | Syntax-Description | `Example` | +| param2 | Syntax-Description | `Example` | + +**3. Capabilities** + +| Type | Input | Output | +|------|:-----:|:------:| +| de.julielab.jcore.types.TYPE | | `+` | +| de.julielab.jcore.types.ace.TYPE | `+` | | + + +[1] Some Literature? diff --git a/jcore-neo4j-relations-consumer/pom.xml b/jcore-neo4j-relations-consumer/pom.xml new file mode 100644 index 000000000..e83c89ced --- /dev/null +++ b/jcore-neo4j-relations-consumer/pom.xml @@ -0,0 +1,58 @@ + + + + 4.0.0 + jcore-neo4j-relations-consumer + jar + de.julielab + + + de.julielab + jcore-base + 2.3.0-SNAPSHOT + + + 2.3.0-SNAPSHOT + + + + ch.qos.logback + logback-classic + test + + + org.slf4j + slf4j-api + + + de.julielab + jcore-types + ${jcore-version} + + + de.julielab + julielab-neo4j-plugins-concepts-representation + 3.0.0-SNAPSHOT + + + org.neo4j.test + neo4j-harness + 4.0.4 + test + + + de.julielab + julielab-neo4j-plugins-concepts + 3.0.0-SNAPSHOT + test + + + JCoRe Neo4j Relations Consumer + + JULIE Lab Jena, Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-neo4j-relations-consumer + Writes EventMentions to Neo4j. 
+ diff --git a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java new file mode 100644 index 000000000..4c4670d97 --- /dev/null +++ b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java @@ -0,0 +1,254 @@ +package de.julielab.jcore.consumer.neo4jrelations; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.HashMultiset; +import com.google.common.collect.Multiset; +import de.julielab.jcore.types.ConceptMention; +import de.julielab.jcore.types.ResourceEntry; +import de.julielab.jcore.types.ext.FlattenedRelation; +import de.julielab.neo4j.plugins.datarepresentation.ImportIERelation; +import de.julielab.neo4j.plugins.datarepresentation.ImportIERelationArgument; +import de.julielab.neo4j.plugins.datarepresentation.ImportIERelationDocument; +import de.julielab.neo4j.plugins.datarepresentation.ImportIETypedRelations; +import de.julielab.neo4j.plugins.datarepresentation.constants.ImportIERelations; +import org.apache.commons.io.IOUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.ws.rs.HttpMethod; +import java.io.IOException; +import 
java.io.InputStream; +import java.io.OutputStream; +import java.net.HttpURLConnection; +import java.net.URI; +import java.net.URL; +import java.util.*; +import java.util.stream.StreamSupport; + +@ResourceMetaData(name = "JCoRe Neo4j Relations Consumer", description = "This component assumes that a Neo4j server with an installed julieliab-neo4j-plugins-concepts plugin installed. It then sends FlattenedRelation instances with more then one arguments to Neo4j. Note that this requires the event arguments to have a ResourceEntry list to obtain database concept IDs from.", vendor = "JULIE Lab, Germany", copyright = "JULIE Lab", version = "2.6.0-SNAPSHOT") +@TypeCapability(inputs = {"de.julielab.jcore.types.EventMention"}) +public class Neo4jRelationsConsumer extends JCasAnnotator_ImplBase { + + public static final String PARAM_URL = "URL"; + public static final String PARAM_ID_PROPERTY = "IdProperty"; + public static final String PARAM_SOURCE = "ConceptSource"; + private final static Logger log = LoggerFactory.getLogger(Neo4jRelationsConsumer.class); + @ConfigurationParameter(name = PARAM_URL, description = "The complete URL to the endpoint of the Neo4j server for relation insertion.") + private String url; + @ConfigurationParameter(name = PARAM_ID_PROPERTY, description = "The ID property to look up concept nodes in the Neo4j graph. Common options are 'id', 'sourceIds' and 'originalId'. You must know to which ID type the ResourceEntry objects of the relation arguments refer to.") + private String idProperty; + @ConfigurationParameter(name = PARAM_SOURCE, mandatory = false, description = "Optional. Sets the global source for the concept IDs taken from the ResourceEntry instances of the relation arguments. This causes the 'source' feature of the ResourceEntry objects to be omitted and to globally use the specified source instead. 
This causes the Neo4j database plugin to resolve the provided argument IDs against the source specified here.") + private String globalSource; + + private ImportIERelations importIERelations; + private ObjectMapper om; + + /** + * This method is called a single time by the framework at component + * creation. Here, descriptor parameters are read and initial setup is done. + */ + @Override + public void initialize(final UimaContext aContext) throws ResourceInitializationException { + url = (String) aContext.getConfigParameterValue(PARAM_URL); + idProperty = (String) aContext.getConfigParameterValue(PARAM_ID_PROPERTY); + globalSource = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_SOURCE)).orElse(null); + om = new ObjectMapper(); + om.setSerializationInclusion(JsonInclude.Include.NON_NULL); + om.setSerializationInclusion(JsonInclude.Include.NON_EMPTY); + } + + private void initImportRelations() { + importIERelations = globalSource != null ? new ImportIERelations(idProperty, globalSource) : new ImportIERelations(idProperty); + } + + /** + * This method is called for each document going through the component. This + * is where the actual work happens. 
+ */ + @Override + public void process(final JCas aJCas) { + importIERelations.addRelationDocument(convertRelations(aJCas)); + } + + private ImportIERelationDocument convertRelations(JCas aJCas) { + Map> relationCounts = getEquivalentRelationGroups(aJCas); + ImportIERelationDocument relDoc = new ImportIERelationDocument(); + ImportIETypedRelations typedRelations = new ImportIETypedRelations(); + for (String relationType : relationCounts.keySet()) { + Multiset unificationRelations = relationCounts.get(relationType); + List ieRelations4relationType = new ArrayList<>(); + for (UnificationRelation rel : unificationRelations) { + ieRelations4relationType.add(rel.toImportRelation(unificationRelations.count(rel))); + } + } + relDoc.setRelations(typedRelations); + return relDoc; + } + + @Override + public void batchProcessComplete() throws AnalysisEngineProcessException { + super.batchProcessComplete(); + sendRelationsToNeo4j(); + } + + @Override + public void collectionProcessComplete() throws AnalysisEngineProcessException { + super.collectionProcessComplete(); + sendRelationsToNeo4j(); + } + + private void sendRelationsToNeo4j() throws AnalysisEngineProcessException { + try { + URL url = URI.create(this.url).toURL(); + HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); + urlConnection.setRequestMethod(HttpMethod.POST); + urlConnection.setDoOutput(true); + try (OutputStream outputStream = urlConnection.getOutputStream()) { + JsonFactory jf = new JsonFactory(om); + JsonGenerator g = jf.createGenerator(outputStream); + g.writeStartObject(); + g.writeObjectField(ImportIERelations.NAME_ID_PROPERTY, idProperty); + g.writeObjectField(ImportIERelations.NAME_ID_SOURCE, globalSource); + + List documents = importIERelations.getDocuments(); + g.writeFieldName(ImportIERelations.NAME_DOCUMENTS); + g.writeStartArray(); + for (ImportIERelationDocument document : (Iterable) documents::iterator) { + g.writeObject(document); + } + g.writeEndArray(); + 
g.writeEndObject(); + g.close(); + } + try (InputStream inputStream = urlConnection.getInputStream()) { + log.debug("Response from Neo4j: {}", IOUtils.toString(inputStream)); + } + importIERelations.clear(); + } catch (IOException e) { + log.error("Could not send relations to Neo4j", e); + throw new AnalysisEngineProcessException(e); + } + } + + /** + *

Iterates over the FlattenedRelations in the JCas and builds an intermediate representation whose main purpose is to group together relations that are equivalent, i.e. that have the same relation type and the same set of arguments. Equivalent relations can then simply be counted instead of being sent to the server as duplicates.

+ * + * @param aJCas The JCas to get relations from. + * @return The grouped relations. + */ + private Map> getEquivalentRelationGroups(JCas aJCas) { + Map> relationCounts = new HashMap<>(); + for (FlattenedRelation fr : aJCas.getAnnotationIndex(FlattenedRelation.type)) { + Iterator cmIt = StreamSupport.stream(fr.getArguments().spliterator(), false).map(ConceptMention.class::cast).iterator(); + Set unificationArgs = new HashSet<>(); + while (cmIt.hasNext()) { + ConceptMention cm = cmIt.next(); + FSArray resourceEntryList = cm.getResourceEntryList(); + if (resourceEntryList != null) { + ResourceEntry resourceEntry = (ResourceEntry) resourceEntryList.get(0); + String id = resourceEntry.getEntryId(); + String source = resourceEntry.getSource(); + if (globalSource == null) + unificationArgs.add(new UnificationArgument(id, source)); + else + unificationArgs.add(new UnificationArgument(id)); + } + } + if (unificationArgs.size() > 1) { + UnificationRelation rel = new UnificationRelation(fr.getRootRelation().getSpecificType(), unificationArgs); + relationCounts.compute(rel.getRelationType(), (k, v) -> v != null ? 
v : HashMultiset.create()).add(rel); + } + } + return relationCounts; + } + + private class UnificationRelation { + private String relationType; + private Set args; + + public UnificationRelation(String relationType, Set args) { + this.relationType = relationType; + this.args = args; + } + + public ImportIERelation toImportRelation(int count) { + return ImportIERelation.of(count, (Iterable) args.stream().map(UnificationArgument::toImportArgument).iterator()); + } + + public String getRelationType() { + return relationType; + } + + public Set getArgs() { + return args; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + UnificationRelation that = (UnificationRelation) o; + return relationType.equals(that.relationType) && + args.equals(that.args); + } + + @Override + public int hashCode() { + return Objects.hash(relationType, args); + } + } + + private class UnificationArgument { + private String id; + private String source; + + public UnificationArgument(String id) { + this.id = id; + } + + public UnificationArgument(String id, String source) { + this.id = id; + this.source = source; + } + + public ImportIERelationArgument toImportArgument() { + return source != null ? 
ImportIERelationArgument.of(id, source) : ImportIERelationArgument.of(id); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + UnificationArgument that = (UnificationArgument) o; + return id.equals(that.id) && + Objects.equals(source, that.source); + } + + @Override + public int hashCode() { + return Objects.hash(id, source); + } + + public String getId() { + return id; + } + + public String getSource() { + return source; + } + } + + +} diff --git a/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/PLACEHOLDER b/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/PLACEHOLDER new file mode 100644 index 000000000..9f6c6ddb5 --- /dev/null +++ b/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/PLACEHOLDER @@ -0,0 +1 @@ +The actual descriptor must be created by UIMA fit. 
diff --git a/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml b/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml new file mode 100644 index 000000000..a0eadea2f --- /dev/null +++ b/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml @@ -0,0 +1,21 @@ + + + org.apache.uima.java + true + Neo4jRelationsConsumer + + JCoRe Neo4j Relations Consumer + + 2.3.0-SNAPSHOT + JULIE Lab Jena, Germany + + + + + + true + true + false + + + diff --git a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java new file mode 100644 index 000000000..6c853ecdd --- /dev/null +++ b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java @@ -0,0 +1,31 @@ + +package de.julielab.jcore.consumer.neo4jrelations; + +import de.julielab.neo4j.plugins.Indexes; +import de.julielab.neo4j.plugins.concepts.ConceptManager; +import org.apache.uima.UIMAException; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.junit.Rule; +import org.junit.Test; +import org.neo4j.harness.junit.rule.Neo4jRule; + + +/** + * Unit tests for jcore-neo4j-relations-consumer. 
+ * + */ +public class Neo4jRelationsConsumerIntegrationTest { + @Rule + public Neo4jRule neo4j = new Neo4jRule() + .withUnmanagedExtension("/concepts", ConceptManager.class).withFixture(graphDatabaseService -> { + new Indexes(null).createIndexes(graphDatabaseService); + return null; + }); + + @Test + public void insertEventMentions() throws UIMAException { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types"); + + } +} diff --git a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java new file mode 100644 index 000000000..41d24b178 --- /dev/null +++ b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java @@ -0,0 +1,28 @@ + +package de.julielab.jcore.consumer.neo4jrelations; + +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.junit.Test; + +import java.io.IOException; + + +/** + * Unit tests for jcore-neo4j-relations-consumer. 
+ * + */ +public class Neo4jRelationsConsumerTest { + + + @Test + public void insertEventMentions() throws UIMAException, IOException { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types"); + AnalysisEngine engine = AnalysisEngineFactory.createEngine("de.julielab.jcore.consumer.neo4jrelations.desc.jcore-neo4j-relations-consumer", Neo4jRelationsConsumer.PARAM_URL, ""); + + + } +} diff --git a/pom.xml b/pom.xml index 5687e86e0..274a990dd 100644 --- a/pom.xml +++ b/pom.xml @@ -1,112 +1,221 @@ - - 4.0.0 - - de.julielab - jcore-parent - 2.5.1 - - jcore-base - pom - JCoRe Base - The POM for the JCoRe Base projects. - 2.5.1-SNAPSHOT - - JULIE Lab, Germany - http://www.julielab.de - - - - BSD-2-Clause - https://opensource.org/licenses/BSD-2-Clause - - - https://github.com/JULIELab/jcore-base - - - org.apache.uima - uimaj-core - ${uima-version} - - - org.apache.uima - uimafit-core - ${uimafit-version} - - - - jcore-ace-reader - jcore-acronym-ae - jcore-banner-ae - jcore-biolemmatizer-ae - jcore-bionlpformat-consumer - jcore-bionlpformat-reader - jcore-biosem-ae - jcore-conll-consumer - jcore-coordination-baseline-ae - jcore-ct-reader - jcore-descriptor-creator - jcore-dta-reader - jcore-ec-code-ae - jcore-elasticsearch-consumer - jcore-embedding-writer - jcore-event-flattener-ae - jcore-feature-value-replacement-ae - jcore-file-reader - jcore-flair-ner-ae - jcore-iexml-consumer - jcore-iexml-reader - jcore-ign-reader - jcore-iob-consumer - jcore-jnet-ae - jcore-jpos-ae - jcore-jsbd-ae - jcore-jtbd-ae - jcore-julielab-entity-evaluator-consumer - jcore-likelihood-assignment-ae - jcore-likelihood-detection-ae - jcore-lingpipegazetteer-ae - jcore-lingpipe-porterstemmer-ae - jcore-lingscope-ae - jcore-linnaeus-species-ae - jcore-mantra-xml-types - jcore-medxn-ae - jcore-msdoc-reader - jcore-mstparser-ae - jcore-muc7-reader - jcore-mutationfinder-ae - jcore-opennlp-chunk-ae - 
jcore-opennlp-parser-ae - jcore-opennlp-postag-ae - jcore-opennlp-sentence-ae - jcore-opennlp-token-ae - jcore-pmc-reader - jcore-pubtator-reader - jcore-stanford-lemmatizer-ae - jcore-topic-indexing-ae - jcore-topics-writer - jcore-txt-consumer - jcore-types - jcore-utilities - jcore-xml-mapper - jcore-xml-reader - jcore-xmi-reader - jcore-xmi-writer - jedis-parent - jcore-db-checkpoint-ae - jcore-ppd-writer - jcore-bc2gmformat-writer - jcore-bc2gm-reader - jcore-annotation-adder-ae - jcore-flair-token-embedding-ae - jcore-line-multiplier - jcore-cord19-reader - - - scm:git:https://github.com/JULIELab/jcore-base + + + 4.0.0 + + + + de.julielab + + jcore-parent + + 2.5.1 + + + + jcore-base + + pom + + JCoRe Base + + The POM for the JCoRe Base projects. + + 2.5.1-SNAPSHOT + + + + JULIE Lab, Germany + + http://www.julielab.de + + + + + + + + BSD-2-Clause + + https://opensource.org/licenses/BSD-2-Clause + + + + + + https://github.com/JULIELab/jcore-base + + + + + + org.apache.uima + + uimaj-core + + ${uima-version} + + + + + + org.apache.uima + + uimafit-core + + ${uimafit-version} + + + + + + + + jcore-ace-reader + + jcore-acronym-ae + + jcore-banner-ae + + jcore-biolemmatizer-ae + + jcore-bionlpformat-consumer + + jcore-bionlpformat-reader + + jcore-biosem-ae + + jcore-conll-consumer + + jcore-coordination-baseline-ae + + jcore-ct-reader + + jcore-descriptor-creator + + jcore-dta-reader + + jcore-ec-code-ae + + jcore-elasticsearch-consumer + + jcore-embedding-writer + + jcore-event-flattener-ae + + jcore-feature-value-replacement-ae + + jcore-file-reader + + jcore-flair-ner-ae + + jcore-iexml-consumer + + jcore-iexml-reader + + jcore-ign-reader + + jcore-iob-consumer + + jcore-jnet-ae + + jcore-jpos-ae + + jcore-jsbd-ae + + jcore-jtbd-ae + + jcore-julielab-entity-evaluator-consumer + + jcore-likelihood-assignment-ae + + jcore-likelihood-detection-ae + + jcore-lingpipegazetteer-ae + + jcore-lingpipe-porterstemmer-ae + + jcore-lingscope-ae + + 
jcore-linnaeus-species-ae + + jcore-mantra-xml-types + + jcore-medxn-ae + + jcore-msdoc-reader + + jcore-mstparser-ae + + jcore-muc7-reader + + jcore-mutationfinder-ae + + jcore-opennlp-chunk-ae + + jcore-opennlp-parser-ae + + jcore-opennlp-postag-ae + + jcore-opennlp-sentence-ae + + jcore-opennlp-token-ae + + jcore-pmc-reader + + jcore-pubtator-reader + + jcore-stanford-lemmatizer-ae + + jcore-topic-indexing-ae + + jcore-topics-writer + + jcore-txt-consumer + + jcore-types + + jcore-utilities + + jcore-xml-mapper + + jcore-xml-reader + + jcore-xmi-reader + + jcore-xmi-writer + + jedis-parent + + jcore-db-checkpoint-ae + + jcore-ppd-writer + + jcore-bc2gmformat-writer + + jcore-bc2gm-reader + + jcore-annotation-adder-ae + + jcore-flair-token-embedding-ae + + jcore-line-multiplier + + jcore-cord19-reader + + jcore-neo4j-relations-consumer + + + + + + scm:git:https://github.com/JULIELab/jcore-base - scm:git:https://github.com/JULIELab/jcore-base - scm:git:https://github.com/JULIELab/jcore-base - + + scm:git:https://github.com/JULIELab/jcore-base + + scm:git:https://github.com/JULIELab/jcore-base + + + From 35789670af0f0fc8dd1ee6ead7831826cd780523 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 8 Jun 2020 17:09:26 +0200 Subject: [PATCH 003/269] Letting travis run for the 2.6 branch. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 208b0219a..57daeceac 100644 --- a/.travis.yml +++ b/.travis.yml @@ -51,4 +51,4 @@ deploy: skip_cleanup: true on: all_branches: true - condition: $TRAVIS_BRANCH =~ ^v2.5|master$ \ No newline at end of file + condition: $TRAVIS_BRANCH =~ ^v2.6|master$ \ No newline at end of file From 9f047ab0fa5489f64d2e36e77d2d1fe9d4e83418 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 9 Jun 2020 09:34:17 +0200 Subject: [PATCH 004/269] Version 2.6.0-SNAPSHOT. Neo4jRelationsConsumer unit tests working. 
--- jcore-ace-reader/component.meta | 2 +- jcore-ace-reader/pom.xml | 2 +- .../reader/ace/desc/jcore-ace-reader.xml | 2 +- jcore-acronym-ae/component.meta | 2 +- jcore-acronym-ae/pom.xml | 2 +- .../acronymtagger/desc/jcore-acronym-ae.xml | 2 +- .../desc/JulesToolsAEDescriptor.xml | 2 +- .../desc/jcore-acronymtagger-test.xml | 2 +- .../types/StemNetSemanticsTypeSystem.xml | 2 +- .../acronyms/desc/jcore-acronym-writer.xml | 2 +- jcore-annotation-adder-ae/component.meta | 2 +- jcore-annotation-adder-ae/pom.xml | 2 +- .../desc/jcore-annotation-adder-ae.xml | 2 +- jcore-banner-ae/component.meta | 2 +- jcore-banner-ae/pom.xml | 2 +- .../jcore/ae/banner/desc/jcore-banner-ae.xml | 2 +- .../src/main/resources/desc/BANNERAE.xml | 2 +- .../src/main/resources/desc/bannerTS.xml | 2 +- jcore-bc2gm-reader/component.meta | 2 +- jcore-bc2gm-reader/pom.xml | 2 +- .../reader/bc2gm/desc/jcore-bc2gm-reader.xml | 2 +- jcore-bc2gmformat-writer/component.meta | 2 +- jcore-bc2gmformat-writer/pom.xml | 2 +- .../desc/jcore-bc2gmformat-writer.xml | 2 +- jcore-biolemmatizer-ae/component.meta | 2 +- jcore-biolemmatizer-ae/pom.xml | 2 +- .../desc/jcore-biolemmatizer-ae.xml | 2 +- jcore-bionlpformat-consumer/component.meta | 2 +- jcore-bionlpformat-consumer/pom.xml | 2 +- ...pformat-consumer-biomedical-sharedtask.xml | 2 +- .../jcore-bionlpformat-consumer-medical.xml | 2 +- .../jcore-bionlpformat-consumer-segment.xml | 2 +- .../test/resources/types/jcore-all-types.xml | 2 +- .../types/jcore-semantics-biology-types.xml | 2 +- jcore-bionlpformat-reader/component.meta | 2 +- jcore-bionlpformat-reader/pom.xml | 2 +- ...nlpformat-reader-biomedical-sharedtask.xml | 2 +- .../jcore-bionlpformat-reader-medical.xml | 2 +- .../jcore-bionlpformat-reader-segment.xml | 2 +- .../bionlpformat/desc/EventReaderTest.xml | 2 +- jcore-biosem-ae/component.meta | 2 +- jcore-biosem-ae/pom.xml | 6 +- jcore-conll-consumer/component.meta | 2 +- jcore-conll-consumer/pom.xml | 2 +- .../conll/desc/jcore-conll-consumer.xml | 2 
+- jcore-coordination-baseline-ae/component.meta | 2 +- jcore-coordination-baseline-ae/pom.xml | 2 +- ...core-coordination-baseline-ae-conjunct.xml | 2 +- ...-coordination-baseline-ae-coordination.xml | 2 +- .../jcore-coordination-baseline-ae-eee.xml | 2 +- ...core-coordination-baseline-ae-ellipsis.xml | 2 +- .../resources/desc/ConjunctAnnotatorTest.xml | 2 +- .../desc/CoordinationAnnotatorTest.xml | 2 +- .../test/resources/desc/EEEAnnotatorTest.xml | 2 +- .../resources/desc/EllipsisAnnotatorTest.xml | 2 +- jcore-cord19-reader/component.meta | 2 +- jcore-cord19-reader/pom.xml | 2 +- .../desc/jcore-cord19-multiplier-reader.xml | 2 +- .../cord19/desc/jcore-cord19-multiplier.xml | 2 +- jcore-ct-reader/component.meta | 2 +- jcore-ct-reader/pom.xml | 2 +- .../ct/desc/jcore-clinicaltrials-reader.xml | 2 +- jcore-db-checkpoint-ae/component.meta | 2 +- jcore-db-checkpoint-ae/pom.xml | 2 +- .../desc/jcore-db-checkpoint-ae.xml | 2 +- .../desc/jcore-db-checkpoint-consumer.xml | 2 +- jcore-db-reader/component.meta | 2 +- jcore-db-reader/pom.xml | 4 +- .../db/desc/jcore-db-multiplier-reader.xml | 2 +- jcore-descriptor-creator/pom.xml | 2 +- .../de.julielab.jcore.ae.testae.TestAE.xml | 2 +- ...ore.consumer.testconsumer.Testconsumer.xml | 2 +- ...ltiplier.testmultiplier.TestMultiplier.xml | 2 +- ...lab.jcore.reader.testreader.TestReader.xml | 2 +- jcore-dta-reader/component.meta | 2 +- jcore-dta-reader/pom.xml | 2 +- .../reader/dta/desc/jcore-dta-reader.xml | 2 +- jcore-ec-code-ae/component.meta | 2 +- jcore-ec-code-ae/pom.xml | 2 +- jcore-elasticsearch-consumer/component.meta | 2 +- jcore-elasticsearch-consumer/pom.xml | 2 +- .../es/desc/jcore-elasticsearch-consumer.xml | 2 +- .../consumer/es/desc/jcore-json-writer.xml | 2 +- .../julielab/jcore/consumer/es/testTypes.xml | 2 +- jcore-embedding-writer/component.meta | 2 +- jcore-embedding-writer/pom.xml | 2 +- .../ew/desc/jcore-embedding-writer.xml | 2 +- jcore-event-flattener-ae/component.meta | 2 +- 
jcore-event-flattener-ae/pom.xml | 2 +- .../desc/jcore-event-flattener-ae.xml | 2 +- .../component.meta | 2 +- jcore-feature-value-replacement-ae/pom.xml | 2 +- .../jcore-feature-value-replacement-ae.xml | 2 +- jcore-file-reader/component.meta | 2 +- jcore-file-reader/pom.xml | 2 +- .../reader/file/desc/jcore-file-reader.xml | 2 +- jcore-flair-ner-ae/component.meta | 2 +- jcore-flair-ner-ae/pom.xml | 4 +- .../ae/flairner/desc/jcore-flair-ner-ae.xml | 2 +- jcore-flair-token-embedding-ae/component.meta | 2 +- jcore-flair-token-embedding-ae/pom.xml | 2 +- .../desc/jcore-flair-token-embedding-ae.xml | 2 +- jcore-iexml-consumer/component.meta | 2 +- jcore-iexml-consumer/pom.xml | 4 +- .../iexml/desc/jcore-iexml-consumer.xml | 2 +- jcore-iexml-reader/component.meta | 2 +- jcore-iexml-reader/pom.xml | 4 +- .../reader/iexml/desc/jcore-iexml-reader.xml | 2 +- jcore-ign-reader/component.meta | 2 +- jcore-ign-reader/pom.xml | 2 +- .../reader/ign/desc/jcore-ign-reader.xml | 2 +- jcore-iob-consumer/component.meta | 2 +- jcore-iob-consumer/pom.xml | 2 +- .../cas2iob/desc/jcore-iob-consumer.xml | 2 +- .../cas2iob/desc/ToIOBConsumerTest.xml | 2 +- .../consumer/cas2iob/types/TestTypeSystem.xml | 2 +- .../jcore/ae/jemas/desc/jcore-jemas-ae.xml | 2 +- jcore-jnet-ae/component.meta | 2 +- jcore-jnet-ae/pom.xml | 2 +- .../jcore/ae/jnet/desc/jcore-jnet-ae.xml | 2 +- .../ae/jnet/uima/EntityAnnotatorTest.xml | 2 +- .../jcore/ae/jnet/uima/tsDescriptor.xml | 2 +- jcore-jpos-ae/component.meta | 2 +- jcore-jpos-ae/pom.xml | 2 +- .../jcore/ae/jpos/desc/jcore-jpos.xml | 2 +- .../test/resources/POSTagAnnotatorTest.xml | 2 +- jcore-jsbd-ae/component.meta | 2 +- jcore-jsbd-ae/pom.xml | 2 +- .../jcore/ae/jsbd/desc/jcore-jsbd-ae.xml | 2 +- .../ae/jsbd/desc/SentenceAnnotatorTest.xml | 2 +- .../SentenceAnnotator_with-scope_Test.xml | 2 +- .../ae/jsbd/desc/paragraph-scope-type.xml | 2 +- jcore-jtbd-ae/component.meta | 2 +- jcore-jtbd-ae/pom.xml | 2 +- .../jcore/ae/jtbd/desc/jcore-jtbd.xml | 2 +- 
.../jcore/ae/jtbd/desc/TokenAnnotatorTest.xml | 2 +- .../component.meta | 2 +- .../pom.xml | 2 +- ...ore-julielab-entity-evaluator-consumer.xml | 2 +- jcore-likelihood-assignment-ae/component.meta | 2 +- jcore-likelihood-assignment-ae/pom.xml | 2 +- .../desc/jcore-likelihood-assignment-ae.xml | 2 +- jcore-likelihood-detection-ae/component.meta | 2 +- jcore-likelihood-detection-ae/pom.xml | 2 +- .../desc/jcore-likelihood-detection-ae.xml | 2 +- jcore-line-multiplier/component.meta | 2 +- jcore-line-multiplier/pom.xml | 2 +- .../line/desc/jcore-line-multiplier-ae.xml | 2 +- .../line/desc/jcore-line-multiplier-ae.xml | 2 +- .../component.meta | 2 +- jcore-lingpipe-porterstemmer-ae/pom.xml | 2 +- .../desc/jcore-lingpipe-porterstemmer-ae.xml | 2 +- jcore-lingpipegazetteer-ae/component.meta | 2 +- jcore-lingpipegazetteer-ae/pom.xml | 2 +- ...ipe-gazetteer-ae-configurable-resource.xml | 2 +- .../desc/jcore-lingpipe-gazetteer-ae.xml | 2 +- .../ApproxGazetteerAnnotatorTest.xml | 2 +- .../resources/ExactGazetteerAnnotatorTest.xml | 2 +- jcore-lingscope-ae/component.meta | 2 +- jcore-lingscope-ae/pom.xml | 2 +- .../ae/lingscope/desc/jcore-lingscope-ae.xml | 2 +- jcore-linnaeus-species-ae/component.meta | 2 +- jcore-linnaeus-species-ae/pom.xml | 2 +- .../ae/linnaeus/desc/jcore-linnaeus-ae.xml | 2 +- jcore-mantra-xml-types/pom.xml | 2 +- jcore-medxn-ae/component.meta | 2 +- jcore-medxn-ae/pom.xml | 2 +- .../jcore/ae/medxn/desc/MedNormAE.xml | 2 +- .../desc/jcore-medxn-ae-attributes-german.xml | 2 +- .../desc/jcore-medxn-ae-extractor-german.xml | 2 +- jcore-msdoc-reader/component.meta | 2 +- jcore-msdoc-reader/pom.xml | 2 +- .../reader/msdoc/desc/jcore-msdoc-reader.xml | 2 +- jcore-mstparser-ae/component.meta | 2 +- jcore-mstparser-ae/pom.xml | 2 +- .../ae/mstparser/desc/jcore-mstparser.xml | 2 +- .../desc/MSTParserDescriptorTest.xml | 2 +- jcore-muc7-reader/component.meta | 2 +- jcore-muc7-reader/pom.xml | 2 +- .../reader/muc7/desc/jcore-muc7-reader.xml | 2 +- 
.../reader/muc7/desc/jcore-muc7-reader.xml | 2 +- jcore-mutationfinder-ae/component.meta | 2 +- jcore-mutationfinder-ae/pom.xml | 2 +- .../desc/jcore-mutationfinder-ae.xml | 2 +- jcore-neo4j-relations-consumer/pom.xml | 18 ++- .../Neo4jRelationsConsumer.java | 15 ++- .../consumer/neo4jrelations/desc/PLACEHOLDER | 1 - .../desc/jcore-neo4j-relations-consumer.xml | 98 +++++++++++--- .../Neo4jRelationsConsumerTest.java | 126 +++++++++++++++++- jcore-opennlp-chunk-ae/component.meta | 2 +- jcore-opennlp-chunk-ae/pom.xml | 2 +- .../src/test/resources/ChunkAnnotatorTest.xml | 2 +- .../ChunkAnnotatorTestDefaultMappings.xml | 2 +- jcore-opennlp-parser-ae/component.meta | 2 +- jcore-opennlp-parser-ae/pom.xml | 2 +- .../desc/jcore-opennlpparser.xml | 2 +- .../desc/jcore-opennlpparser-test.xml | 2 +- jcore-opennlp-postag-ae/component.meta | 2 +- jcore-opennlp-postag-ae/pom.xml | 2 +- .../desc/jcore-opennlppostag.xml | 2 +- .../test/resources/PosTagAnnotatorTest.xml | 2 +- jcore-opennlp-sentence-ae/component.meta | 2 +- jcore-opennlp-sentence-ae/pom.xml | 2 +- .../test/resources/SentenceAnnotatorTest.xml | 2 +- jcore-opennlp-token-ae/component.meta | 2 +- .../desc/TokenAnnotator.xml | 2 +- jcore-opennlp-token-ae/pom.xml | 2 +- .../src/test/resources/TokenAnnotatorTest.xml | 2 +- jcore-pmc-reader/component.meta | 2 +- jcore-pmc-reader/pom.xml | 2 +- .../pmc/desc/jcore-pmc-multiplier.xml | 2 +- .../pmc/desc/jcore-pmc-multiplier-reader.xml | 2 +- .../reader/pmc/desc/jcore-pmc-reader.xml | 2 +- jcore-ppd-writer/component.meta | 2 +- jcore-ppd-writer/pom.xml | 2 +- .../consumer/ppd/desc/jcore-ppd-writer.xml | 2 +- jcore-pubtator-reader/component.meta | 2 +- jcore-pubtator-reader/pom.xml | 2 +- .../pubtator/desc/jcore-pubtator-reader.xml | 2 +- jcore-stanford-lemmatizer-ae/component.meta | 2 +- jcore-stanford-lemmatizer-ae/pom.xml | 2 +- .../lemma/desc/jcore-stanford-lemmatizer.xml | 2 +- .../desc/jcore-stanford-lemmatizer-ae.xml | 2 +- jcore-topic-indexing-ae/component.meta | 2 +- 
jcore-topic-indexing-ae/pom.xml | 4 +- .../desc/jcore-topic-indexing-ae.xml | 2 +- jcore-topics-writer/component.meta | 2 +- jcore-topics-writer/pom.xml | 2 +- .../topics/desc/jcore-topics-writer.xml | 2 +- jcore-txt-consumer/component.meta | 2 +- jcore-txt-consumer/pom.xml | 2 +- .../consumer/txt/desc/jcore-txt-consumer.xml | 2 +- jcore-types/pom.xml | 2 +- .../jcore-dbtable-multiplier-types.xml | 2 +- .../jcore-uri-multiplier-types.xml | 2 +- .../types/extensions/jcore-ace-types.xml | 2 +- .../jcore-document-meta-extension-types.xml | 2 +- .../types/extensions/jcore-dta-types.xml | 2 +- .../extensions/jcore-evaluation-types.xml | 2 +- .../types/extensions/jcore-mantra-types.xml | 2 +- .../types/extensions/jcore-medical-types.xml | 2 +- .../types/extensions/jcore-mmax-types.xml | 2 +- .../types/extensions/jcore-muc7-types.xml | 2 +- .../extensions/jcore-semantics-ace-types.xml | 2 +- .../jcore-semantics-bootstrep-types.xml | 2 +- ...core-semantics-mention-extension-types.xml | 2 +- .../jcore-semantics-stemnet-types.xml | 2 +- .../extensions/jcore-wikipedia-types.xml | 2 +- .../jcore/types/jcore-affect-types.xml | 2 +- .../julielab/jcore/types/jcore-all-types.xml | 2 +- .../jcore/types/jcore-basic-types.xml | 2 +- .../jcore/types/jcore-discourse-types.xml | 2 +- ...core-document-meta-clinicaltrial-types.xml | 2 +- .../jcore-document-meta-pubmed-types.xml | 2 +- .../jcore/types/jcore-document-meta-types.xml | 2 +- ...document-structure-clinicaltrial-types.xml | 2 +- .../jcore-document-structure-pubmed-types.xml | 2 +- .../types/jcore-document-structure-types.xml | 2 +- .../jcore/types/jcore-morpho-syntax-types.xml | 2 +- .../types/jcore-semantics-biology-types.xml | 2 +- .../types/jcore-semantics-concept-types.xml | 2 +- .../types/jcore-semantics-mention-types.xml | 2 +- .../priorities/jcore-type-priorities.xml | 2 +- jcore-utilities/pom.xml | 2 +- .../src/test/resources/AETestDescriptor.xml | 2 +- jcore-xmi-db-reader/component.meta | 2 +- 
jcore-xmi-db-reader/pom.xml | 8 +- .../desc/jcore-xmi-db-multiplier-reader.xml | 2 +- .../xmi/desc/jcore-xmi-db-multiplier.xml | 2 +- .../reader/xmi/desc/jcore-xmi-db-reader.xml | 2 +- jcore-xmi-db-writer/component.meta | 2 +- jcore-xmi-db-writer/pom.xml | 4 +- .../consumer/xmi/desc/jcore-xmi-db-writer.xml | 2 +- jcore-xmi-reader/component.meta | 2 +- jcore-xmi-reader/pom.xml | 2 +- .../reader/xmi/desc/jcore-xmi-reader.xml | 2 +- jcore-xmi-writer/component.meta | 2 +- jcore-xmi-writer/pom.xml | 2 +- .../consumer/xmi/desc/jcore-xmi-writer.xml | 2 +- .../jcore/consumer/xmi/CasToXmiConsumer.xml | 2 +- jcore-xml-db-reader/component.meta | 2 +- jcore-xml-db-reader/pom.xml | 6 +- .../reader/xml/desc/jcore-xml-db-reader.xml | 2 +- jcore-xml-mapper/pom.xml | 2 +- .../test/resources/XMLReaderDescriptor.xml | 2 +- ...Descriptor_medline_Unicode_outside_BMP.xml | 2 +- ...aderDescriptor_medline_missingInputDir.xml | 2 +- ...XMLReaderDescriptor_medline_singleFile.xml | 2 +- ...MLReaderDescriptor_medline_singleFile2.xml | 2 +- jcore-xml-reader/component.meta | 2 +- jcore-xml-reader/pom.xml | 4 +- .../reader/xml/desc/XMLMultiplierReader.xml | 2 +- ...edlineReaderDescriptor_missingInputDir.xml | 2 +- .../test/resources/PubmedXMLMultiplier.xml | 2 +- .../test/resources/XMLMultiplierReader.xml | 2 +- jedis-parent/pom.xml | 2 +- pom.xml | 2 +- 297 files changed, 531 insertions(+), 339 deletions(-) delete mode 100644 jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/PLACEHOLDER diff --git a/jcore-ace-reader/component.meta b/jcore-ace-reader/component.meta index 65d83f33b..0ed4db39b 100644 --- a/jcore-ace-reader/component.meta +++ b/jcore-ace-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-ace-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe ACE Reader" } diff --git a/jcore-ace-reader/pom.xml b/jcore-ace-reader/pom.xml index fad4ca485..fdf961ad1 
100644 --- a/jcore-ace-reader/pom.xml +++ b/jcore-ace-reader/pom.xml @@ -13,7 +13,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-ace-reader/src/main/resources/de/julielab/jcore/reader/ace/desc/jcore-ace-reader.xml b/jcore-ace-reader/src/main/resources/de/julielab/jcore/reader/ace/desc/jcore-ace-reader.xml index 6d7d29ff9..a1eae5b5b 100644 --- a/jcore-ace-reader/src/main/resources/de/julielab/jcore/reader/ace/desc/jcore-ace-reader.xml +++ b/jcore-ace-reader/src/main/resources/de/julielab/jcore/reader/ace/desc/jcore-ace-reader.xml @@ -5,7 +5,7 @@ AceReader Descriptor automatically generated by uimaFIT - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-acronym-ae/component.meta b/jcore-acronym-ae/component.meta index 4ccd014c0..5e9a4da4c 100644 --- a/jcore-acronym-ae/component.meta +++ b/jcore-acronym-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-acronym-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Acronym Tagger" } diff --git a/jcore-acronym-ae/pom.xml b/jcore-acronym-ae/pom.xml index df40261b4..dfd4fce45 100644 --- a/jcore-acronym-ae/pom.xml +++ b/jcore-acronym-ae/pom.xml @@ -14,7 +14,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-acronym-ae/src/main/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronym-ae.xml b/jcore-acronym-ae/src/main/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronym-ae.xml index f31cada2f..2ca072f45 100755 --- a/jcore-acronym-ae/src/main/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronym-ae.xml +++ b/jcore-acronym-ae/src/main/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronym-ae.xml @@ -6,7 +6,7 @@ JCoRe AcronymAnnotator - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/JulesToolsAEDescriptor.xml 
b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/JulesToolsAEDescriptor.xml index 9aa0a7e09..1e2c24294 100644 --- a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/JulesToolsAEDescriptor.xml +++ b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/JulesToolsAEDescriptor.xml @@ -6,7 +6,7 @@ JulesToolsDescriptor - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronymtagger-test.xml b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronymtagger-test.xml index 8e179d4c3..60c613aaf 100755 --- a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronymtagger-test.xml +++ b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronymtagger-test.xml @@ -6,7 +6,7 @@ JCoRe AcronymAnnotator - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/types/StemNetSemanticsTypeSystem.xml b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/types/StemNetSemanticsTypeSystem.xml index fd197d12f..5b37032f1 100644 --- a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/types/StemNetSemanticsTypeSystem.xml +++ b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/types/StemNetSemanticsTypeSystem.xml @@ -2,7 +2,7 @@ StemNetSemanticsTypeSystem -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT http://www.julielab.de diff --git a/jcore-acronym-writer/src/main/resources/de/julielab/jcore/consumer/acronyms/desc/jcore-acronym-writer.xml b/jcore-acronym-writer/src/main/resources/de/julielab/jcore/consumer/acronyms/desc/jcore-acronym-writer.xml index 5f3073b02..6659cbf31 100644 --- a/jcore-acronym-writer/src/main/resources/de/julielab/jcore/consumer/acronyms/desc/jcore-acronym-writer.xml +++ 
b/jcore-acronym-writer/src/main/resources/de/julielab/jcore/consumer/acronyms/desc/jcore-acronym-writer.xml @@ -6,7 +6,7 @@ JCoRe Acronym Writer Writes acronym annotation to a text file. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT OutputFile diff --git a/jcore-annotation-adder-ae/component.meta b/jcore-annotation-adder-ae/component.meta index 500127938..3978e1017 100644 --- a/jcore-annotation-adder-ae/component.meta +++ b/jcore-annotation-adder-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-annotation-adder-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Annotation Adder" } diff --git a/jcore-annotation-adder-ae/pom.xml b/jcore-annotation-adder-ae/pom.xml index 1473a562b..a8f6ce3bd 100644 --- a/jcore-annotation-adder-ae/pom.xml +++ b/jcore-annotation-adder-ae/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml b/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml index fcd2c1d27..2a72b89f9 100644 --- a/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml +++ b/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml @@ -6,7 +6,7 @@ JCoRe Annotation Adder This component helps to import annotations made on the exact CAS document text by an external process back into the CAS. To this end, the component is prepared to read several data formats. Currently, simple offset-based annotations are supported with configurable UIMA types. The component supports character and token based offsets. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT OffsetMode diff --git a/jcore-banner-ae/component.meta b/jcore-banner-ae/component.meta index 8785baa0c..2a01d6ff1 100644 --- a/jcore-banner-ae/component.meta +++ b/jcore-banner-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-banner-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Banner" } diff --git a/jcore-banner-ae/pom.xml b/jcore-banner-ae/pom.xml index 139a33c03..9e47d8857 100644 --- a/jcore-banner-ae/pom.xml +++ b/jcore-banner-ae/pom.xml @@ -66,7 +66,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT .. diff --git a/jcore-banner-ae/src/main/resources/de/julielab/jcore/ae/banner/desc/jcore-banner-ae.xml b/jcore-banner-ae/src/main/resources/de/julielab/jcore/ae/banner/desc/jcore-banner-ae.xml index 844073c9e..b98b5f42f 100644 --- a/jcore-banner-ae/src/main/resources/de/julielab/jcore/ae/banner/desc/jcore-banner-ae.xml +++ b/jcore-banner-ae/src/main/resources/de/julielab/jcore/ae/banner/desc/jcore-banner-ae.xml @@ -5,7 +5,7 @@ jcore-banner-ae - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-banner-ae/src/main/resources/desc/BANNERAE.xml b/jcore-banner-ae/src/main/resources/desc/BANNERAE.xml index 28c2a1499..05b35368f 100644 --- a/jcore-banner-ae/src/main/resources/desc/BANNERAE.xml +++ b/jcore-banner-ae/src/main/resources/desc/BANNERAE.xml @@ -6,7 +6,7 @@ BANNERAE - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-banner-ae/src/main/resources/desc/bannerTS.xml b/jcore-banner-ae/src/main/resources/desc/bannerTS.xml index d25adc102..70aaf0715 100644 --- a/jcore-banner-ae/src/main/resources/desc/bannerTS.xml +++ b/jcore-banner-ae/src/main/resources/desc/bannerTS.xml @@ -2,7 +2,7 @@ bannerTS basic typesystem started by sid - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-bc2gm-reader/component.meta b/jcore-bc2gm-reader/component.meta index 748123c36..3b60c95ed 100644 --- a/jcore-bc2gm-reader/component.meta +++ 
b/jcore-bc2gm-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-bc2gm-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe BioCreative II Gene Mention Reader" } diff --git a/jcore-bc2gm-reader/pom.xml b/jcore-bc2gm-reader/pom.xml index 1ec0602a9..f8579d215 100644 --- a/jcore-bc2gm-reader/pom.xml +++ b/jcore-bc2gm-reader/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-bc2gm-reader/src/main/resources/de/julielab/jcore/reader/bc2gm/desc/jcore-bc2gm-reader.xml b/jcore-bc2gm-reader/src/main/resources/de/julielab/jcore/reader/bc2gm/desc/jcore-bc2gm-reader.xml index 04e62abd2..b3b40d26c 100644 --- a/jcore-bc2gm-reader/src/main/resources/de/julielab/jcore/reader/bc2gm/desc/jcore-bc2gm-reader.xml +++ b/jcore-bc2gm-reader/src/main/resources/de/julielab/jcore/reader/bc2gm/desc/jcore-bc2gm-reader.xml @@ -5,7 +5,7 @@ JCoRe BioCreative II Gene Mention reader This component reads gene annotated sentences in the BioCreative II Gene Mention challenge format. Each CAS will contain one annotated sentence. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT SentencesFile diff --git a/jcore-bc2gmformat-writer/component.meta b/jcore-bc2gmformat-writer/component.meta index 384a54b21..2b7c90e41 100644 --- a/jcore-bc2gmformat-writer/component.meta +++ b/jcore-bc2gmformat-writer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-bc2gmformat-writer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe BioCreative II Gene Mention Format Writer" } diff --git a/jcore-bc2gmformat-writer/pom.xml b/jcore-bc2gmformat-writer/pom.xml index c68e9f170..8092a37ee 100644 --- a/jcore-bc2gmformat-writer/pom.xml +++ b/jcore-bc2gmformat-writer/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-bc2gmformat-writer/src/main/resources/de/julielab/jcore/consumer/bc2gmformat/desc/jcore-bc2gmformat-writer.xml b/jcore-bc2gmformat-writer/src/main/resources/de/julielab/jcore/consumer/bc2gmformat/desc/jcore-bc2gmformat-writer.xml index 2e122f8b6..811375d76 100644 --- a/jcore-bc2gmformat-writer/src/main/resources/de/julielab/jcore/consumer/bc2gmformat/desc/jcore-bc2gmformat-writer.xml +++ b/jcore-bc2gmformat-writer/src/main/resources/de/julielab/jcore/consumer/bc2gmformat/desc/jcore-bc2gmformat-writer.xml @@ -6,7 +6,7 @@ JCoRe BioCreative II Gene Mention Format writer This component writes gene annotations in the CAS to the format employed by the BioCreative II Gene Mention challenge. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT OutputDirectory diff --git a/jcore-biolemmatizer-ae/component.meta b/jcore-biolemmatizer-ae/component.meta index 66fd947c5..2b698fcb5 100644 --- a/jcore-biolemmatizer-ae/component.meta +++ b/jcore-biolemmatizer-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-biolemmatizer-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe BioLemmatizer" } diff --git a/jcore-biolemmatizer-ae/pom.xml b/jcore-biolemmatizer-ae/pom.xml index bf56276d0..241617304 100644 --- a/jcore-biolemmatizer-ae/pom.xml +++ b/jcore-biolemmatizer-ae/pom.xml @@ -8,7 +8,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-biolemmatizer-ae/src/main/resources/de/julielab/jcore/ae/biolemmatizer/desc/jcore-biolemmatizer-ae.xml b/jcore-biolemmatizer-ae/src/main/resources/de/julielab/jcore/ae/biolemmatizer/desc/jcore-biolemmatizer-ae.xml index 27b446003..137eb219c 100644 --- a/jcore-biolemmatizer-ae/src/main/resources/de/julielab/jcore/ae/biolemmatizer/desc/jcore-biolemmatizer-ae.xml +++ b/jcore-biolemmatizer-ae/src/main/resources/de/julielab/jcore/ae/biolemmatizer/desc/jcore-biolemmatizer-ae.xml @@ -6,7 +6,7 @@ BioLemmatizer - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-bionlpformat-consumer/component.meta b/jcore-bionlpformat-consumer/component.meta index e4c0dedc0..4071c4a18 100644 --- a/jcore-bionlpformat-consumer/component.meta +++ b/jcore-bionlpformat-consumer/component.meta @@ -22,7 +22,7 @@ "maven-artifact": { "artifactId": "jcore-bionlpformat-consumer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe BioNLP Format Consumer" } diff --git a/jcore-bionlpformat-consumer/pom.xml b/jcore-bionlpformat-consumer/pom.xml index bf58e21a4..d868129aa 100644 --- a/jcore-bionlpformat-consumer/pom.xml +++ b/jcore-bionlpformat-consumer/pom.xml @@ -6,7 +6,7 @@ jcore-base de.julielab - 
2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-biomedical-sharedtask.xml b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-biomedical-sharedtask.xml index 45463be92..3d358227d 100644 --- a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-biomedical-sharedtask.xml +++ b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-biomedical-sharedtask.xml @@ -5,7 +5,7 @@ JCoRe BioNLP Event Consumer - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-medical.xml b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-medical.xml index 5ebfec59f..547769316 100644 --- a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-medical.xml +++ b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-medical.xml @@ -5,7 +5,7 @@ JCoRe BioNLP Format Event Consumer (Medical) - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-segment.xml b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-segment.xml index dc654b37b..be36250a4 100644 --- a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-segment.xml +++ 
b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-segment.xml @@ -7,7 +7,7 @@ JCoRe BioNLP Format Segment Consumer - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-bionlpformat-consumer/src/test/resources/types/jcore-all-types.xml b/jcore-bionlpformat-consumer/src/test/resources/types/jcore-all-types.xml index 670239d8d..7c320da41 100644 --- a/jcore-bionlpformat-consumer/src/test/resources/types/jcore-all-types.xml +++ b/jcore-bionlpformat-consumer/src/test/resources/types/jcore-all-types.xml @@ -2,7 +2,7 @@ JCoRe All Types This is just a convenience file, assembling all JCoRe types - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-bionlpformat-consumer/src/test/resources/types/jcore-semantics-biology-types.xml b/jcore-bionlpformat-consumer/src/test/resources/types/jcore-semantics-biology-types.xml index 0f6fca3ac..c01c57fe9 100644 --- a/jcore-bionlpformat-consumer/src/test/resources/types/jcore-semantics-biology-types.xml +++ b/jcore-bionlpformat-consumer/src/test/resources/types/jcore-semantics-biology-types.xml @@ -2,7 +2,7 @@ JCoRe Semantics Biology Types The type system contains types of the biomedical domain. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-bionlpformat-reader/component.meta b/jcore-bionlpformat-reader/component.meta index 6f10e9e95..229346ad7 100644 --- a/jcore-bionlpformat-reader/component.meta +++ b/jcore-bionlpformat-reader/component.meta @@ -22,7 +22,7 @@ "maven-artifact": { "artifactId": "jcore-bionlpformat-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe BioNLP Format Reader" } diff --git a/jcore-bionlpformat-reader/pom.xml b/jcore-bionlpformat-reader/pom.xml index 862c09d97..65fcefb66 100644 --- a/jcore-bionlpformat-reader/pom.xml +++ b/jcore-bionlpformat-reader/pom.xml @@ -6,7 +6,7 @@ jcore-base de.julielab - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-biomedical-sharedtask.xml b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-biomedical-sharedtask.xml index ccd6c46f6..0ba9c91cf 100644 --- a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-biomedical-sharedtask.xml +++ b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-biomedical-sharedtask.xml @@ -5,7 +5,7 @@ JCoRe BioNLP Event Reader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-medical.xml b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-medical.xml index 74cdb9e62..810dfac8c 100644 --- a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-medical.xml +++ 
b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-medical.xml @@ -5,7 +5,7 @@ BioNLP Format Reader Medical - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-segment.xml b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-segment.xml index aea0bc469..1f4944403 100644 --- a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-segment.xml +++ b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-segment.xml @@ -5,7 +5,7 @@ BioNLP Format Reader Segment - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-bionlpformat-reader/src/test/resources/de/julielab/jcore/reader/bionlpformat/desc/EventReaderTest.xml b/jcore-bionlpformat-reader/src/test/resources/de/julielab/jcore/reader/bionlpformat/desc/EventReaderTest.xml index 38ed5aed3..3813fdc7d 100644 --- a/jcore-bionlpformat-reader/src/test/resources/de/julielab/jcore/reader/bionlpformat/desc/EventReaderTest.xml +++ b/jcore-bionlpformat-reader/src/test/resources/de/julielab/jcore/reader/bionlpformat/desc/EventReaderTest.xml @@ -5,7 +5,7 @@ EventReader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-biosem-ae/component.meta b/jcore-biosem-ae/component.meta index dd5fcf39d..efff383f6 100644 --- a/jcore-biosem-ae/component.meta +++ b/jcore-biosem-ae/component.meta @@ -9,7 +9,7 @@ "maven-artifact": { "artifactId": "jcore-biosem-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe BioSem Event Annotator" } diff --git a/jcore-biosem-ae/pom.xml b/jcore-biosem-ae/pom.xml index ece3b845a..eec6bc55f 100644 --- a/jcore-biosem-ae/pom.xml +++ b/jcore-biosem-ae/pom.xml @@ 
-5,7 +5,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT jcore-biosem-ae JCoRe BioSem Event Annotator @@ -32,7 +32,7 @@ de.julielab jcore-bionlpformat-reader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT test @@ -48,7 +48,7 @@ de.julielab jcore-bionlpformat-consumer - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT test diff --git a/jcore-conll-consumer/component.meta b/jcore-conll-consumer/component.meta index e754ff444..87ff59f38 100644 --- a/jcore-conll-consumer/component.meta +++ b/jcore-conll-consumer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-conll-consumer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe CONLL Consumer" } diff --git a/jcore-conll-consumer/pom.xml b/jcore-conll-consumer/pom.xml index fef60e5bf..4ba6ef20c 100644 --- a/jcore-conll-consumer/pom.xml +++ b/jcore-conll-consumer/pom.xml @@ -4,7 +4,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT jcore-conll-consumer diff --git a/jcore-conll-consumer/src/main/resources/de/julielab/jcore/consumer/conll/desc/jcore-conll-consumer.xml b/jcore-conll-consumer/src/main/resources/de/julielab/jcore/consumer/conll/desc/jcore-conll-consumer.xml index 30f0366eb..854c345d4 100644 --- a/jcore-conll-consumer/src/main/resources/de/julielab/jcore/consumer/conll/desc/jcore-conll-consumer.xml +++ b/jcore-conll-consumer/src/main/resources/de/julielab/jcore/consumer/conll/desc/jcore-conll-consumer.xml @@ -6,7 +6,7 @@ JCoRe Conll Consumer - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-coordination-baseline-ae/component.meta b/jcore-coordination-baseline-ae/component.meta index c79a816e4..361310479 100644 --- a/jcore-coordination-baseline-ae/component.meta +++ b/jcore-coordination-baseline-ae/component.meta @@ -26,7 +26,7 @@ "maven-artifact": { "artifactId": "jcore-coordination-baseline-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Coordination Tagger Baseline" } diff 
--git a/jcore-coordination-baseline-ae/pom.xml b/jcore-coordination-baseline-ae/pom.xml index eaff316fa..ea88c0b43 100644 --- a/jcore-coordination-baseline-ae/pom.xml +++ b/jcore-coordination-baseline-ae/pom.xml @@ -13,7 +13,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-conjunct.xml b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-conjunct.xml index 1e5a6c860..40bb374a8 100644 --- a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-conjunct.xml +++ b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-conjunct.xml @@ -6,7 +6,7 @@ JCoRe ConjunctAnnotator -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-coordination.xml b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-coordination.xml index b5db7b69b..55b4377d0 100644 --- a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-coordination.xml +++ b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-coordination.xml @@ -6,7 +6,7 @@ JCoRe CoordinationAnnotator -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-eee.xml b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-eee.xml index 50c01690b..434bfd967 100644 
--- a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-eee.xml +++ b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-eee.xml @@ -6,7 +6,7 @@ JCoRe EEEAnnotator -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-ellipsis.xml b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-ellipsis.xml index 8e73905d3..a508d4ab7 100644 --- a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-ellipsis.xml +++ b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-ellipsis.xml @@ -6,7 +6,7 @@ JCoRe EllipsisAnnotator -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-coordination-baseline-ae/src/test/resources/desc/ConjunctAnnotatorTest.xml b/jcore-coordination-baseline-ae/src/test/resources/desc/ConjunctAnnotatorTest.xml index 50c97ebbc..be03ff4bb 100644 --- a/jcore-coordination-baseline-ae/src/test/resources/desc/ConjunctAnnotatorTest.xml +++ b/jcore-coordination-baseline-ae/src/test/resources/desc/ConjunctAnnotatorTest.xml @@ -6,7 +6,7 @@ ConjunctAnnotator -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT diff --git a/jcore-coordination-baseline-ae/src/test/resources/desc/CoordinationAnnotatorTest.xml b/jcore-coordination-baseline-ae/src/test/resources/desc/CoordinationAnnotatorTest.xml index ca9a48170..a256a83b6 100644 --- a/jcore-coordination-baseline-ae/src/test/resources/desc/CoordinationAnnotatorTest.xml +++ b/jcore-coordination-baseline-ae/src/test/resources/desc/CoordinationAnnotatorTest.xml @@ -6,7 +6,7 @@ CoordinationAnnotator -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT diff --git 
a/jcore-coordination-baseline-ae/src/test/resources/desc/EEEAnnotatorTest.xml b/jcore-coordination-baseline-ae/src/test/resources/desc/EEEAnnotatorTest.xml index 3683f5210..4b470443d 100644 --- a/jcore-coordination-baseline-ae/src/test/resources/desc/EEEAnnotatorTest.xml +++ b/jcore-coordination-baseline-ae/src/test/resources/desc/EEEAnnotatorTest.xml @@ -6,7 +6,7 @@ EEEAnnotator -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT diff --git a/jcore-coordination-baseline-ae/src/test/resources/desc/EllipsisAnnotatorTest.xml b/jcore-coordination-baseline-ae/src/test/resources/desc/EllipsisAnnotatorTest.xml index beea12e3e..422a96e06 100644 --- a/jcore-coordination-baseline-ae/src/test/resources/desc/EllipsisAnnotatorTest.xml +++ b/jcore-coordination-baseline-ae/src/test/resources/desc/EllipsisAnnotatorTest.xml @@ -6,7 +6,7 @@ EllipsisAnnotator -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT diff --git a/jcore-cord19-reader/component.meta b/jcore-cord19-reader/component.meta index 3fd15f733..66bd41580 100644 --- a/jcore-cord19-reader/component.meta +++ b/jcore-cord19-reader/component.meta @@ -19,7 +19,7 @@ "maven-artifact": { "artifactId": "jcore-cord19-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe CORD-19 Reader" } diff --git a/jcore-cord19-reader/pom.xml b/jcore-cord19-reader/pom.xml index a1cdf1d9a..b77f93e91 100644 --- a/jcore-cord19-reader/pom.xml +++ b/jcore-cord19-reader/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier-reader.xml b/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier-reader.xml index 90f5da426..fc54b7b2e 100644 --- a/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier-reader.xml +++ 
b/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier-reader.xml @@ -5,7 +5,7 @@ JCoRe CORD-19 Multiplier Reader This component reads file paths to JSON files and the CORD-19 (https://pages.semanticscholar.org/coronavirus-research) meta data file to send them to CAS multipliers. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier.xml b/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier.xml index b539b1511..812eeb5c6 100644 --- a/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier.xml +++ b/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier.xml @@ -6,7 +6,7 @@ JCoRe CORD-19 CAS Multiplier This component reads the CORD-19 (https://pages.semanticscholar.org/coronavirus-research) JSON format into UIMA CAS instances. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-ct-reader/component.meta b/jcore-ct-reader/component.meta index a131ea835..309b82f92 100644 --- a/jcore-ct-reader/component.meta +++ b/jcore-ct-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-ct-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Clinical Trials Reader" } diff --git a/jcore-ct-reader/pom.xml b/jcore-ct-reader/pom.xml index bfc239518..ac50c8cdb 100644 --- a/jcore-ct-reader/pom.xml +++ b/jcore-ct-reader/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-ct-reader/src/main/resources/de/julielab/jcore/reader/ct/desc/jcore-clinicaltrials-reader.xml b/jcore-ct-reader/src/main/resources/de/julielab/jcore/reader/ct/desc/jcore-clinicaltrials-reader.xml index 100df0acd..33e4a0f03 100644 --- a/jcore-ct-reader/src/main/resources/de/julielab/jcore/reader/ct/desc/jcore-clinicaltrials-reader.xml +++ b/jcore-ct-reader/src/main/resources/de/julielab/jcore/reader/ct/desc/jcore-clinicaltrials-reader.xml @@ -5,7 +5,7 @@ JCoRe Clinical Trials Reader This component reads the XML format provided by ClinicalTrials.gov. To this end, the JCoRe type system contains a number of types specifically created for this kind of document. Note that the CAS text created by this reader might be confusing without checking the corresponding annotations. This is due to the fact that the CT XML contains multiple enumerations which are not very well reflected in plain text. Also, enumerations with subitems, such as the outcomes, are not displayed in the expected groups of items. Instead, each item type is displayed separately. This could be changed, if necessary. Since all items are correctly annotated by their category, this might not even be an issue, depending on the downstream tasks. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT InputDirectory diff --git a/jcore-db-checkpoint-ae/component.meta b/jcore-db-checkpoint-ae/component.meta index b703ae5c4..958bc8f17 100644 --- a/jcore-db-checkpoint-ae/component.meta +++ b/jcore-db-checkpoint-ae/component.meta @@ -19,7 +19,7 @@ "maven-artifact": { "artifactId": "jcore-db-checkpoint-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Database Checkpoint AE" } diff --git a/jcore-db-checkpoint-ae/pom.xml b/jcore-db-checkpoint-ae/pom.xml index 3cac45687..f7ed71533 100644 --- a/jcore-db-checkpoint-ae/pom.xml +++ b/jcore-db-checkpoint-ae/pom.xml @@ -9,7 +9,7 @@ de.julielab jedis-parent - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT ../jedis-parent diff --git a/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-ae.xml b/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-ae.xml index 31e3605e8..8264367e1 100644 --- a/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-ae.xml +++ b/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-ae.xml @@ -6,7 +6,7 @@ JCoRe Database Checkpoint AE This component can be used when using a JCoRe database reader that reads from a CoStoSys/JeDIS subset. Enters the configured component name in the 'last component' column. Can also mark documents as being completely processed. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT CheckpointName diff --git a/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-consumer.xml b/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-consumer.xml index 5ac25514c..59b0bf054 100644 --- a/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-consumer.xml +++ b/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-consumer.xml @@ -6,7 +6,7 @@ JCoRe Database Checkpoint Writer This component can be used when using a JCoRe database reader that reads from a CoStoSys/JeDIS subset. Enters the configured component name in the 'last component' column. Can also mark documents as being completely processed. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT CheckpointName diff --git a/jcore-db-reader/component.meta b/jcore-db-reader/component.meta index a6793b944..78b3ba1ad 100644 --- a/jcore-db-reader/component.meta +++ b/jcore-db-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-db-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Database Reader" } diff --git a/jcore-db-reader/pom.xml b/jcore-db-reader/pom.xml index 2129cc7e0..bf3b215b9 100644 --- a/jcore-db-reader/pom.xml +++ b/jcore-db-reader/pom.xml @@ -3,7 +3,7 @@ jedis-parent de.julielab - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT ../jedis-parent 4.0.0 @@ -44,7 +44,7 @@ de.julielab jcore-xml-mapper - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT test diff --git a/jcore-db-reader/src/main/resources/de/julielab/jcore/reader/db/desc/jcore-db-multiplier-reader.xml b/jcore-db-reader/src/main/resources/de/julielab/jcore/reader/db/desc/jcore-db-multiplier-reader.xml index 489b2b92a..9637ab27d 100644 --- a/jcore-db-reader/src/main/resources/de/julielab/jcore/reader/db/desc/jcore-db-multiplier-reader.xml +++ 
b/jcore-db-reader/src/main/resources/de/julielab/jcore/reader/db/desc/jcore-db-multiplier-reader.xml @@ -10,7 +10,7 @@ sent by this reader. The component leverages the corpus storage system (CoStoSys) for this purpose and is part of the Jena Document Information System, JeDIS. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT ResetTable diff --git a/jcore-descriptor-creator/pom.xml b/jcore-descriptor-creator/pom.xml index 0336524bf..aae843561 100644 --- a/jcore-descriptor-creator/pom.xml +++ b/jcore-descriptor-creator/pom.xml @@ -4,7 +4,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT jcore-descriptor-creator diff --git a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/ae/testae/desc/de.julielab.jcore.ae.testae.TestAE.xml b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/ae/testae/desc/de.julielab.jcore.ae.testae.TestAE.xml index 34208ad32..558a62b57 100644 --- a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/ae/testae/desc/de.julielab.jcore.ae.testae.TestAE.xml +++ b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/ae/testae/desc/de.julielab.jcore.ae.testae.TestAE.xml @@ -6,7 +6,7 @@ de.julielab.jcore.ae.testae.TestAE Descriptor automatically generated by uimaFIT - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT de.julielab.jcore.ae.testae diff --git a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/consumer/testconsumer/desc/de.julielab.jcore.consumer.testconsumer.Testconsumer.xml b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/consumer/testconsumer/desc/de.julielab.jcore.consumer.testconsumer.Testconsumer.xml index 7d1d5a224..3bf9a16c1 100644 --- a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/consumer/testconsumer/desc/de.julielab.jcore.consumer.testconsumer.Testconsumer.xml +++ b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/consumer/testconsumer/desc/de.julielab.jcore.consumer.testconsumer.Testconsumer.xml @@ -6,7 +6,7 @@ 
de.julielab.jcore.consumer.testconsumer.Testconsumer Descriptor automatically generated by uimaFIT - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT de.julielab.jcore.consumer.testconsumer diff --git a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/multiplier/testmultiplier/desc/de.julielab.jcore.multiplier.testmultiplier.TestMultiplier.xml b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/multiplier/testmultiplier/desc/de.julielab.jcore.multiplier.testmultiplier.TestMultiplier.xml index 8167fbb68..8ef78db33 100644 --- a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/multiplier/testmultiplier/desc/de.julielab.jcore.multiplier.testmultiplier.TestMultiplier.xml +++ b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/multiplier/testmultiplier/desc/de.julielab.jcore.multiplier.testmultiplier.TestMultiplier.xml @@ -6,7 +6,7 @@ de.julielab.jcore.multiplier.testmultiplier.TestMultiplier Descriptor automatically generated by uimaFIT - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT de.julielab.jcore.multiplier.testmultiplier diff --git a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/reader/testreader/desc/de.julielab.jcore.reader.testreader.TestReader.xml b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/reader/testreader/desc/de.julielab.jcore.reader.testreader.TestReader.xml index 016fc36bf..bd482d6ee 100644 --- a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/reader/testreader/desc/de.julielab.jcore.reader.testreader.TestReader.xml +++ b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/reader/testreader/desc/de.julielab.jcore.reader.testreader.TestReader.xml @@ -5,7 +5,7 @@ de.julielab.jcore.reader.testreader.TestReader Descriptor automatically generated by uimaFIT - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT de.julielab.jcore.reader.testreader diff --git a/jcore-dta-reader/component.meta b/jcore-dta-reader/component.meta index 44239af00..ee9b729df 100644 --- a/jcore-dta-reader/component.meta +++ 
b/jcore-dta-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-dta-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe DTA Reader" } diff --git a/jcore-dta-reader/pom.xml b/jcore-dta-reader/pom.xml index f05d13a93..b47f53e66 100644 --- a/jcore-dta-reader/pom.xml +++ b/jcore-dta-reader/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-dta-reader/src/main/resources/de/julielab/jcore/reader/dta/desc/jcore-dta-reader.xml b/jcore-dta-reader/src/main/resources/de/julielab/jcore/reader/dta/desc/jcore-dta-reader.xml index 1e17bdb36..8bc431330 100644 --- a/jcore-dta-reader/src/main/resources/de/julielab/jcore/reader/dta/desc/jcore-dta-reader.xml +++ b/jcore-dta-reader/src/main/resources/de/julielab/jcore/reader/dta/desc/jcore-dta-reader.xml @@ -5,7 +5,7 @@ JCoRe DTA Reader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-ec-code-ae/component.meta b/jcore-ec-code-ae/component.meta index 995049c32..22af189d5 100644 --- a/jcore-ec-code-ae/component.meta +++ b/jcore-ec-code-ae/component.meta @@ -9,7 +9,7 @@ "maven-artifact": { "artifactId": "jcore-ecn-code-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Enzyme Commission Number AE" } diff --git a/jcore-ec-code-ae/pom.xml b/jcore-ec-code-ae/pom.xml index 14428b6cf..05cc496a5 100644 --- a/jcore-ec-code-ae/pom.xml +++ b/jcore-ec-code-ae/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-elasticsearch-consumer/component.meta b/jcore-elasticsearch-consumer/component.meta index 584bbdc82..b2f0e7a71 100644 --- a/jcore-elasticsearch-consumer/component.meta +++ b/jcore-elasticsearch-consumer/component.meta @@ -18,7 +18,7 @@ "maven-artifact": { "artifactId": "jcore-elasticsearch-consumer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": 
"2.6.0-SNAPSHOT" }, "name": "JCoRe ElasticSearch Consumer" } diff --git a/jcore-elasticsearch-consumer/pom.xml b/jcore-elasticsearch-consumer/pom.xml index 8014c9cad..540e2f7d1 100644 --- a/jcore-elasticsearch-consumer/pom.xml +++ b/jcore-elasticsearch-consumer/pom.xml @@ -4,7 +4,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT jcore-elasticsearch-consumer JCoRe ElasticSearch Consumer diff --git a/jcore-elasticsearch-consumer/src/main/resources/de/julielab/jcore/consumer/es/desc/jcore-elasticsearch-consumer.xml b/jcore-elasticsearch-consumer/src/main/resources/de/julielab/jcore/consumer/es/desc/jcore-elasticsearch-consumer.xml index cafc85e71..c2334321e 100644 --- a/jcore-elasticsearch-consumer/src/main/resources/de/julielab/jcore/consumer/es/desc/jcore-elasticsearch-consumer.xml +++ b/jcore-elasticsearch-consumer/src/main/resources/de/julielab/jcore/consumer/es/desc/jcore-elasticsearch-consumer.xml @@ -5,7 +5,7 @@ de.julielab.jcore.consumer.es.ElasticSearchConsumer JCore ElasticSearch Consumer - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT urls diff --git a/jcore-elasticsearch-consumer/src/main/resources/de/julielab/jcore/consumer/es/desc/jcore-json-writer.xml b/jcore-elasticsearch-consumer/src/main/resources/de/julielab/jcore/consumer/es/desc/jcore-json-writer.xml index efd472393..485ebb2ce 100644 --- a/jcore-elasticsearch-consumer/src/main/resources/de/julielab/jcore/consumer/es/desc/jcore-json-writer.xml +++ b/jcore-elasticsearch-consumer/src/main/resources/de/julielab/jcore/consumer/es/desc/jcore-json-writer.xml @@ -5,7 +5,7 @@ de.julielab.jcore.consumer.es.JsonWriter JCoRe JSON Writer - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT OutputDestination diff --git a/jcore-elasticsearch-consumer/src/test/resources/de/julielab/jcore/consumer/es/testTypes.xml b/jcore-elasticsearch-consumer/src/test/resources/de/julielab/jcore/consumer/es/testTypes.xml index 0b1bd8c30..dfdd4d093 100644 --- 
a/jcore-elasticsearch-consumer/src/test/resources/de/julielab/jcore/consumer/es/testTypes.xml +++ b/jcore-elasticsearch-consumer/src/test/resources/de/julielab/jcore/consumer/es/testTypes.xml @@ -2,7 +2,7 @@ testTypes Some types suited for unit tests. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-embedding-writer/component.meta b/jcore-embedding-writer/component.meta index c95336587..0c6301641 100644 --- a/jcore-embedding-writer/component.meta +++ b/jcore-embedding-writer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-embedding-writer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Embedding Writer" } diff --git a/jcore-embedding-writer/pom.xml b/jcore-embedding-writer/pom.xml index 820510aa5..d5d5304a6 100644 --- a/jcore-embedding-writer/pom.xml +++ b/jcore-embedding-writer/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-embedding-writer/src/main/resources/de/julielab/jcore/consumer/ew/desc/jcore-embedding-writer.xml b/jcore-embedding-writer/src/main/resources/de/julielab/jcore/consumer/ew/desc/jcore-embedding-writer.xml index 14b684f02..46f458d8b 100644 --- a/jcore-embedding-writer/src/main/resources/de/julielab/jcore/consumer/ew/desc/jcore-embedding-writer.xml +++ b/jcore-embedding-writer/src/main/resources/de/julielab/jcore/consumer/ew/desc/jcore-embedding-writer.xml @@ -6,7 +6,7 @@ JCoRe Flair Embedding Writer Given a Flair compatible embedding and a UIMA annotation type, this component prints the embeddings of tokens annotated with the annotation to a file. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT UseGzip diff --git a/jcore-event-flattener-ae/component.meta b/jcore-event-flattener-ae/component.meta index 94b772718..afc1e729e 100644 --- a/jcore-event-flattener-ae/component.meta +++ b/jcore-event-flattener-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-event-flattener-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Event Flattener AE" } diff --git a/jcore-event-flattener-ae/pom.xml b/jcore-event-flattener-ae/pom.xml index 83ff43f48..423a141b9 100644 --- a/jcore-event-flattener-ae/pom.xml +++ b/jcore-event-flattener-ae/pom.xml @@ -3,7 +3,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT jcore-event-flattener-ae JCoRe Event Flattener AE diff --git a/jcore-event-flattener-ae/src/main/resources/de/julielab/jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml b/jcore-event-flattener-ae/src/main/resources/de/julielab/jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml index bbd7bde4f..ff351724b 100644 --- a/jcore-event-flattener-ae/src/main/resources/de/julielab/jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml +++ b/jcore-event-flattener-ae/src/main/resources/de/julielab/jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml @@ -6,7 +6,7 @@ de.julielab.jcore.ae.eventflattener.EventFlattener Descriptor automatically generated by uimaFIT - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT de.julielab.jcore.ae.eventflattener diff --git a/jcore-feature-value-replacement-ae/component.meta b/jcore-feature-value-replacement-ae/component.meta index d81fdcdaa..dfb623568 100644 --- a/jcore-feature-value-replacement-ae/component.meta +++ b/jcore-feature-value-replacement-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-feature-value-replacement-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Feature Value Replacement AE" } diff --git 
a/jcore-feature-value-replacement-ae/pom.xml b/jcore-feature-value-replacement-ae/pom.xml index 721035710..f3e120d76 100644 --- a/jcore-feature-value-replacement-ae/pom.xml +++ b/jcore-feature-value-replacement-ae/pom.xml @@ -4,7 +4,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT jcore-feature-value-replacement-ae JCoRe Feature Value Replacement AE diff --git a/jcore-feature-value-replacement-ae/src/main/resources/de/julielab/jcore/ae/fvr/desc/jcore-feature-value-replacement-ae.xml b/jcore-feature-value-replacement-ae/src/main/resources/de/julielab/jcore/ae/fvr/desc/jcore-feature-value-replacement-ae.xml index 9be834fd5..42c3e36a8 100644 --- a/jcore-feature-value-replacement-ae/src/main/resources/de/julielab/jcore/ae/fvr/desc/jcore-feature-value-replacement-ae.xml +++ b/jcore-feature-value-replacement-ae/src/main/resources/de/julielab/jcore/ae/fvr/desc/jcore-feature-value-replacement-ae.xml @@ -6,7 +6,7 @@ de.julielab.jcore.ae.fvr.FeatureValueReplacementAnnotator Descriptor automatically generated by uimaFIT - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT de.julielab.jcore.ae.fvr diff --git a/jcore-file-reader/component.meta b/jcore-file-reader/component.meta index 2e3e09849..9aabd9c66 100644 --- a/jcore-file-reader/component.meta +++ b/jcore-file-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-file-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe File Reader" } diff --git a/jcore-file-reader/pom.xml b/jcore-file-reader/pom.xml index 74d1574a6..0de264d3b 100644 --- a/jcore-file-reader/pom.xml +++ b/jcore-file-reader/pom.xml @@ -5,7 +5,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT jcore-file-reader JCoRe File Reader diff --git a/jcore-file-reader/src/main/resources/de/julielab/jcore/reader/file/desc/jcore-file-reader.xml b/jcore-file-reader/src/main/resources/de/julielab/jcore/reader/file/desc/jcore-file-reader.xml index 39a2be27e..f5b30ff00 100644 --- 
a/jcore-file-reader/src/main/resources/de/julielab/jcore/reader/file/desc/jcore-file-reader.xml +++ b/jcore-file-reader/src/main/resources/de/julielab/jcore/reader/file/desc/jcore-file-reader.xml @@ -5,7 +5,7 @@ JCoRe File Reader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIELab Jena, Germany diff --git a/jcore-flair-ner-ae/component.meta b/jcore-flair-ner-ae/component.meta index 5340cb3ce..09250babf 100644 --- a/jcore-flair-ner-ae/component.meta +++ b/jcore-flair-ner-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-flair-ner-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Flair NER AE" } diff --git a/jcore-flair-ner-ae/pom.xml b/jcore-flair-ner-ae/pom.xml index 5e9b35b49..9ad39de20 100644 --- a/jcore-flair-ner-ae/pom.xml +++ b/jcore-flair-ner-ae/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT @@ -43,7 +43,7 @@ de.julielab jcore-annotation-adder-ae - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT ch.qos.logback diff --git a/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/desc/jcore-flair-ner-ae.xml b/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/desc/jcore-flair-ner-ae.xml index 3d158471f..bccfd8ddc 100644 --- a/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/desc/jcore-flair-ner-ae.xml +++ b/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/desc/jcore-flair-ner-ae.xml @@ -6,7 +6,7 @@ JCoRe Flair Named Entity Recognizer This component starts a child process to a python interpreter and loads a Flair sequence tagging model. Sentences are taken from the CAS, sent to Flair for tagging and the results are written into the CAS. The annotation type to use can be configured. It must be a subtype of de.julielab.jcore.types.EntityMention. The tag of each entity is written to the specificType feature. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT AnnotationType diff --git a/jcore-flair-token-embedding-ae/component.meta b/jcore-flair-token-embedding-ae/component.meta index 82dc90b84..cc7ef4681 100644 --- a/jcore-flair-token-embedding-ae/component.meta +++ b/jcore-flair-token-embedding-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-flair-token-embedding-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Flair Token Embedding Annotator" } diff --git a/jcore-flair-token-embedding-ae/pom.xml b/jcore-flair-token-embedding-ae/pom.xml index 789d1956f..483998eda 100644 --- a/jcore-flair-token-embedding-ae/pom.xml +++ b/jcore-flair-token-embedding-ae/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-flair-token-embedding-ae/src/main/resources/de/julielab/jcore/ae/fte/desc/jcore-flair-token-embedding-ae.xml b/jcore-flair-token-embedding-ae/src/main/resources/de/julielab/jcore/ae/fte/desc/jcore-flair-token-embedding-ae.xml index 81db110e0..3b342d593 100644 --- a/jcore-flair-token-embedding-ae/src/main/resources/de/julielab/jcore/ae/fte/desc/jcore-flair-token-embedding-ae.xml +++ b/jcore-flair-token-embedding-ae/src/main/resources/de/julielab/jcore/ae/fte/desc/jcore-flair-token-embedding-ae.xml @@ -6,7 +6,7 @@ JCoRe Flair Token Embedding Annotator Adds the Flair compatible embedding vectors to the token annotations. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT EmbeddingPath diff --git a/jcore-iexml-consumer/component.meta b/jcore-iexml-consumer/component.meta index 621a4d340..0ec142ad7 100644 --- a/jcore-iexml-consumer/component.meta +++ b/jcore-iexml-consumer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-iexml-consumer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe IEXML Consumer" } diff --git a/jcore-iexml-consumer/pom.xml b/jcore-iexml-consumer/pom.xml index 0cca60dfb..8924c020c 100644 --- a/jcore-iexml-consumer/pom.xml +++ b/jcore-iexml-consumer/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT Generates stand-off IEXML files as used in the Mantra challenge. @@ -74,7 +74,7 @@ de.julielab jcore-mantra-xml-types - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT junit diff --git a/jcore-iexml-consumer/src/main/resources/de/julielab/jcore/consumer/iexml/desc/jcore-iexml-consumer.xml b/jcore-iexml-consumer/src/main/resources/de/julielab/jcore/consumer/iexml/desc/jcore-iexml-consumer.xml index 3d3cfbee2..98c581be2 100644 --- a/jcore-iexml-consumer/src/main/resources/de/julielab/jcore/consumer/iexml/desc/jcore-iexml-consumer.xml +++ b/jcore-iexml-consumer/src/main/resources/de/julielab/jcore/consumer/iexml/desc/jcore-iexml-consumer.xml @@ -5,7 +5,7 @@ JCoRe IEXML Consumer - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-iexml-reader/component.meta b/jcore-iexml-reader/component.meta index eac29d502..15d5600c1 100644 --- a/jcore-iexml-reader/component.meta +++ b/jcore-iexml-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-iexml-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe IEXML Reader" } diff --git a/jcore-iexml-reader/pom.xml b/jcore-iexml-reader/pom.xml index 94b02b301..2ce284fda 100644 --- a/jcore-iexml-reader/pom.xml +++ b/jcore-iexml-reader/pom.xml @@ -10,7 
+10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT @@ -75,7 +75,7 @@ de.julielab jcore-mantra-xml-types - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT junit diff --git a/jcore-iexml-reader/src/main/resources/de/julielab/jcore/reader/iexml/desc/jcore-iexml-reader.xml b/jcore-iexml-reader/src/main/resources/de/julielab/jcore/reader/iexml/desc/jcore-iexml-reader.xml index 89f48191c..933482a5a 100644 --- a/jcore-iexml-reader/src/main/resources/de/julielab/jcore/reader/iexml/desc/jcore-iexml-reader.xml +++ b/jcore-iexml-reader/src/main/resources/de/julielab/jcore/reader/iexml/desc/jcore-iexml-reader.xml @@ -5,7 +5,7 @@ JCoRe IEXML Reader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-ign-reader/component.meta b/jcore-ign-reader/component.meta index 9ea912d40..798abe608 100644 --- a/jcore-ign-reader/component.meta +++ b/jcore-ign-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-ign-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe IGN Reader" } diff --git a/jcore-ign-reader/pom.xml b/jcore-ign-reader/pom.xml index f1f2ebfd5..df7d561d4 100644 --- a/jcore-ign-reader/pom.xml +++ b/jcore-ign-reader/pom.xml @@ -5,7 +5,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT jcore-ign-reader diff --git a/jcore-ign-reader/src/main/resources/de/julielab/jcore/reader/ign/desc/jcore-ign-reader.xml b/jcore-ign-reader/src/main/resources/de/julielab/jcore/reader/ign/desc/jcore-ign-reader.xml index 91d8abac5..3205766bd 100644 --- a/jcore-ign-reader/src/main/resources/de/julielab/jcore/reader/ign/desc/jcore-ign-reader.xml +++ b/jcore-ign-reader/src/main/resources/de/julielab/jcore/reader/ign/desc/jcore-ign-reader.xml @@ -7,7 +7,7 @@ The IGNReader reads IGN corpus files in BioC-format. There are XML files comprising the actual text (as well as passage and sentence annotations) and there are separate XML files comprising the annotations. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-iob-consumer/component.meta b/jcore-iob-consumer/component.meta index faa7e6b5e..9e0e62410 100644 --- a/jcore-iob-consumer/component.meta +++ b/jcore-iob-consumer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-iob-consumer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe CAS to IOB Consumer" } diff --git a/jcore-iob-consumer/pom.xml b/jcore-iob-consumer/pom.xml index 7625d1c8c..e09d8591a 100644 --- a/jcore-iob-consumer/pom.xml +++ b/jcore-iob-consumer/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-iob-consumer/src/main/resources/de/julielab/jcore/consumer/cas2iob/desc/jcore-iob-consumer.xml b/jcore-iob-consumer/src/main/resources/de/julielab/jcore/consumer/cas2iob/desc/jcore-iob-consumer.xml index a333e4aaf..72b818213 100644 --- a/jcore-iob-consumer/src/main/resources/de/julielab/jcore/consumer/cas2iob/desc/jcore-iob-consumer.xml +++ b/jcore-iob-consumer/src/main/resources/de/julielab/jcore/consumer/cas2iob/desc/jcore-iob-consumer.xml @@ -6,7 +6,7 @@ JCoRe IOB Writer This component help to write CAS entity or chunk annotations into a text file in IOB format. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT outFolder diff --git a/jcore-iob-consumer/src/test/resources/de/julielab/jcore/consumer/cas2iob/desc/ToIOBConsumerTest.xml b/jcore-iob-consumer/src/test/resources/de/julielab/jcore/consumer/cas2iob/desc/ToIOBConsumerTest.xml index 36199e77d..deb5a9318 100644 --- a/jcore-iob-consumer/src/test/resources/de/julielab/jcore/consumer/cas2iob/desc/ToIOBConsumerTest.xml +++ b/jcore-iob-consumer/src/test/resources/de/julielab/jcore/consumer/cas2iob/desc/ToIOBConsumerTest.xml @@ -5,7 +5,7 @@ ToIOBConsumerTest - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-iob-consumer/src/test/resources/de/julielab/jcore/consumer/cas2iob/types/TestTypeSystem.xml b/jcore-iob-consumer/src/test/resources/de/julielab/jcore/consumer/cas2iob/types/TestTypeSystem.xml index 7b3f82a25..0a7a01cf3 100644 --- a/jcore-iob-consumer/src/test/resources/de/julielab/jcore/consumer/cas2iob/types/TestTypeSystem.xml +++ b/jcore-iob-consumer/src/test/resources/de/julielab/jcore/consumer/cas2iob/types/TestTypeSystem.xml @@ -2,7 +2,7 @@ TestTypeSystem including julie morpho-syntax and semantics -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT diff --git a/jcore-jemas-ae/src/main/resources/de/julielab/jcore/ae/jemas/desc/jcore-jemas-ae.xml b/jcore-jemas-ae/src/main/resources/de/julielab/jcore/ae/jemas/desc/jcore-jemas-ae.xml index 436c249b2..c44952183 100644 --- a/jcore-jemas-ae/src/main/resources/de/julielab/jcore/ae/jemas/desc/jcore-jemas-ae.xml +++ b/jcore-jemas-ae/src/main/resources/de/julielab/jcore/ae/jemas/desc/jcore-jemas-ae.xml @@ -5,7 +5,7 @@ JCoRe JEmAS A UIMA-based implementation of the core functionality of JEmAS, the Jena Emotion Analysis System. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-jnet-ae/component.meta b/jcore-jnet-ae/component.meta index dbdfe4186..74ba27806 100644 --- a/jcore-jnet-ae/component.meta +++ b/jcore-jnet-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-jnet-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe JNET AE" } diff --git a/jcore-jnet-ae/pom.xml b/jcore-jnet-ae/pom.xml index 6eb5eb572..ea8a89340 100644 --- a/jcore-jnet-ae/pom.xml +++ b/jcore-jnet-ae/pom.xml @@ -11,7 +11,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-jnet-ae/src/main/resources/de/julielab/jcore/ae/jnet/desc/jcore-jnet-ae.xml b/jcore-jnet-ae/src/main/resources/de/julielab/jcore/ae/jnet/desc/jcore-jnet-ae.xml index db23c98b2..8f602da33 100644 --- a/jcore-jnet-ae/src/main/resources/de/julielab/jcore/ae/jnet/desc/jcore-jnet-ae.xml +++ b/jcore-jnet-ae/src/main/resources/de/julielab/jcore/ae/jnet/desc/jcore-jnet-ae.xml @@ -6,7 +6,7 @@ JCoRe JNET AE - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.xml b/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.xml index 12859863d..34cfdc1e9 100644 --- a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.xml +++ b/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.xml @@ -6,7 +6,7 @@ EntityTaggerAnnotator - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT julielab diff --git a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/tsDescriptor.xml b/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/tsDescriptor.xml index b26a4688d..a71ebef34 100644 --- a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/tsDescriptor.xml +++ b/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/tsDescriptor.xml @@ -2,7 +2,7 @@ 
aceComplete -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT diff --git a/jcore-jpos-ae/component.meta b/jcore-jpos-ae/component.meta index 86f05e5d5..eb0b7ae53 100644 --- a/jcore-jpos-ae/component.meta +++ b/jcore-jpos-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-jpos-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe JPOS AE" } diff --git a/jcore-jpos-ae/pom.xml b/jcore-jpos-ae/pom.xml index 480afdf16..87cbc7fc5 100644 --- a/jcore-jpos-ae/pom.xml +++ b/jcore-jpos-ae/pom.xml @@ -11,7 +11,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-jpos-ae/src/main/resources/de/julielab/jcore/ae/jpos/desc/jcore-jpos.xml b/jcore-jpos-ae/src/main/resources/de/julielab/jcore/ae/jpos/desc/jcore-jpos.xml index be5593812..37870472c 100644 --- a/jcore-jpos-ae/src/main/resources/de/julielab/jcore/ae/jpos/desc/jcore-jpos.xml +++ b/jcore-jpos-ae/src/main/resources/de/julielab/jcore/ae/jpos/desc/jcore-jpos.xml @@ -6,7 +6,7 @@ JCoRe JPOS AE - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-jpos-ae/src/test/resources/POSTagAnnotatorTest.xml b/jcore-jpos-ae/src/test/resources/POSTagAnnotatorTest.xml index 384265369..5a179961d 100644 --- a/jcore-jpos-ae/src/test/resources/POSTagAnnotatorTest.xml +++ b/jcore-jpos-ae/src/test/resources/POSTagAnnotatorTest.xml @@ -6,7 +6,7 @@ JPOSAnnotator - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab diff --git a/jcore-jsbd-ae/component.meta b/jcore-jsbd-ae/component.meta index 025d9b87f..5da0bb330 100644 --- a/jcore-jsbd-ae/component.meta +++ b/jcore-jsbd-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-jsbd-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Sentence Annotator" } diff --git a/jcore-jsbd-ae/pom.xml b/jcore-jsbd-ae/pom.xml index d5622f97b..964b14ef9 100644 --- a/jcore-jsbd-ae/pom.xml +++ b/jcore-jsbd-ae/pom.xml @@ -11,7 +11,7 @@ 
de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-jsbd-ae/src/main/resources/de/julielab/jcore/ae/jsbd/desc/jcore-jsbd-ae.xml b/jcore-jsbd-ae/src/main/resources/de/julielab/jcore/ae/jsbd/desc/jcore-jsbd-ae.xml index 8bb60791a..409bda28e 100644 --- a/jcore-jsbd-ae/src/main/resources/de/julielab/jcore/ae/jsbd/desc/jcore-jsbd-ae.xml +++ b/jcore-jsbd-ae/src/main/resources/de/julielab/jcore/ae/jsbd/desc/jcore-jsbd-ae.xml @@ -6,7 +6,7 @@ de.julielab.jcore.ae.jsbd.main.SentenceAnnotator Descriptor automatically generated by uimaFIT - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT de.julielab.jcore.ae.jsbd.main diff --git a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml index 66314d4bf..1e1aaa26e 100644 --- a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml +++ b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml @@ -6,7 +6,7 @@ JCoRe Sentence Annotator This is the UIMA Wrapper for the JULIE Sentence Boundary Detector. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml index 63b003324..28c03ebe8 100644 --- a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml +++ b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml @@ -6,7 +6,7 @@ JCoRe Sentence Annotator This is the UIMA Wrapper for the JULIE Sentence Boundary Detector. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/paragraph-scope-type.xml b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/paragraph-scope-type.xml index 282896d88..cd826ac73 100644 --- a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/paragraph-scope-type.xml +++ b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/paragraph-scope-type.xml @@ -2,7 +2,7 @@ test-entity-type.xml A mini type system with one type only, used for testing consistency preservation - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-jtbd-ae/component.meta b/jcore-jtbd-ae/component.meta index 377c042d7..aa682f5da 100644 --- a/jcore-jtbd-ae/component.meta +++ b/jcore-jtbd-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-jtbd-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Token Annotator" } diff --git a/jcore-jtbd-ae/pom.xml b/jcore-jtbd-ae/pom.xml index 03523ba12..0c7e7d127 100644 --- a/jcore-jtbd-ae/pom.xml +++ b/jcore-jtbd-ae/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-jtbd-ae/src/main/resources/de/julielab/jcore/ae/jtbd/desc/jcore-jtbd.xml b/jcore-jtbd-ae/src/main/resources/de/julielab/jcore/ae/jtbd/desc/jcore-jtbd.xml index 337463371..a207b07d1 100644 --- a/jcore-jtbd-ae/src/main/resources/de/julielab/jcore/ae/jtbd/desc/jcore-jtbd.xml +++ b/jcore-jtbd-ae/src/main/resources/de/julielab/jcore/ae/jtbd/desc/jcore-jtbd.xml @@ -6,7 +6,7 @@ JCoRe Token Annotator - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-jtbd-ae/src/test/resources/de/julielab/jcore/ae/jtbd/desc/TokenAnnotatorTest.xml b/jcore-jtbd-ae/src/test/resources/de/julielab/jcore/ae/jtbd/desc/TokenAnnotatorTest.xml index 6a670af49..083790957 100644 --- 
a/jcore-jtbd-ae/src/test/resources/de/julielab/jcore/ae/jtbd/desc/TokenAnnotatorTest.xml +++ b/jcore-jtbd-ae/src/test/resources/de/julielab/jcore/ae/jtbd/desc/TokenAnnotatorTest.xml @@ -6,7 +6,7 @@ JCoRe Token Annotator - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-julielab-entity-evaluator-consumer/component.meta b/jcore-julielab-entity-evaluator-consumer/component.meta index 9ffe2edc3..dc65ea34a 100644 --- a/jcore-julielab-entity-evaluator-consumer/component.meta +++ b/jcore-julielab-entity-evaluator-consumer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-julielab-entity-evaluator-consumer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe JULIE Lab Entity Evaluator Converter" } diff --git a/jcore-julielab-entity-evaluator-consumer/pom.xml b/jcore-julielab-entity-evaluator-consumer/pom.xml index 7ad4d9597..35ae8b960 100644 --- a/jcore-julielab-entity-evaluator-consumer/pom.xml +++ b/jcore-julielab-entity-evaluator-consumer/pom.xml @@ -4,7 +4,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT jcore-julielab-entity-evaluator-consumer JCoRe JULIE Lab Entity Evaluator Converter diff --git a/jcore-julielab-entity-evaluator-consumer/src/main/resources/de/julielab/jcore/consumer/entityevaluator/desc/jcore-julielab-entity-evaluator-consumer.xml b/jcore-julielab-entity-evaluator-consumer/src/main/resources/de/julielab/jcore/consumer/entityevaluator/desc/jcore-julielab-entity-evaluator-consumer.xml index 4ffda6700..51c7fc6af 100644 --- a/jcore-julielab-entity-evaluator-consumer/src/main/resources/de/julielab/jcore/consumer/entityevaluator/desc/jcore-julielab-entity-evaluator-consumer.xml +++ b/jcore-julielab-entity-evaluator-consumer/src/main/resources/de/julielab/jcore/consumer/entityevaluator/desc/jcore-julielab-entity-evaluator-consumer.xml @@ -6,7 +6,7 @@ JCoRe Entity Evaluator and TSV Consumer This component was originally created to output the 
tab separated format used the JULIE Entity Evaluator. However, this component can be used to create a TSV file from any annotation or annotation set. The component allows to define columns by specifying the annotation type to draw feature values from and a feature path that specifies the location of the desired feature. All feature paths will be applied to each configured annotation, returning null values if an annotation does not exhibit a value for a column's feature path. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-likelihood-assignment-ae/component.meta b/jcore-likelihood-assignment-ae/component.meta index 671dbf79e..1055a51ab 100644 --- a/jcore-likelihood-assignment-ae/component.meta +++ b/jcore-likelihood-assignment-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-likelihood-assignment-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Likelihood Assignment AE" } diff --git a/jcore-likelihood-assignment-ae/pom.xml b/jcore-likelihood-assignment-ae/pom.xml index e49c1a243..d053fef46 100644 --- a/jcore-likelihood-assignment-ae/pom.xml +++ b/jcore-likelihood-assignment-ae/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-likelihood-assignment-ae/src/main/resources/de/julielab/jcore/ae/likelihoodassignment/desc/jcore-likelihood-assignment-ae.xml b/jcore-likelihood-assignment-ae/src/main/resources/de/julielab/jcore/ae/likelihoodassignment/desc/jcore-likelihood-assignment-ae.xml index 14bc6f60a..1a6b9b081 100644 --- a/jcore-likelihood-assignment-ae/src/main/resources/de/julielab/jcore/ae/likelihoodassignment/desc/jcore-likelihood-assignment-ae.xml +++ b/jcore-likelihood-assignment-ae/src/main/resources/de/julielab/jcore/ae/likelihoodassignment/desc/jcore-likelihood-assignment-ae.xml @@ -6,7 +6,7 @@ JCoRe Likelihood Assignment AE Analysis Engine to assign likelihood indicators to their corresponding 
entities and events. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-likelihood-detection-ae/component.meta b/jcore-likelihood-detection-ae/component.meta index e58826719..3f80906be 100644 --- a/jcore-likelihood-detection-ae/component.meta +++ b/jcore-likelihood-detection-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-likelihood-detection-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Likelihood Detection AE" } diff --git a/jcore-likelihood-detection-ae/pom.xml b/jcore-likelihood-detection-ae/pom.xml index c68a79a73..eb4aaa51e 100644 --- a/jcore-likelihood-detection-ae/pom.xml +++ b/jcore-likelihood-detection-ae/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-likelihood-detection-ae/src/main/resources/de/julielab/jcore/ae/likelihooddetection/desc/jcore-likelihood-detection-ae.xml b/jcore-likelihood-detection-ae/src/main/resources/de/julielab/jcore/ae/likelihooddetection/desc/jcore-likelihood-detection-ae.xml index 81e9c76f1..bbd5b55bf 100644 --- a/jcore-likelihood-detection-ae/src/main/resources/de/julielab/jcore/ae/likelihooddetection/desc/jcore-likelihood-detection-ae.xml +++ b/jcore-likelihood-detection-ae/src/main/resources/de/julielab/jcore/ae/likelihooddetection/desc/jcore-likelihood-detection-ae.xml @@ -6,7 +6,7 @@ JCoRe Likelihood Detection AE Analysis Engine to detect epistemic modal expressions and assign the appropriate likelihood category. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT LikelihoodDict diff --git a/jcore-line-multiplier/component.meta b/jcore-line-multiplier/component.meta index 432aa6b6a..864a9954c 100644 --- a/jcore-line-multiplier/component.meta +++ b/jcore-line-multiplier/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-line-multiplier", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Line Multiplier" } diff --git a/jcore-line-multiplier/pom.xml b/jcore-line-multiplier/pom.xml index 12aa067d8..f81a228ca 100644 --- a/jcore-line-multiplier/pom.xml +++ b/jcore-line-multiplier/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-line-multiplier/src/main/resources/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml b/jcore-line-multiplier/src/main/resources/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml index 69ff063cd..524ca369e 100644 --- a/jcore-line-multiplier/src/main/resources/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml +++ b/jcore-line-multiplier/src/main/resources/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml @@ -6,7 +6,7 @@ JCoRe Line Multiplier Splits incoming CAS document texts on line breaks and returns one CAS for each non-blank line. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT NumberLinesPerCAS diff --git a/jcore-line-multiplier/target/classes/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml b/jcore-line-multiplier/target/classes/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml index 69ff063cd..524ca369e 100644 --- a/jcore-line-multiplier/target/classes/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml +++ b/jcore-line-multiplier/target/classes/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml @@ -6,7 +6,7 @@ JCoRe Line Multiplier Splits incoming CAS document texts on line breaks and returns one CAS for each non-blank line. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT NumberLinesPerCAS diff --git a/jcore-lingpipe-porterstemmer-ae/component.meta b/jcore-lingpipe-porterstemmer-ae/component.meta index f0adaa9a1..af7dce999 100644 --- a/jcore-lingpipe-porterstemmer-ae/component.meta +++ b/jcore-lingpipe-porterstemmer-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-lingpipe-porterstemmer-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Lingpipe Porter Stemmer AE" } diff --git a/jcore-lingpipe-porterstemmer-ae/pom.xml b/jcore-lingpipe-porterstemmer-ae/pom.xml index 6a10f10c5..6df6ba486 100644 --- a/jcore-lingpipe-porterstemmer-ae/pom.xml +++ b/jcore-lingpipe-porterstemmer-ae/pom.xml @@ -5,7 +5,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT jcore-lingpipe-porterstemmer-ae JCoRe Lingpipe Porter Stemmer AE diff --git a/jcore-lingpipe-porterstemmer-ae/src/main/resources/de/julielab/jcore/ae/lingpipe/porterstemmer/desc/jcore-lingpipe-porterstemmer-ae.xml b/jcore-lingpipe-porterstemmer-ae/src/main/resources/de/julielab/jcore/ae/lingpipe/porterstemmer/desc/jcore-lingpipe-porterstemmer-ae.xml index b959cf460..d24a10c0d 100644 --- 
a/jcore-lingpipe-porterstemmer-ae/src/main/resources/de/julielab/jcore/ae/lingpipe/porterstemmer/desc/jcore-lingpipe-porterstemmer-ae.xml +++ b/jcore-lingpipe-porterstemmer-ae/src/main/resources/de/julielab/jcore/ae/lingpipe/porterstemmer/desc/jcore-lingpipe-porterstemmer-ae.xml @@ -5,7 +5,7 @@ JCoRe Lingpipe Porterstemmer AE Adds a StemmedForm to each token in the CAS. The offsets and the value feature of each StemmedForm are set to the stem as returned by the Porter stemmer algorithm as implemented by Lingpipe. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab, Germany diff --git a/jcore-lingpipegazetteer-ae/component.meta b/jcore-lingpipegazetteer-ae/component.meta index 0a77648a3..4ba7d7658 100644 --- a/jcore-lingpipegazetteer-ae/component.meta +++ b/jcore-lingpipegazetteer-ae/component.meta @@ -18,7 +18,7 @@ "maven-artifact": { "artifactId": "jcore-lingpipe-gazetteer-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Lingpipe Gazetteer AE" } diff --git a/jcore-lingpipegazetteer-ae/pom.xml b/jcore-lingpipegazetteer-ae/pom.xml index 1d39efcf8..080a61539 100644 --- a/jcore-lingpipegazetteer-ae/pom.xml +++ b/jcore-lingpipegazetteer-ae/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae-configurable-resource.xml b/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae-configurable-resource.xml index 1f4e5a34e..e8895177a 100644 --- a/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae-configurable-resource.xml +++ b/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae-configurable-resource.xml @@ -16,7 +16,7 @@ embedded into the descriptor. 
The current parameter settings will work but may be changed. Refer to https://github.com/JULIELab/jcore-base/tree/master/jcore-lingpipegazetteer-ae for more information. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT julielab diff --git a/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae.xml b/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae.xml index b168cefa2..e448c764c 100644 --- a/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae.xml +++ b/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae.xml @@ -14,7 +14,7 @@ and some parameter settings for dictionary processing and tagging. Refer to https://github.com/JULIELab/jcore-base/tree/master/jcore-lingpipegazetteer-ae for more information. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT julielab diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/ApproxGazetteerAnnotatorTest.xml b/jcore-lingpipegazetteer-ae/src/test/resources/ApproxGazetteerAnnotatorTest.xml index bfd3827d0..9e4cc5a3d 100644 --- a/jcore-lingpipegazetteer-ae/src/test/resources/ApproxGazetteerAnnotatorTest.xml +++ b/jcore-lingpipegazetteer-ae/src/test/resources/ApproxGazetteerAnnotatorTest.xml @@ -6,7 +6,7 @@ GazetteerAnnotator - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT julielab diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/ExactGazetteerAnnotatorTest.xml b/jcore-lingpipegazetteer-ae/src/test/resources/ExactGazetteerAnnotatorTest.xml index eeebe281b..2c6e0779a 100644 --- a/jcore-lingpipegazetteer-ae/src/test/resources/ExactGazetteerAnnotatorTest.xml +++ b/jcore-lingpipegazetteer-ae/src/test/resources/ExactGazetteerAnnotatorTest.xml @@ -6,7 +6,7 @@ GazetteerAnnotator - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT julielab diff --git a/jcore-lingscope-ae/component.meta b/jcore-lingscope-ae/component.meta index 
3a5fc4991..3a73c19a3 100644 --- a/jcore-lingscope-ae/component.meta +++ b/jcore-lingscope-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-lingscope-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Lingscope AE" } diff --git a/jcore-lingscope-ae/pom.xml b/jcore-lingscope-ae/pom.xml index 28836bd2b..4c5a15b41 100644 --- a/jcore-lingscope-ae/pom.xml +++ b/jcore-lingscope-ae/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-lingscope-ae/src/main/resources/de/julielab/jcore/ae/lingscope/desc/jcore-lingscope-ae.xml b/jcore-lingscope-ae/src/main/resources/de/julielab/jcore/ae/lingscope/desc/jcore-lingscope-ae.xml index 164a2ed7e..dea73edd6 100644 --- a/jcore-lingscope-ae/src/main/resources/de/julielab/jcore/ae/lingscope/desc/jcore-lingscope-ae.xml +++ b/jcore-lingscope-ae/src/main/resources/de/julielab/jcore/ae/lingscope/desc/jcore-lingscope-ae.xml @@ -6,7 +6,7 @@ JCoRe Lingscope AE This component uses the Lingscope negation/hedge detection algorithm and models to annotate negation/hedge cues and the scope to which the cues apply. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT CueModel diff --git a/jcore-linnaeus-species-ae/component.meta b/jcore-linnaeus-species-ae/component.meta index a4789114c..8bc1674bb 100644 --- a/jcore-linnaeus-species-ae/component.meta +++ b/jcore-linnaeus-species-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-linnaeus-species-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Linnaeus Species Tagger" } diff --git a/jcore-linnaeus-species-ae/pom.xml b/jcore-linnaeus-species-ae/pom.xml index 9e5c99785..68c29ba14 100644 --- a/jcore-linnaeus-species-ae/pom.xml +++ b/jcore-linnaeus-species-ae/pom.xml @@ -4,7 +4,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT jcore-linnaeus-species-ae JCoRe Linnaeus Species Tagger diff --git a/jcore-linnaeus-species-ae/src/main/resources/de/julielab/jcore/ae/linnaeus/desc/jcore-linnaeus-ae.xml b/jcore-linnaeus-species-ae/src/main/resources/de/julielab/jcore/ae/linnaeus/desc/jcore-linnaeus-ae.xml index e89d8d5f3..d3ab9d56b 100644 --- a/jcore-linnaeus-species-ae/src/main/resources/de/julielab/jcore/ae/linnaeus/desc/jcore-linnaeus-ae.xml +++ b/jcore-linnaeus-species-ae/src/main/resources/de/julielab/jcore/ae/linnaeus/desc/jcore-linnaeus-ae.xml @@ -5,7 +5,7 @@ de.julielab.jcore.ae.linnaeus.LinnaeusSpeciesAnnotator JCore LINNAEUS Species AE - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-mantra-xml-types/pom.xml b/jcore-mantra-xml-types/pom.xml index 4108f1f6a..ea6b45d42 100644 --- a/jcore-mantra-xml-types/pom.xml +++ b/jcore-mantra-xml-types/pom.xml @@ -6,7 +6,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JCoRe Mantra XML Types https://github.com/JULIELab/jcore-base/tree/master/jcore-mantra-xml-types diff --git a/jcore-medxn-ae/component.meta b/jcore-medxn-ae/component.meta index d10bc8ded..c1c026762 100644 --- a/jcore-medxn-ae/component.meta +++ b/jcore-medxn-ae/component.meta @@ -22,7 +22,7 @@ "maven-artifact": { "artifactId": 
"jcore-medxn-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe MedXN" } diff --git a/jcore-medxn-ae/pom.xml b/jcore-medxn-ae/pom.xml index 94a1d35ee..aac277c21 100644 --- a/jcore-medxn-ae/pom.xml +++ b/jcore-medxn-ae/pom.xml @@ -4,7 +4,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT jcore-medxn-ae JCoRe MedXN diff --git a/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/MedNormAE.xml b/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/MedNormAE.xml index e92306340..be6bb7375 100644 --- a/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/MedNormAE.xml +++ b/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/MedNormAE.xml @@ -6,7 +6,7 @@ de.julielab.jcore.medxn.ae.desc.MedNormAE make a normalized medication description based on RxNorm standard - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/jcore-medxn-ae-attributes-german.xml b/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/jcore-medxn-ae-attributes-german.xml index 94393ddbf..199f1607d 100644 --- a/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/jcore-medxn-ae-attributes-german.xml +++ b/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/jcore-medxn-ae-attributes-german.xml @@ -6,7 +6,7 @@ de.julielab.jcore.medxn.ae.desc.MedAttrAE medication attribute tagger using regEx - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/jcore-medxn-ae-extractor-german.xml b/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/jcore-medxn-ae-extractor-german.xml index afdec1ce4..25468e126 100644 --- a/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/jcore-medxn-ae-extractor-german.xml +++ b/jcore-medxn-ae/src/main/resources/de/julielab/jcore/ae/medxn/desc/jcore-medxn-ae-extractor-german.xml @@ 
-6,7 +6,7 @@ de.julielab.jcore.medxn.ae.desc.jcore-medxn-ae-extractor-german Associate medication and the corresponding attributes - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-msdoc-reader/component.meta b/jcore-msdoc-reader/component.meta index 28d3243a0..eac523555 100644 --- a/jcore-msdoc-reader/component.meta +++ b/jcore-msdoc-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-msdoc-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe MSdoc Reader" } diff --git a/jcore-msdoc-reader/pom.xml b/jcore-msdoc-reader/pom.xml index ed305d952..74d9d3daa 100644 --- a/jcore-msdoc-reader/pom.xml +++ b/jcore-msdoc-reader/pom.xml @@ -5,7 +5,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT jcore-msdoc-reader JCoRe MSdoc Reader diff --git a/jcore-msdoc-reader/src/main/resources/de/julielab/jcore/reader/msdoc/desc/jcore-msdoc-reader.xml b/jcore-msdoc-reader/src/main/resources/de/julielab/jcore/reader/msdoc/desc/jcore-msdoc-reader.xml index 18a03952b..146d1f488 100644 --- a/jcore-msdoc-reader/src/main/resources/de/julielab/jcore/reader/msdoc/desc/jcore-msdoc-reader.xml +++ b/jcore-msdoc-reader/src/main/resources/de/julielab/jcore/reader/msdoc/desc/jcore-msdoc-reader.xml @@ -5,7 +5,7 @@ JCoRe MSdoc Reader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIELab Jena, Germany diff --git a/jcore-mstparser-ae/component.meta b/jcore-mstparser-ae/component.meta index d58972c4e..ba2e43335 100644 --- a/jcore-mstparser-ae/component.meta +++ b/jcore-mstparser-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-mstparser-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe MST Parser AE" } diff --git a/jcore-mstparser-ae/pom.xml b/jcore-mstparser-ae/pom.xml index ddbf1449e..83f9017af 100644 --- a/jcore-mstparser-ae/pom.xml +++ b/jcore-mstparser-ae/pom.xml @@ -54,7 +54,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 
2.6.0-SNAPSHOT .. diff --git a/jcore-mstparser-ae/src/main/resources/de/julielab/jcore/ae/mstparser/desc/jcore-mstparser.xml b/jcore-mstparser-ae/src/main/resources/de/julielab/jcore/ae/mstparser/desc/jcore-mstparser.xml index 36985423b..36ef089e1 100644 --- a/jcore-mstparser-ae/src/main/resources/de/julielab/jcore/ae/mstparser/desc/jcore-mstparser.xml +++ b/jcore-mstparser-ae/src/main/resources/de/julielab/jcore/ae/mstparser/desc/jcore-mstparser.xml @@ -6,7 +6,7 @@ JCoRe MST Parser Annotator - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-mstparser-ae/src/test/resources/de/julielab/jcore/ae/mstparser/desc/MSTParserDescriptorTest.xml b/jcore-mstparser-ae/src/test/resources/de/julielab/jcore/ae/mstparser/desc/MSTParserDescriptorTest.xml index a9b0d6b0e..9442a4955 100644 --- a/jcore-mstparser-ae/src/test/resources/de/julielab/jcore/ae/mstparser/desc/MSTParserDescriptorTest.xml +++ b/jcore-mstparser-ae/src/test/resources/de/julielab/jcore/ae/mstparser/desc/MSTParserDescriptorTest.xml @@ -6,7 +6,7 @@ JCoRe MST Parser Annotator - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-muc7-reader/component.meta b/jcore-muc7-reader/component.meta index 882b76c87..7e16b6b2c 100644 --- a/jcore-muc7-reader/component.meta +++ b/jcore-muc7-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-muc7-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe MUC7 Reader" } diff --git a/jcore-muc7-reader/pom.xml b/jcore-muc7-reader/pom.xml index aeb5a81b5..a1461b459 100644 --- a/jcore-muc7-reader/pom.xml +++ b/jcore-muc7-reader/pom.xml @@ -13,7 +13,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-muc7-reader/src/main/resources/de/julielab/jcore/reader/muc7/desc/jcore-muc7-reader.xml b/jcore-muc7-reader/src/main/resources/de/julielab/jcore/reader/muc7/desc/jcore-muc7-reader.xml index 2f6b99cc3..be43fa1c1 100644 --- 
a/jcore-muc7-reader/src/main/resources/de/julielab/jcore/reader/muc7/desc/jcore-muc7-reader.xml +++ b/jcore-muc7-reader/src/main/resources/de/julielab/jcore/reader/muc7/desc/jcore-muc7-reader.xml @@ -5,7 +5,7 @@ JCoRe MUC7 Reader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-muc7-reader/src/test/resources/de/julielab/jcore/reader/muc7/desc/jcore-muc7-reader.xml b/jcore-muc7-reader/src/test/resources/de/julielab/jcore/reader/muc7/desc/jcore-muc7-reader.xml index 87e9f1679..e089a5ab2 100644 --- a/jcore-muc7-reader/src/test/resources/de/julielab/jcore/reader/muc7/desc/jcore-muc7-reader.xml +++ b/jcore-muc7-reader/src/test/resources/de/julielab/jcore/reader/muc7/desc/jcore-muc7-reader.xml @@ -5,7 +5,7 @@ JCoRe MUC7 Reader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-mutationfinder-ae/component.meta b/jcore-mutationfinder-ae/component.meta index c0df6eb43..6a13f809a 100644 --- a/jcore-mutationfinder-ae/component.meta +++ b/jcore-mutationfinder-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-mutationfinder-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Mutation Finder AE" } diff --git a/jcore-mutationfinder-ae/pom.xml b/jcore-mutationfinder-ae/pom.xml index bc0ff3ecb..62b3a5d5b 100644 --- a/jcore-mutationfinder-ae/pom.xml +++ b/jcore-mutationfinder-ae/pom.xml @@ -5,7 +5,7 @@ jcore-base de.julielab - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT 4.0.0 JCoRe Mutation Finder AE diff --git a/jcore-mutationfinder-ae/src/main/resources/de/julielab/jcore/ae/mutationfinder/desc/jcore-mutationfinder-ae.xml b/jcore-mutationfinder-ae/src/main/resources/de/julielab/jcore/ae/mutationfinder/desc/jcore-mutationfinder-ae.xml index d43c2caba..4bde7de35 100644 --- a/jcore-mutationfinder-ae/src/main/resources/de/julielab/jcore/ae/mutationfinder/desc/jcore-mutationfinder-ae.xml +++ 
b/jcore-mutationfinder-ae/src/main/resources/de/julielab/jcore/ae/mutationfinder/desc/jcore-mutationfinder-ae.xml @@ -7,7 +7,7 @@ JCoRe Mutation Annotator An analysis engine to recognize mentions of gene point mutations in document text. This is a wrapper around the original MutationFinder (http://mutationfinder.sourceforge.net/), published in the following paper: MutationFinder: A high-performance system for extracting point mutation mentions from text J. Gregory Caporaso, William A. Baumgartner Jr., David A. Randolph, K. Bretonnel Cohen, and Lawrence Hunter; Bioinformatics, 2007 23(14):1862-1865; doi:10.1093/bioinformatics/btm235; - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab, Germany diff --git a/jcore-neo4j-relations-consumer/pom.xml b/jcore-neo4j-relations-consumer/pom.xml index e83c89ced..b7a2bf83c 100644 --- a/jcore-neo4j-relations-consumer/pom.xml +++ b/jcore-neo4j-relations-consumer/pom.xml @@ -10,10 +10,9 @@ de.julielab jcore-base - 2.3.0-SNAPSHOT + 2.6.0-SNAPSHOT - 2.3.0-SNAPSHOT @@ -28,13 +27,18 @@ de.julielab jcore-types - ${jcore-version} + ${jcore-types-version} de.julielab julielab-neo4j-plugins-concepts-representation 3.0.0-SNAPSHOT + + de.julielab + jcore-utilities + ${jcore-utilities-version} + org.neo4j.test neo4j-harness @@ -47,6 +51,14 @@ 3.0.0-SNAPSHOT test + + org.assertj + assertj-core + + + de.julielab + jcore-descriptor-creator + JCoRe Neo4j Relations Consumer diff --git a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java index 4c4670d97..9b9a6dddc 100644 --- a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java +++ b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java @@ -6,9 +6,11 @@ import com.fasterxml.jackson.databind.ObjectMapper; 
import com.google.common.collect.HashMultiset; import com.google.common.collect.Multiset; +import de.julielab.jcore.types.ArgumentMention; import de.julielab.jcore.types.ConceptMention; import de.julielab.jcore.types.ResourceEntry; import de.julielab.jcore.types.ext.FlattenedRelation; +import de.julielab.jcore.utility.JCoReTools; import de.julielab.neo4j.plugins.datarepresentation.ImportIERelation; import de.julielab.neo4j.plugins.datarepresentation.ImportIERelationArgument; import de.julielab.neo4j.plugins.datarepresentation.ImportIERelationDocument; @@ -85,13 +87,16 @@ public void process(final JCas aJCas) { private ImportIERelationDocument convertRelations(JCas aJCas) { Map> relationCounts = getEquivalentRelationGroups(aJCas); ImportIERelationDocument relDoc = new ImportIERelationDocument(); + relDoc.setDb(false); + relDoc.setName(JCoReTools.getDocId(aJCas)); ImportIETypedRelations typedRelations = new ImportIETypedRelations(); for (String relationType : relationCounts.keySet()) { Multiset unificationRelations = relationCounts.get(relationType); List ieRelations4relationType = new ArrayList<>(); - for (UnificationRelation rel : unificationRelations) { + for (UnificationRelation rel : unificationRelations.elementSet()) { ieRelations4relationType.add(rel.toImportRelation(unificationRelations.count(rel))); } + typedRelations.put(relationType, ieRelations4relationType); } relDoc.setRelations(typedRelations); return relDoc; @@ -151,7 +156,11 @@ private void sendRelationsToNeo4j() throws AnalysisEngineProcessException { private Map> getEquivalentRelationGroups(JCas aJCas) { Map> relationCounts = new HashMap<>(); for (FlattenedRelation fr : aJCas.getAnnotationIndex(FlattenedRelation.type)) { - Iterator cmIt = StreamSupport.stream(fr.getArguments().spliterator(), false).map(ConceptMention.class::cast).iterator(); + Iterator cmIt = StreamSupport.stream(fr.getArguments().spliterator(), false) + .map(ArgumentMention.class::cast) + .map(ArgumentMention::getRef) + 
.map(ConceptMention.class::cast) + .iterator(); Set unificationArgs = new HashSet<>(); while (cmIt.hasNext()) { ConceptMention cm = cmIt.next(); @@ -184,7 +193,7 @@ public UnificationRelation(String relationType, Set args) { } public ImportIERelation toImportRelation(int count) { - return ImportIERelation.of(count, (Iterable) args.stream().map(UnificationArgument::toImportArgument).iterator()); + return ImportIERelation.of(count, () -> args.stream().map(UnificationArgument::toImportArgument).iterator()); } public String getRelationType() { diff --git a/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/PLACEHOLDER b/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/PLACEHOLDER deleted file mode 100644 index 9f6c6ddb5..000000000 --- a/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/PLACEHOLDER +++ /dev/null @@ -1 +0,0 @@ -The actual descriptor must be created by UIMA fit. 
diff --git a/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml b/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml index a0eadea2f..1119cc5ef 100644 --- a/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml +++ b/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml @@ -1,21 +1,81 @@ - org.apache.uima.java - true - Neo4jRelationsConsumer - - JCoRe Neo4j Relations Consumer - - 2.3.0-SNAPSHOT - JULIE Lab Jena, Germany - - - - - - true - true - false - - - + org.apache.uima.java + true + de.julielab.jcore.consumer.neo4jrelations.Neo4jRelationsConsumer + + JCoRe Neo4j Relations Consumer + This component assumes that a Neo4j server with an installed julieliab-neo4j-plugins-concepts plugin installed. It then sends FlattenedRelation instances with more then one arguments to Neo4j. Note that this requires the event arguments to have a ResourceEntry list to obtain database concept IDs from. + 2.6.0-SNAPSHOT + JULIE Lab, Germany + JULIE Lab + + + URL + The complete URL to the endpoint of the Neo4j server for relation insertion. + String + false + true + + + IdProperty + The ID property to look up concept nodes in the Neo4j graph. Common options are 'id', 'sourceIds' and 'originalId'. You must know to which ID type the ResourceEntry objects of the relation arguments refer to. + String + false + true + + + ConceptSource + Optional. Sets the global source for the concept IDs taken from the ResourceEntry instances of the relation arguments. This causes the 'source' feature of the ResourceEntry objects to be omitted and to globally use the specified source instead. 
This causes the Neo4j database plugin to resolve the provided argument IDs against the source specified here. + String + false + false + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + de.julielab.jcore.types.EventMention + + + + + + + true + true + false + + + \ No newline at end of file diff --git a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java index 41d24b178..f40f929b2 100644 --- a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java +++ b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java @@ -1,15 +1,24 @@ package de.julielab.jcore.consumer.neo4jrelations; -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.fit.factory.AnalysisEngineFactory; +import de.julielab.jcore.types.ArgumentMention; +import de.julielab.jcore.types.ConceptMention; +import de.julielab.jcore.types.EventMention; +import de.julielab.jcore.types.ResourceEntry; +import de.julielab.jcore.types.ext.FlattenedRelation; +import de.julielab.jcore.utility.JCoReTools; +import de.julielab.neo4j.plugins.datarepresentation.ImportIERelation; +import de.julielab.neo4j.plugins.datarepresentation.ImportIERelationArgument; +import de.julielab.neo4j.plugins.datarepresentation.ImportIERelationDocument; import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.factory.UimaContextFactory; import org.apache.uima.jcas.JCas; import org.junit.Test; -import java.io.IOException; +import java.lang.reflect.Method; +import java.util.List; +import static org.assertj.core.api.Assertions.assertThat; /** * Unit tests for jcore-neo4j-relations-consumer. 
@@ -19,10 +28,113 @@ public class Neo4jRelationsConsumerTest { @Test - public void insertEventMentions() throws UIMAException, IOException { - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types"); - AnalysisEngine engine = AnalysisEngineFactory.createEngine("de.julielab.jcore.consumer.neo4jrelations.desc.jcore-neo4j-relations-consumer", Neo4jRelationsConsumer.PARAM_URL, ""); + public void insertEventMentions() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.extensions.jcore-semantics-mention-extension-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types"); + Neo4jRelationsConsumer engine = new Neo4jRelationsConsumer(); + engine.initialize(UimaContextFactory.createUimaContext(Neo4jRelationsConsumer.PARAM_URL, "", Neo4jRelationsConsumer.PARAM_ID_PROPERTY, "sourceIds")); + addFlattenedRelation1ToCas(jCas); + // Here is a duplicate. 
It should be recognized and just be counted up + addFlattenedRelation2ToCas(jCas); + addFlattenedRelation2ToCas(jCas); + Method m = Neo4jRelationsConsumer.class.getDeclaredMethod("convertRelations", JCas.class); + m.setAccessible(true); + ImportIERelationDocument relations = (ImportIERelationDocument) m.invoke(engine, jCas); + assertThat(relations).extracting(ImportIERelationDocument::getRelations).isNotNull(); + assertThat(relations.getRelations()).hasSize(1); + List regulations = relations.getRelations().get("regulation"); + assertThat(regulations).hasSize(2); + assertThat(regulations.get(0)).extracting(ImportIERelation::getCount).isEqualTo(1); + assertThat(regulations.get(1)).extracting(ImportIERelation::getCount).isEqualTo(2); + assertThat(regulations).flatExtracting(ImportIERelation::getArgs).flatExtracting(ImportIERelationArgument::getId).containsExactlyInAnyOrder("id11", "id12", "id13", "id21", "id22"); + assertThat(regulations).flatExtracting(ImportIERelation::getArgs).flatExtracting(ImportIERelationArgument::getSource).containsExactlyInAnyOrder("source11", "source12", "source13", "source21", "source22"); + } + + @Test + public void insertEventMentionsGlobalSource() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.extensions.jcore-semantics-mention-extension-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types"); + Neo4jRelationsConsumer engine = new Neo4jRelationsConsumer(); + engine.initialize(UimaContextFactory.createUimaContext(Neo4jRelationsConsumer.PARAM_URL, "", Neo4jRelationsConsumer.PARAM_ID_PROPERTY, "sourceIds", Neo4jRelationsConsumer.PARAM_SOURCE, "globalSource")); + addFlattenedRelation1ToCas(jCas); + addFlattenedRelation2ToCas(jCas); + Method m = Neo4jRelationsConsumer.class.getDeclaredMethod("convertRelations", JCas.class); + m.setAccessible(true); + ImportIERelationDocument relations = (ImportIERelationDocument) m.invoke(engine, jCas); + 
assertThat(relations).extracting(ImportIERelationDocument::getRelations).isNotNull(); + assertThat(relations.getRelations()).hasSize(1); + List regulations = relations.getRelations().get("regulation"); + assertThat(regulations).hasSize(2); + // With the global source set, the individual sources are left out + assertThat(regulations).flatExtracting(ImportIERelation::getArgs).flatExtracting(ImportIERelationArgument::getSource).containsExactlyInAnyOrder(null, null, null, null, null); } + + /** + * Adds a FlattenedRelation with three arguments. + * @param jCas The CAS. + */ + private void addFlattenedRelation1ToCas(JCas jCas) { + FlattenedRelation fr = new FlattenedRelation(jCas); + EventMention rootEm = new EventMention(jCas); + rootEm.setSpecificType("regulation"); + fr.setRootRelation(rootEm); + + ArgumentMention am1 = new ArgumentMention(jCas); + ConceptMention cm1 = new ConceptMention(jCas); + ResourceEntry re1 = new ResourceEntry(jCas); + re1.setEntryId("id11"); + re1.setSource("source11"); + cm1.setResourceEntryList(JCoReTools.addToFSArray(null, re1)); + am1.setRef(cm1); + + ArgumentMention am2 = new ArgumentMention(jCas); + ConceptMention cm2 = new ConceptMention(jCas); + ResourceEntry re2 = new ResourceEntry(jCas); + re2.setEntryId("id12"); + re2.setSource("source12"); + cm2.setResourceEntryList(JCoReTools.addToFSArray(null, re2)); + am2.setRef(cm2); + + ArgumentMention am3 = new ArgumentMention(jCas); + ConceptMention cm3 = new ConceptMention(jCas); + ResourceEntry re3 = new ResourceEntry(jCas); + re3.setEntryId("id13"); + re3.setSource("source13"); + cm3.setResourceEntryList(JCoReTools.addToFSArray(null, re3)); + am3.setRef(cm3); + + fr.setArguments(JCoReTools.addToFSArray(null, List.of(am1, am2, am3))); + fr.addToIndexes(); + } + + /** + * Adds a FlattenedRelation with two arguments. + * @param jCas The CAS. 
+ */ + private void addFlattenedRelation2ToCas(JCas jCas) { + FlattenedRelation fr = new FlattenedRelation(jCas); + EventMention rootEm = new EventMention(jCas); + rootEm.setSpecificType("regulation"); + fr.setRootRelation(rootEm); + + ArgumentMention am1 = new ArgumentMention(jCas); + ConceptMention cm1 = new ConceptMention(jCas); + ResourceEntry re1 = new ResourceEntry(jCas); + re1.setEntryId("id21"); + re1.setSource("source21"); + cm1.setResourceEntryList(JCoReTools.addToFSArray(null, re1)); + am1.setRef(cm1); + + ArgumentMention am2 = new ArgumentMention(jCas); + ConceptMention cm2 = new ConceptMention(jCas); + ResourceEntry re2 = new ResourceEntry(jCas); + re2.setEntryId("id22"); + re2.setSource("source22"); + cm2.setResourceEntryList(JCoReTools.addToFSArray(null, re2)); + am2.setRef(cm2); + + fr.setArguments(JCoReTools.addToFSArray(null, List.of(am1, am2))); + fr.addToIndexes(); + } + } diff --git a/jcore-opennlp-chunk-ae/component.meta b/jcore-opennlp-chunk-ae/component.meta index 202885b41..5254bb51d 100644 --- a/jcore-opennlp-chunk-ae/component.meta +++ b/jcore-opennlp-chunk-ae/component.meta @@ -9,7 +9,7 @@ "maven-artifact": { "artifactId": "jcore-opennlp-chunk-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Open NLP Chunker" } diff --git a/jcore-opennlp-chunk-ae/pom.xml b/jcore-opennlp-chunk-ae/pom.xml index d691531cd..1ec6f1917 100644 --- a/jcore-opennlp-chunk-ae/pom.xml +++ b/jcore-opennlp-chunk-ae/pom.xml @@ -14,7 +14,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-opennlp-chunk-ae/src/test/resources/ChunkAnnotatorTest.xml b/jcore-opennlp-chunk-ae/src/test/resources/ChunkAnnotatorTest.xml index a9d5953cf..dc8612e2d 100644 --- a/jcore-opennlp-chunk-ae/src/test/resources/ChunkAnnotatorTest.xml +++ b/jcore-opennlp-chunk-ae/src/test/resources/ChunkAnnotatorTest.xml @@ -6,7 +6,7 @@ jcore-openlp-chunk-ae - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT julielab diff --git 
a/jcore-opennlp-chunk-ae/src/test/resources/ChunkAnnotatorTestDefaultMappings.xml b/jcore-opennlp-chunk-ae/src/test/resources/ChunkAnnotatorTestDefaultMappings.xml index 8d522d208..9f2a29ee1 100644 --- a/jcore-opennlp-chunk-ae/src/test/resources/ChunkAnnotatorTestDefaultMappings.xml +++ b/jcore-opennlp-chunk-ae/src/test/resources/ChunkAnnotatorTestDefaultMappings.xml @@ -6,7 +6,7 @@ jcore-openlp-chunk-ae - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT julielab diff --git a/jcore-opennlp-parser-ae/component.meta b/jcore-opennlp-parser-ae/component.meta index 8233a2b6f..c73a0bec7 100644 --- a/jcore-opennlp-parser-ae/component.meta +++ b/jcore-opennlp-parser-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-opennlp-parser-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe OpenNLP Constituency Parser" } diff --git a/jcore-opennlp-parser-ae/pom.xml b/jcore-opennlp-parser-ae/pom.xml index 87af33491..5a2ef3229 100644 --- a/jcore-opennlp-parser-ae/pom.xml +++ b/jcore-opennlp-parser-ae/pom.xml @@ -12,7 +12,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-opennlp-parser-ae/src/main/resources/de/julielab/jcore/ae/opennlpparser/desc/jcore-opennlpparser.xml b/jcore-opennlp-parser-ae/src/main/resources/de/julielab/jcore/ae/opennlpparser/desc/jcore-opennlpparser.xml index ca499d279..afa247920 100644 --- a/jcore-opennlp-parser-ae/src/main/resources/de/julielab/jcore/ae/opennlpparser/desc/jcore-opennlpparser.xml +++ b/jcore-opennlp-parser-ae/src/main/resources/de/julielab/jcore/ae/opennlpparser/desc/jcore-opennlpparser.xml @@ -6,7 +6,7 @@ JCoRe OpenNLP Constituency Parser AE -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-opennlp-parser-ae/src/test/resources/de/julielab/jcore/ae/opennlpparser/desc/jcore-opennlpparser-test.xml b/jcore-opennlp-parser-ae/src/test/resources/de/julielab/jcore/ae/opennlpparser/desc/jcore-opennlpparser-test.xml index 
5943431f4..60e9d9e45 100644 --- a/jcore-opennlp-parser-ae/src/test/resources/de/julielab/jcore/ae/opennlpparser/desc/jcore-opennlpparser-test.xml +++ b/jcore-opennlp-parser-ae/src/test/resources/de/julielab/jcore/ae/opennlpparser/desc/jcore-opennlpparser-test.xml @@ -6,7 +6,7 @@ JCoRe OpenNLP Parser Test -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-opennlp-postag-ae/component.meta b/jcore-opennlp-postag-ae/component.meta index 4f3b87ffb..4996af5fb 100644 --- a/jcore-opennlp-postag-ae/component.meta +++ b/jcore-opennlp-postag-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-opennlp-postag-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe OpenNLP POS Tagger" } diff --git a/jcore-opennlp-postag-ae/pom.xml b/jcore-opennlp-postag-ae/pom.xml index 77abc3243..cadd08079 100644 --- a/jcore-opennlp-postag-ae/pom.xml +++ b/jcore-opennlp-postag-ae/pom.xml @@ -12,7 +12,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-opennlp-postag-ae/src/main/resources/de/julielab/jcore/ae/opennlppostag/desc/jcore-opennlppostag.xml b/jcore-opennlp-postag-ae/src/main/resources/de/julielab/jcore/ae/opennlppostag/desc/jcore-opennlppostag.xml index 9a7640c32..ce2228cca 100644 --- a/jcore-opennlp-postag-ae/src/main/resources/de/julielab/jcore/ae/opennlppostag/desc/jcore-opennlppostag.xml +++ b/jcore-opennlp-postag-ae/src/main/resources/de/julielab/jcore/ae/opennlppostag/desc/jcore-opennlppostag.xml @@ -6,7 +6,7 @@ JCoRe OpenNLP POS Tagger - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-opennlp-postag-ae/src/test/resources/PosTagAnnotatorTest.xml b/jcore-opennlp-postag-ae/src/test/resources/PosTagAnnotatorTest.xml index 1c6b115ca..ff5e2768a 100644 --- a/jcore-opennlp-postag-ae/src/test/resources/PosTagAnnotatorTest.xml +++ b/jcore-opennlp-postag-ae/src/test/resources/PosTagAnnotatorTest.xml @@ -6,7 +6,7 @@ 
jcore-opennlp-postag-ae - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-opennlp-sentence-ae/component.meta b/jcore-opennlp-sentence-ae/component.meta index 33e67cb8d..7980c80b1 100644 --- a/jcore-opennlp-sentence-ae/component.meta +++ b/jcore-opennlp-sentence-ae/component.meta @@ -9,7 +9,7 @@ "maven-artifact": { "artifactId": "jcore-opennlp-sentence-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe OpenNLP Sentence Splitter" } diff --git a/jcore-opennlp-sentence-ae/pom.xml b/jcore-opennlp-sentence-ae/pom.xml index d2e778487..c1c0c2b03 100644 --- a/jcore-opennlp-sentence-ae/pom.xml +++ b/jcore-opennlp-sentence-ae/pom.xml @@ -12,7 +12,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-opennlp-sentence-ae/src/test/resources/SentenceAnnotatorTest.xml b/jcore-opennlp-sentence-ae/src/test/resources/SentenceAnnotatorTest.xml index 127ce56d8..249d1030e 100644 --- a/jcore-opennlp-sentence-ae/src/test/resources/SentenceAnnotatorTest.xml +++ b/jcore-opennlp-sentence-ae/src/test/resources/SentenceAnnotatorTest.xml @@ -6,7 +6,7 @@ jcore-opennlp-sentence-ae sentence splitter based on opennlp -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT diff --git a/jcore-opennlp-token-ae/component.meta b/jcore-opennlp-token-ae/component.meta index 373b7c246..f394a600a 100644 --- a/jcore-opennlp-token-ae/component.meta +++ b/jcore-opennlp-token-ae/component.meta @@ -9,7 +9,7 @@ "maven-artifact": { "artifactId": "jcore-opennlp-token-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe OpenNLP Tokenizer" } diff --git a/jcore-opennlp-token-ae/desc/TokenAnnotator.xml b/jcore-opennlp-token-ae/desc/TokenAnnotator.xml index a8eecd2b1..ea840ac0c 100644 --- a/jcore-opennlp-token-ae/desc/TokenAnnotator.xml +++ b/jcore-opennlp-token-ae/desc/TokenAnnotator.xml @@ -6,7 +6,7 @@ jcore-opennlp-token-ae -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT diff --git a/jcore-opennlp-token-ae/pom.xml 
b/jcore-opennlp-token-ae/pom.xml index 3145d63a6..306972531 100644 --- a/jcore-opennlp-token-ae/pom.xml +++ b/jcore-opennlp-token-ae/pom.xml @@ -12,7 +12,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-opennlp-token-ae/src/test/resources/TokenAnnotatorTest.xml b/jcore-opennlp-token-ae/src/test/resources/TokenAnnotatorTest.xml index 2ab75743c..749f145df 100644 --- a/jcore-opennlp-token-ae/src/test/resources/TokenAnnotatorTest.xml +++ b/jcore-opennlp-token-ae/src/test/resources/TokenAnnotatorTest.xml @@ -6,7 +6,7 @@ jcore-opennlp-token-ae -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT diff --git a/jcore-pmc-reader/component.meta b/jcore-pmc-reader/component.meta index b71a1930d..6cfbb0efc 100644 --- a/jcore-pmc-reader/component.meta +++ b/jcore-pmc-reader/component.meta @@ -23,7 +23,7 @@ "maven-artifact": { "artifactId": "jcore-pmc-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe PubMed Central Reader" } diff --git a/jcore-pmc-reader/pom.xml b/jcore-pmc-reader/pom.xml index 3f545c1a4..976a1b456 100644 --- a/jcore-pmc-reader/pom.xml +++ b/jcore-pmc-reader/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-multiplier.xml b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-multiplier.xml index dba9b5af0..aafb02d0a 100644 --- a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-multiplier.xml +++ b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-multiplier.xml @@ -8,7 +8,7 @@ This multiplier expect to receive URIs to NXML documents in the form of JCoReURI feature structures. All JCoReURI FS in the annotation indexes are read and output as new CASes. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-multiplier-reader.xml b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-multiplier-reader.xml index 88d0d6c73..224b668eb 100644 --- a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-multiplier-reader.xml +++ b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-multiplier-reader.xml @@ -5,7 +5,7 @@ JCoRe Pubmed Central Multiplier Reader Reads a directory of NXML files, possibly assembled into ZIP archives. Requires the Pubmed Central Multiplier to follow in the pipeline. This reader only sends URIs referencing the NXML files to the multiplier that then does the parsing. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT SendCasToLast diff --git a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-reader.xml b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-reader.xml index 74eee3a1a..284f41cdd 100644 --- a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-reader.xml +++ b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-reader.xml @@ -5,7 +5,7 @@ JCoRe Pubmed Central Reader Reads Pubmed Central documents from the NXML format - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT Input diff --git a/jcore-ppd-writer/component.meta b/jcore-ppd-writer/component.meta index d10916db5..9264bc68e 100644 --- a/jcore-ppd-writer/component.meta +++ b/jcore-ppd-writer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-ppd-writer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Piped Format Writer" } diff --git a/jcore-ppd-writer/pom.xml b/jcore-ppd-writer/pom.xml index 6009a4286..8e409735b 100644 --- a/jcore-ppd-writer/pom.xml +++ b/jcore-ppd-writer/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 
2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-ppd-writer/src/main/resources/de/julielab/jcore/consumer/ppd/desc/jcore-ppd-writer.xml b/jcore-ppd-writer/src/main/resources/de/julielab/jcore/consumer/ppd/desc/jcore-ppd-writer.xml index de2470406..fdf9505bf 100644 --- a/jcore-ppd-writer/src/main/resources/de/julielab/jcore/consumer/ppd/desc/jcore-ppd-writer.xml +++ b/jcore-ppd-writer/src/main/resources/de/julielab/jcore/consumer/ppd/desc/jcore-ppd-writer.xml @@ -6,7 +6,7 @@ JCoRe PPD Writer This component writes CAS annotation data to the pipe-separated format. For example, writing tokens with their PoS would result in text like 'The|DET tree|NN is|VBZ green|ADJ'. The component can be configured for an arbitrary number of annotations to be added to each token. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT TypeToLabelMappings diff --git a/jcore-pubtator-reader/component.meta b/jcore-pubtator-reader/component.meta index 591a4acb5..ea6504c28 100644 --- a/jcore-pubtator-reader/component.meta +++ b/jcore-pubtator-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-pubtator-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe PubTator Reader" } diff --git a/jcore-pubtator-reader/pom.xml b/jcore-pubtator-reader/pom.xml index bd57f680f..84661f424 100644 --- a/jcore-pubtator-reader/pom.xml +++ b/jcore-pubtator-reader/pom.xml @@ -12,7 +12,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT .. 
diff --git a/jcore-pubtator-reader/src/main/resources/de/julielab/jcore/reader/pubtator/desc/jcore-pubtator-reader.xml b/jcore-pubtator-reader/src/main/resources/de/julielab/jcore/reader/pubtator/desc/jcore-pubtator-reader.xml index 82cd90174..76985dfd7 100644 --- a/jcore-pubtator-reader/src/main/resources/de/julielab/jcore/reader/pubtator/desc/jcore-pubtator-reader.xml +++ b/jcore-pubtator-reader/src/main/resources/de/julielab/jcore/reader/pubtator/desc/jcore-pubtator-reader.xml @@ -5,7 +5,7 @@ jcore-pubtator-reader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-stanford-lemmatizer-ae/component.meta b/jcore-stanford-lemmatizer-ae/component.meta index 872ac0bdd..2da48cfa7 100644 --- a/jcore-stanford-lemmatizer-ae/component.meta +++ b/jcore-stanford-lemmatizer-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-stanford-lemmatizer-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Stanford Lemmatizer" } diff --git a/jcore-stanford-lemmatizer-ae/pom.xml b/jcore-stanford-lemmatizer-ae/pom.xml index 99e888260..39eda0c8b 100644 --- a/jcore-stanford-lemmatizer-ae/pom.xml +++ b/jcore-stanford-lemmatizer-ae/pom.xml @@ -5,7 +5,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT jcore-stanford-lemmatizer-ae JCoRe Stanford Lemmatizer diff --git a/jcore-stanford-lemmatizer-ae/src/main/resources/de/julielab/jcore/ae/stanford/lemma/desc/jcore-stanford-lemmatizer.xml b/jcore-stanford-lemmatizer-ae/src/main/resources/de/julielab/jcore/ae/stanford/lemma/desc/jcore-stanford-lemmatizer.xml index 47dec0beb..2e4b3d99a 100644 --- a/jcore-stanford-lemmatizer-ae/src/main/resources/de/julielab/jcore/ae/stanford/lemma/desc/jcore-stanford-lemmatizer.xml +++ b/jcore-stanford-lemmatizer-ae/src/main/resources/de/julielab/jcore/ae/stanford/lemma/desc/jcore-stanford-lemmatizer.xml @@ -5,7 +5,7 @@ JCoRe Stanford Lemmatizer This is the UIMA Wrapper for the Stanford CoreNLP Lemmatizer component. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-stanford-lemmatizer-ae/src/test/resources/de/julielab/jcore/ae/stanford/lemma/desc/jcore-stanford-lemmatizer-ae.xml b/jcore-stanford-lemmatizer-ae/src/test/resources/de/julielab/jcore/ae/stanford/lemma/desc/jcore-stanford-lemmatizer-ae.xml index ca8ce0703..653ecbe8e 100644 --- a/jcore-stanford-lemmatizer-ae/src/test/resources/de/julielab/jcore/ae/stanford/lemma/desc/jcore-stanford-lemmatizer-ae.xml +++ b/jcore-stanford-lemmatizer-ae/src/test/resources/de/julielab/jcore/ae/stanford/lemma/desc/jcore-stanford-lemmatizer-ae.xml @@ -7,7 +7,7 @@ jcore-stanford-lemmatizer-ae This is the UIMA Wrapper for the Stanford CoreNLP Lemmatizer component. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-topic-indexing-ae/component.meta b/jcore-topic-indexing-ae/component.meta index 03a0d63b9..c6cdef338 100644 --- a/jcore-topic-indexing-ae/component.meta +++ b/jcore-topic-indexing-ae/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-topic-indexing-ae", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe-Topic-Labeling-AE" } diff --git a/jcore-topic-indexing-ae/pom.xml b/jcore-topic-indexing-ae/pom.xml index 99e1c0173..b378f818d 100644 --- a/jcore-topic-indexing-ae/pom.xml +++ b/jcore-topic-indexing-ae/pom.xml @@ -9,7 +9,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT @@ -53,7 +53,7 @@ de.julielab jcore-xmi-reader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT test diff --git a/jcore-topic-indexing-ae/src/main/resources/de/julielab/jcore/ae/topicindexing/desc/jcore-topic-indexing-ae.xml b/jcore-topic-indexing-ae/src/main/resources/de/julielab/jcore/ae/topicindexing/desc/jcore-topic-indexing-ae.xml index 95d04054e..6db4c1c87 100644 --- a/jcore-topic-indexing-ae/src/main/resources/de/julielab/jcore/ae/topicindexing/desc/jcore-topic-indexing-ae.xml +++ 
b/jcore-topic-indexing-ae/src/main/resources/de/julielab/jcore/ae/topicindexing/desc/jcore-topic-indexing-ae.xml @@ -6,7 +6,7 @@ JCoRe Topic Indexer This component assigns topics relative to a given topic model to the encoutered documents. The topic model is one trained by the julielab-topic-modeling project. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT TopicModelConfig diff --git a/jcore-topics-writer/component.meta b/jcore-topics-writer/component.meta index c98a40a2e..32ac48b74 100644 --- a/jcore-topics-writer/component.meta +++ b/jcore-topics-writer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-topics-writer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Topics Writer" } diff --git a/jcore-topics-writer/pom.xml b/jcore-topics-writer/pom.xml index ad9569a47..19752ec2e 100644 --- a/jcore-topics-writer/pom.xml +++ b/jcore-topics-writer/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-topics-writer/src/main/resources/de/julielab/jcore/consumer/topics/desc/jcore-topics-writer.xml b/jcore-topics-writer/src/main/resources/de/julielab/jcore/consumer/topics/desc/jcore-topics-writer.xml index 47a3c1bb6..23aab97c9 100644 --- a/jcore-topics-writer/src/main/resources/de/julielab/jcore/consumer/topics/desc/jcore-topics-writer.xml +++ b/jcore-topics-writer/src/main/resources/de/julielab/jcore/consumer/topics/desc/jcore-topics-writer.xml @@ -6,7 +6,7 @@ JCoRe Topics Writer Writes the topic weights, given the jcore-topic-indexing-ae running before, into a simple text file. Thus, the output consists of a sequency of double numbers encodes as strings, separated by tab characters. The topic ID is just the 0-based index of each number, from left to right in the written file. The first entry of each file is the document ID. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT OutputDirectory diff --git a/jcore-txt-consumer/component.meta b/jcore-txt-consumer/component.meta index 6cf58e0d6..c10e83c2c 100644 --- a/jcore-txt-consumer/component.meta +++ b/jcore-txt-consumer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-txt-consumer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe TXT Consumer" } diff --git a/jcore-txt-consumer/pom.xml b/jcore-txt-consumer/pom.xml index bf6de2d14..07b878cab 100644 --- a/jcore-txt-consumer/pom.xml +++ b/jcore-txt-consumer/pom.xml @@ -4,7 +4,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT jcore-txt-consumer JCoRe TXT Consumer diff --git a/jcore-txt-consumer/src/main/resources/de/julielab/jcore/consumer/txt/desc/jcore-txt-consumer.xml b/jcore-txt-consumer/src/main/resources/de/julielab/jcore/consumer/txt/desc/jcore-txt-consumer.xml index 556fbbc5e..55deaaea9 100644 --- a/jcore-txt-consumer/src/main/resources/de/julielab/jcore/consumer/txt/desc/jcore-txt-consumer.xml +++ b/jcore-txt-consumer/src/main/resources/de/julielab/jcore/consumer/txt/desc/jcore-txt-consumer.xml @@ -6,7 +6,7 @@ JCoRe Text Consumer Stores the CAS document text in files. Either in tokenized sentences plus optional PoS tags or just the original document text. The text files can also be stored in GZIP format or batch-wise in ZIP archives. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT de.julielab.jcore.consumer.txt diff --git a/jcore-types/pom.xml b/jcore-types/pom.xml index 6abd932d0..e9571839f 100644 --- a/jcore-types/pom.xml +++ b/jcore-types/pom.xml @@ -8,7 +8,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-dbtable-multiplier-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-dbtable-multiplier-types.xml index 57770ed2a..06aa1902b 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-dbtable-multiplier-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-dbtable-multiplier-types.xml @@ -10,7 +10,7 @@ base document and annotations that have been previously created and stored in separate tables. This is part of the Jena Document Information System, JeDIS. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-uri-multiplier-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-uri-multiplier-types.xml index 7b49b49c9..5f6a3459b 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-uri-multiplier-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-uri-multiplier-types.xml @@ -4,7 +4,7 @@ This is a type system for usage with a CAS multiplier. It should not be included into the jcore-all-types type system. This particular type system holds a single URI that points to the resource that should be split into CASes by the multiplier. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-ace-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-ace-types.xml index e3ba78bce..d5e851681 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-ace-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-ace-types.xml @@ -2,7 +2,7 @@ JCoRe ACE Types The jcore-ace-types TS represents the complete ACE Annotation in CAS format. -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-document-meta-extension-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-document-meta-extension-types.xml index a95f22bfa..115927024 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-document-meta-extension-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-document-meta-extension-types.xml @@ -2,7 +2,7 @@ JCoRe Document Meta Types The types of this type system reflect meta data about documents for rather specific use cases. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-dta-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-dta-types.xml index 387aafda0..89d99ec9a 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-dta-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-dta-types.xml @@ -2,7 +2,7 @@ jcore-dta-types - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-evaluation-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-evaluation-types.xml index bb860b3ec..e02013a5f 100755 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-evaluation-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-evaluation-types.xml @@ -2,7 +2,7 @@ JCoRe Evaluation Types This type system is an extension of the JCoRe type system to cover evaluation Annotations like missing or additional annotations for evaluation purposes. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-mantra-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-mantra-types.xml index 9d5ffa276..32ce435a4 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-mantra-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-mantra-types.xml @@ -2,7 +2,7 @@ JCoRe Mantra Types The type system contains types for working with documents in the context of the MANTRA project. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-medical-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-medical-types.xml index 3fca73bdb..af5a69392 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-medical-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-medical-types.xml @@ -2,7 +2,7 @@ jcore-medical-types - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-mmax-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-mmax-types.xml index 192bf4dc6..fbec38980 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-mmax-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-mmax-types.xml @@ -2,7 +2,7 @@ JCoRe MMAX Types The type system contains types for the import of MMAX2 annotations. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-muc7-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-muc7-types.xml index 39575dacb..8aa7a5303 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-muc7-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-muc7-types.xml @@ -2,7 +2,7 @@ JCoRe MUC7 Type System This type system contains types covering annotations for the MUC7 data. 
-2.5.1-SNAPSHOT +2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-ace-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-ace-types.xml index f2db24b6f..2a2059bf3 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-ace-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-ace-types.xml @@ -2,7 +2,7 @@ JCoRe Semantics ACE Types The type system contains ACE types of the ACE taxonomy. -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-bootstrep-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-bootstrep-types.xml index 2ae6f1df3..a2f40fc70 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-bootstrep-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-bootstrep-types.xml @@ -2,7 +2,7 @@ JCoRe Semantics BOOTStrep Types The type system is an extension of the JCoRe core type system for types required in the context of the BOOTStrep project. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-mention-extension-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-mention-extension-types.xml index 1b67565c2..62cabb2d4 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-mention-extension-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-mention-extension-types.xml @@ -2,7 +2,7 @@ JCoRe Semantics Mention Types Extension JCoRe type extensions to the JCoRe Semantics Mention types. 
Required for some processing or representation, these types do not extend the actual semantics of the core type system. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-stemnet-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-stemnet-types.xml index c213f3f08..68f6711bd 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-stemnet-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-semantics-stemnet-types.xml @@ -2,7 +2,7 @@ JCoRe Semantics StemNet Typs The type system contains types of the StemNet project. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-wikipedia-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-wikipedia-types.xml index 48c8e3b9e..cb9265d5b 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-wikipedia-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-wikipedia-types.xml @@ -2,7 +2,7 @@ JCoRe Wikipedia Types The type system contains types for the annotation of meta information of Wikipedia pages. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-affect-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-affect-types.xml index 10d3a8bb7..69183e809 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-affect-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-affect-types.xml @@ -2,7 +2,7 @@ jcore-affect-types - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-all-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-all-types.xml index fedf2eec7..d2d038014 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-all-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-all-types.xml @@ -2,7 +2,7 @@ JCoRe All Types This is just a convenience file, assembling all JCoRe types - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-basic-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-basic-types.xml index 00003147c..c1105adcc 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-basic-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-basic-types.xml @@ -2,7 +2,7 @@ JCoRe Basic Types The type system contains the basic annotation types. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-discourse-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-discourse-types.xml index d3190b9e5..ab4888c8c 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-discourse-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-discourse-types.xml @@ -2,7 +2,7 @@ JCoRe Discourse Types Discourse types such as coreference relations. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-clinicaltrial-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-clinicaltrial-types.xml index ce908039f..c39e6dd15 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-clinicaltrial-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-clinicaltrial-types.xml @@ -5,7 +5,7 @@ document meta information (bibliographical and content information), especially for PubMed abstracts. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-pubmed-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-pubmed-types.xml index 6363ece45..2deb2853d 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-pubmed-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-pubmed-types.xml @@ -5,7 +5,7 @@ document meta information (bibliographical and content information), especially for PubMed abstracts. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-types.xml index 5a23252bd..2865894e7 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-meta-types.xml @@ -2,7 +2,7 @@ JCoRe Document Meta Types The type system contains types for the annotation of document meta information (bibliographical and content information). 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-clinicaltrial-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-clinicaltrial-types.xml index 082c8e775..77b328da5 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-clinicaltrial-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-clinicaltrial-types.xml @@ -4,7 +4,7 @@ This type system contains document structure types specific to the clinical trails XML format as retrieved from https://clinicaltrials.gov/. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-pubmed-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-pubmed-types.xml index 57ea9b281..b575084d5 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-pubmed-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-pubmed-types.xml @@ -4,7 +4,7 @@ This type system contains document structure types specific to PubMed or MEDLINE, e.g. detailed descriptions of structured abstracts. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-types.xml index 633edd187..5159c11aa 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-document-structure-types.xml @@ -2,7 +2,7 @@ JCoRe Document Structure Types The type system contains the types for the annotation of document sutructure, e.g. titles, abstract text, captions etc. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-morpho-syntax-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-morpho-syntax-types.xml index 1c65aef6f..0ff447c77 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-morpho-syntax-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-morpho-syntax-types.xml @@ -4,7 +4,7 @@ The type system contains types for the annotation of morpho-syntactic and syntactic analysis (constituncy-based and dependecy-based parsing) results. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-biology-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-biology-types.xml index 92f89b23b..e421aa1c6 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-biology-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-biology-types.xml @@ -2,7 +2,7 @@ JCoRe Semantics Biology Types The type system contains types of the biomedical domain. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-concept-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-concept-types.xml index edea7e8ee..7b4b3d008 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-concept-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-concept-types.xml @@ -2,7 +2,7 @@ JCoRe Semantics Concept Types The type system contains core semantic types definitions such as entity, relation and event. 
-2.5.1-SNAPSHOT +2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-mention-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-mention-types.xml index e2ecd9dd4..87718af50 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-mention-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-semantics-mention-types.xml @@ -2,7 +2,7 @@ JCoRe Semantics Mention Types The type system contains core semantic types definitions such as entity, relation and event. The types in this type system refer to actual text occurrences. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/priorities/jcore-type-priorities.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/priorities/jcore-type-priorities.xml index 1373c4eac..53c5d882f 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/priorities/jcore-type-priorities.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/priorities/jcore-type-priorities.xml @@ -2,7 +2,7 @@ jcore-type-priorities - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT de.julielab.jcore.types.Title diff --git a/jcore-utilities/pom.xml b/jcore-utilities/pom.xml index 6395a3b73..aafbe74fe 100644 --- a/jcore-utilities/pom.xml +++ b/jcore-utilities/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-utilities/src/test/resources/AETestDescriptor.xml b/jcore-utilities/src/test/resources/AETestDescriptor.xml index ea2658e15..ab602e7c9 100644 --- a/jcore-utilities/src/test/resources/AETestDescriptor.xml +++ b/jcore-utilities/src/test/resources/AETestDescriptor.xml @@ -6,7 +6,7 @@ JulesToolsDescriptor -2.5.1-SNAPSHOT +2.6.0-SNAPSHOT diff --git a/jcore-xmi-db-reader/component.meta b/jcore-xmi-db-reader/component.meta index d8abdab0f..c7c922807 100644 --- 
a/jcore-xmi-db-reader/component.meta +++ b/jcore-xmi-db-reader/component.meta @@ -23,7 +23,7 @@ "maven-artifact": { "artifactId": "jcore-xmi-db-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe XMI Database Reader" } diff --git a/jcore-xmi-db-reader/pom.xml b/jcore-xmi-db-reader/pom.xml index 4e3f07f1a..6cd48ce47 100644 --- a/jcore-xmi-db-reader/pom.xml +++ b/jcore-xmi-db-reader/pom.xml @@ -5,7 +5,7 @@ jedis-parent de.julielab - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT ../jedis-parent jcore-xmi-db-reader @@ -18,7 +18,7 @@ de.julielab jcore-db-reader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT org.testng @@ -59,13 +59,13 @@ de.julielab jcore-xml-db-reader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT test de.julielab jcore-xmi-db-writer - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT test diff --git a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier-reader.xml b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier-reader.xml index 312cb5e0a..9ef28be72 100644 --- a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier-reader.xml +++ b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier-reader.xml @@ -5,7 +5,7 @@ JCoRe XMI Database Multiplier Reader This is an extension of the DBMultiplierReader to handle JeDIS XMI annotation module data. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT ReadsBaseDocument diff --git a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml index 0e15747d0..081c3d6a8 100644 --- a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml +++ b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml @@ -6,7 +6,7 @@ JCoRe Abstract Database Multiplier A multiplier that receives document IDs to read from a database table from the DBMultiplierReader. The reader also delivers the path to the corpus storage system (CoStoSys) configuration and additional tables for joining with the main data table. This multiplier class is abstract and cannot be used directly.Extending classes must implement the next() method to actually read documents from the database and populate CASes with them. This component is a part of the Jena Document Information System, JeDIS. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany JULIE Lab Jena, Germany diff --git a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-reader.xml b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-reader.xml index c09220a89..dd703d3d1 100644 --- a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-reader.xml +++ b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-reader.xml @@ -5,7 +5,7 @@ JCoRe XMI Database Reader A database readerthat expects serialized UIMA CAS objects in XMI format as input. The reader has the capability to read segmented annotation graphs that have been stored by the jcore-xmi-db-writer. This component is part of the Jena Document Information System, JeDIS. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-xmi-db-writer/component.meta b/jcore-xmi-db-writer/component.meta index 708695365..3c65e61ac 100644 --- a/jcore-xmi-db-writer/component.meta +++ b/jcore-xmi-db-writer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-xmi-db-writer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe XMI Database Writer" } diff --git a/jcore-xmi-db-writer/pom.xml b/jcore-xmi-db-writer/pom.xml index 5a7320d2f..657e06c16 100644 --- a/jcore-xmi-db-writer/pom.xml +++ b/jcore-xmi-db-writer/pom.xml @@ -4,7 +4,7 @@ jedis-parent de.julielab - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT ../jedis-parent jcore-xmi-db-writer @@ -144,7 +144,7 @@ de.julielab jcore-db-checkpoint-ae - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT de.julielab diff --git a/jcore-xmi-db-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-db-writer.xml b/jcore-xmi-db-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-db-writer.xml index 9eab689a6..b3b5afac1 100644 --- a/jcore-xmi-db-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-db-writer.xml +++ b/jcore-xmi-db-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-db-writer.xml @@ -6,7 +6,7 @@ JCoRe XMI Database Writer This component is capable of storing the standard UIMA serialization of documents in one or even multiple database tables. The UIMA serialization format is XMI, an XML format that expressed an annotation graph. This component either stores the whole annotation graph in XMI format in a database row, together with the document ID. Alternatively, it makes use of the jcore-xmi-splitter to segment the annotation graph with respect to a user specified list of annotation types. Then, the XMI data of each annotation type is extracted from the document XMI data and stored in a separate table. 
The tables are created automatically according to the primary key of the active table schema in the Corpus Storage System (CoStoSys) configuration file that is also given as a parameter. The jcore-xmi-db-reader is capable of reading this kind of distributed annotation graph and reassemble a valid XMI document which then cas be deserialized into a CAS. This consumer is UIMA DUCC compatible. It requires the collection reader to forward the work item CAS to the consumer. This is required so the consumer knows that a work item has been finished and that all cached data - in this case the XMI data - should be flushed. This is important! Without the forwarding of the work item CAS, the last batch of cached XMI data will not be written into the database. This component is part of the Jena Document Information System, JeDIS. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab Jena, Germany diff --git a/jcore-xmi-reader/component.meta b/jcore-xmi-reader/component.meta index 701192b4c..347606dc4 100644 --- a/jcore-xmi-reader/component.meta +++ b/jcore-xmi-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-xmi-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe XMI Collection Reader" } diff --git a/jcore-xmi-reader/pom.xml b/jcore-xmi-reader/pom.xml index ea0dcd482..9e3df5b4c 100644 --- a/jcore-xmi-reader/pom.xml +++ b/jcore-xmi-reader/pom.xml @@ -13,7 +13,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-xmi-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-reader.xml b/jcore-xmi-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-reader.xml index a7701f7e3..c6c747371 100644 --- a/jcore-xmi-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-reader.xml +++ b/jcore-xmi-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-reader.xml @@ -6,7 +6,7 @@ XmiCollectionReader A CollectionReader which 
reads CAS data stored as XMI files from the file system. The reader grounds on IBM's XmiCollectionReader delivered with older versions of UIMA and has been extended by the Julie Lab team at the University of Jena. This XMI reader is capable of reading (g)zipped XMI files and is able to recursively search subdirectories of a delivered root directory for XMI files. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-xmi-writer/component.meta b/jcore-xmi-writer/component.meta index 48695ccb1..ef645b6dd 100644 --- a/jcore-xmi-writer/component.meta +++ b/jcore-xmi-writer/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-xmi-writer", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe XMI Writer" } diff --git a/jcore-xmi-writer/pom.xml b/jcore-xmi-writer/pom.xml index 950de517b..65dd58b07 100644 --- a/jcore-xmi-writer/pom.xml +++ b/jcore-xmi-writer/pom.xml @@ -11,7 +11,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-xmi-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-writer.xml b/jcore-xmi-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-writer.xml index cfd5692d9..aaeb7196c 100644 --- a/jcore-xmi-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-writer.xml +++ b/jcore-xmi-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-writer.xml @@ -6,7 +6,7 @@ XMIWriter - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-xmi-writer/src/test/resources/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.xml b/jcore-xmi-writer/src/test/resources/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.xml index 1453038df..c11ac0001 100644 --- a/jcore-xmi-writer/src/test/resources/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.xml +++ b/jcore-xmi-writer/src/test/resources/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.xml @@ -6,7 +6,7 @@ XMIWriter - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git 
a/jcore-xml-db-reader/component.meta b/jcore-xml-db-reader/component.meta index 6fde40ce7..0d7fce2f6 100644 --- a/jcore-xml-db-reader/component.meta +++ b/jcore-xml-db-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-xml-db-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe XML Database Reader" } diff --git a/jcore-xml-db-reader/pom.xml b/jcore-xml-db-reader/pom.xml index 72a3652f7..145fcc69a 100644 --- a/jcore-xml-db-reader/pom.xml +++ b/jcore-xml-db-reader/pom.xml @@ -15,7 +15,7 @@ de.julielab jedis-parent - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT ../jedis-parent @@ -23,7 +23,7 @@ de.julielab jcore-db-reader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT de.julielab @@ -51,7 +51,7 @@ de.julielab jcore-xml-mapper - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT de.julielab diff --git a/jcore-xml-db-reader/src/main/resources/de/julielab/jcore/reader/xml/desc/jcore-xml-db-reader.xml b/jcore-xml-db-reader/src/main/resources/de/julielab/jcore/reader/xml/desc/jcore-xml-db-reader.xml index 6b562101f..a7d8fe03c 100644 --- a/jcore-xml-db-reader/src/main/resources/de/julielab/jcore/reader/xml/desc/jcore-xml-db-reader.xml +++ b/jcore-xml-db-reader/src/main/resources/de/julielab/jcore/reader/xml/desc/jcore-xml-db-reader.xml @@ -7,7 +7,7 @@ A collection reader that receives XML document data from a PostgreSQL database. It employs the jcore-xml-mapper to populate UIMA CAS instances with the XML data according to a mapping file. For the same functionality without using a database, refer to the jcore-xml-reader. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT JULIE Lab, Germany diff --git a/jcore-xml-mapper/pom.xml b/jcore-xml-mapper/pom.xml index dab5025f2..ded5b9a9f 100644 --- a/jcore-xml-mapper/pom.xml +++ b/jcore-xml-mapper/pom.xml @@ -13,7 +13,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor.xml b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor.xml index b1878a690..a1bebd5a0 100644 --- a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor.xml +++ b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor.xml @@ -5,7 +5,7 @@ XMLReader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_Unicode_outside_BMP.xml b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_Unicode_outside_BMP.xml index 31c6e8683..0ce228185 100755 --- a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_Unicode_outside_BMP.xml +++ b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_Unicode_outside_BMP.xml @@ -5,7 +5,7 @@ XMLReader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_missingInputDir.xml b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_missingInputDir.xml index f103e0d5a..b501db9fa 100644 --- a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_missingInputDir.xml +++ b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_missingInputDir.xml @@ -5,7 +5,7 @@ XMLReader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_singleFile.xml b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_singleFile.xml index bf791c1c9..af51a64c1 100644 --- a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_singleFile.xml +++ b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_singleFile.xml @@ -5,7 +5,7 @@ XMLReader - 2.5.1-SNAPSHOT 
+ 2.6.0-SNAPSHOT diff --git a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_singleFile2.xml b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_singleFile2.xml index b0350909a..fdc051f37 100644 --- a/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_singleFile2.xml +++ b/jcore-xml-mapper/src/test/resources/XMLReaderDescriptor_medline_singleFile2.xml @@ -5,7 +5,7 @@ XMLReader - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-xml-reader/component.meta b/jcore-xml-reader/component.meta index dec59c048..97de60fef 100644 --- a/jcore-xml-reader/component.meta +++ b/jcore-xml-reader/component.meta @@ -14,7 +14,7 @@ "maven-artifact": { "artifactId": "jcore-xml-reader", "groupId": "de.julielab", - "version": "2.5.1-SNAPSHOT" + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe XML Reader" } diff --git a/jcore-xml-reader/pom.xml b/jcore-xml-reader/pom.xml index 1deddb382..1719c5c73 100644 --- a/jcore-xml-reader/pom.xml +++ b/jcore-xml-reader/pom.xml @@ -5,7 +5,7 @@ de.julielab jcore-base - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT jcore-xml-reader JCoRe XML Reader @@ -14,7 +14,7 @@ de.julielab jcore-xml-mapper - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT org.slf4j diff --git a/jcore-xml-reader/src/main/resources/de/julielab/jcore/reader/xml/desc/XMLMultiplierReader.xml b/jcore-xml-reader/src/main/resources/de/julielab/jcore/reader/xml/desc/XMLMultiplierReader.xml index 34d04d1c7..f13e7b82c 100644 --- a/jcore-xml-reader/src/main/resources/de/julielab/jcore/reader/xml/desc/XMLMultiplierReader.xml +++ b/jcore-xml-reader/src/main/resources/de/julielab/jcore/reader/xml/desc/XMLMultiplierReader.xml @@ -8,7 +8,7 @@ This reader is to be used with the JCoRe XML CAS Multiplier. The reader merely distributes the files to be read. The actual parsing is done by the multiplier. 
- 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-xml-reader/src/test/resources/MedlineReaderDescriptor_missingInputDir.xml b/jcore-xml-reader/src/test/resources/MedlineReaderDescriptor_missingInputDir.xml index cd9a3ac70..f1aaab0c6 100644 --- a/jcore-xml-reader/src/test/resources/MedlineReaderDescriptor_missingInputDir.xml +++ b/jcore-xml-reader/src/test/resources/MedlineReaderDescriptor_missingInputDir.xml @@ -5,7 +5,7 @@ MedlineReaderDescriptor_missingInputDir - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-xml-reader/src/test/resources/PubmedXMLMultiplier.xml b/jcore-xml-reader/src/test/resources/PubmedXMLMultiplier.xml index d8ad0005b..964ccdf74 100644 --- a/jcore-xml-reader/src/test/resources/PubmedXMLMultiplier.xml +++ b/jcore-xml-reader/src/test/resources/PubmedXMLMultiplier.xml @@ -5,7 +5,7 @@ PubmedXMLMultiplierDescriptor - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jcore-xml-reader/src/test/resources/XMLMultiplierReader.xml b/jcore-xml-reader/src/test/resources/XMLMultiplierReader.xml index 90a50848b..5d7c405fb 100644 --- a/jcore-xml-reader/src/test/resources/XMLMultiplierReader.xml +++ b/jcore-xml-reader/src/test/resources/XMLMultiplierReader.xml @@ -5,7 +5,7 @@ MedlineReaderDescriptor_missingInputDir - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT diff --git a/jedis-parent/pom.xml b/jedis-parent/pom.xml index 4d1302786..f56a81be0 100644 --- a/jedis-parent/pom.xml +++ b/jedis-parent/pom.xml @@ -4,7 +4,7 @@ jcore-base de.julielab - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT pom 4.0.0 diff --git a/pom.xml b/pom.xml index 274a990dd..7f4011b1e 100644 --- a/pom.xml +++ b/pom.xml @@ -21,7 +21,7 @@ The POM for the JCoRe Base projects. - 2.5.1-SNAPSHOT + 2.6.0-SNAPSHOT From 5eec4cbe26f9d67116980b8bcc6eb0b64e841546 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 9 Jun 2020 10:29:23 +0200 Subject: [PATCH 005/269] All Neo4jRelationsConsumer tests running. 
--- .../Neo4jRelationsConsumer.java | 13 ++- ...Neo4jRelationsConsumerIntegrationTest.java | 88 +++++++++++++++++-- .../Neo4jRelationsConsumerTest.java | 4 +- 3 files changed, 96 insertions(+), 9 deletions(-) diff --git a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java index 9b9a6dddc..a3653b18a 100644 --- a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java +++ b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java @@ -39,6 +39,8 @@ import java.util.*; import java.util.stream.StreamSupport; +import static java.nio.charset.StandardCharsets.UTF_8; + @ResourceMetaData(name = "JCoRe Neo4j Relations Consumer", description = "This component assumes that a Neo4j server with an installed julieliab-neo4j-plugins-concepts plugin installed. It then sends FlattenedRelation instances with more then one arguments to Neo4j. 
Note that this requires the event arguments to have a ResourceEntry list to obtain database concept IDs from.", vendor = "JULIE Lab, Germany", copyright = "JULIE Lab", version = "2.6.0-SNAPSHOT") @TypeCapability(inputs = {"de.julielab.jcore.types.EventMention"}) public class Neo4jRelationsConsumer extends JCasAnnotator_ImplBase { @@ -69,6 +71,7 @@ public void initialize(final UimaContext aContext) throws ResourceInitialization om = new ObjectMapper(); om.setSerializationInclusion(JsonInclude.Include.NON_NULL); om.setSerializationInclusion(JsonInclude.Include.NON_EMPTY); + initImportRelations(); } private void initImportRelations() { @@ -118,6 +121,7 @@ private void sendRelationsToNeo4j() throws AnalysisEngineProcessException { try { URL url = URI.create(this.url).toURL(); HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); + urlConnection.addRequestProperty("Content-Type", "application/json"); urlConnection.setRequestMethod(HttpMethod.POST); urlConnection.setDoOutput(true); try (OutputStream outputStream = urlConnection.getOutputStream()) { @@ -138,7 +142,14 @@ private void sendRelationsToNeo4j() throws AnalysisEngineProcessException { g.close(); } try (InputStream inputStream = urlConnection.getInputStream()) { - log.debug("Response from Neo4j: {}", IOUtils.toString(inputStream)); + log.debug("Response from Neo4j: {}", IOUtils.toString(inputStream, UTF_8)); + } catch (IOException e) { + log.error("Exception occurred while sending relation data to Neo4j server."); + try (InputStream inputStream = urlConnection.getErrorStream()) { + if (inputStream != null) + log.error("Error from Neo4j: {}", IOUtils.toString(inputStream, UTF_8)); + } + throw e; } importIERelations.clear(); } catch (IOException e) { diff --git a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java 
b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java index 6c853ecdd..e176fba41 100644 --- a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java +++ b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java @@ -1,31 +1,107 @@ package de.julielab.jcore.consumer.neo4jrelations; +import de.julielab.jcore.types.pubmed.Header; import de.julielab.neo4j.plugins.Indexes; +import de.julielab.neo4j.plugins.concepts.ConceptLookup; import de.julielab.neo4j.plugins.concepts.ConceptManager; -import org.apache.uima.UIMAException; +import de.julielab.neo4j.plugins.datarepresentation.*; +import de.julielab.neo4j.plugins.datarepresentation.constants.FacetConstants; +import de.julielab.neo4j.plugins.datarepresentation.util.ConceptsJsonSerializer; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Rule; +import org.junit.BeforeClass; +import org.junit.ClassRule; import org.junit.Test; +import org.neo4j.graphdb.GraphDatabaseService; +import org.neo4j.graphdb.Node; +import org.neo4j.graphdb.RelationshipType; +import org.neo4j.graphdb.Transaction; import org.neo4j.harness.junit.rule.Neo4jRule; +import org.neo4j.test.server.HTTP; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.List; +import java.util.stream.Stream; + +import static de.julielab.jcore.consumer.neo4jrelations.Neo4jRelationsConsumerTest.addFlattenedRelation1ToCas; +import static de.julielab.jcore.consumer.neo4jrelations.Neo4jRelationsConsumerTest.addFlattenedRelation2ToCas; +import static de.julielab.neo4j.plugins.constants.semedico.SemanticRelationConstants.PROP_DOC_IDS; +import static 
de.julielab.neo4j.plugins.datarepresentation.constants.ConceptConstants.PROP_SRC_IDS; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; +import static org.neo4j.configuration.GraphDatabaseSettings.DEFAULT_DATABASE_NAME; /** * Unit tests for jcore-neo4j-relations-consumer. * */ public class Neo4jRelationsConsumerIntegrationTest { - @Rule - public Neo4jRule neo4j = new Neo4jRule() + private final static Logger log = LoggerFactory.getLogger(Neo4jRelationsConsumerIntegrationTest.class); + @ClassRule + public static Neo4jRule neo4j = new Neo4jRule() .withUnmanagedExtension("/concepts", ConceptManager.class).withFixture(graphDatabaseService -> { new Indexes(null).createIndexes(graphDatabaseService); return null; }); + @BeforeClass + public static void beforeClass() throws Exception { + ImportFacet facet = new ImportFacet(new ImportFacetGroup("FG"), "myfacet", "myfacet", "myfacet", FacetConstants.SRC_TYPE_HIERARCHICAL); + ImportConcept c11 = new ImportConcept("concept11", new ConceptCoordinates("id11", "source11", CoordinateType.SRC)); + ImportConcept c12 = new ImportConcept("concept12", new ConceptCoordinates("id12", "source12", CoordinateType.SRC)); + ImportConcept c13 = new ImportConcept("concept13", new ConceptCoordinates("id13", "source13", CoordinateType.SRC)); + ImportConcept c21 = new ImportConcept("concept21", new ConceptCoordinates("id21", "source21", CoordinateType.SRC)); + ImportConcept c22 = new ImportConcept("concept22", new ConceptCoordinates("id22", "source22", CoordinateType.SRC)); + ImportConcepts importConcepts = new ImportConcepts(Stream.of(c11, c12, c13, c21, c22), facet); + String uri = neo4j.httpURI().resolve("concepts/" + ConceptManager.CM_REST_ENDPOINT+"/"+ConceptManager.INSERT_CONCEPTS).toString(); + log.debug("Sending concepts to {}", uri); + HTTP.Response response = HTTP.POST(uri, ConceptsJsonSerializer.toJsonTree(importConcepts)); + log.debug("Response to test concepts import: {}", 
response); + assertEquals(200, response.status()); + } + @Test - public void insertEventMentions() throws UIMAException { - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types"); + public void insertEventMentions() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.extensions.jcore-semantics-mention-extension-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types"); + Header h = new Header(jCas); + h.setDocId("testdoc"); + h.addToIndexes(); + addFlattenedRelation1ToCas(jCas); + // Here is a duplicate. It should be recognized and just be counted up + addFlattenedRelation2ToCas(jCas); + addFlattenedRelation2ToCas(jCas); + + AnalysisEngine engine = AnalysisEngineFactory.createEngine( + "de.julielab.jcore.consumer.neo4jrelations.desc.jcore-neo4j-relations-consumer", + Neo4jRelationsConsumer.PARAM_URL, neo4j.httpURI().resolve("concepts/" + ConceptManager.CM_REST_ENDPOINT+"/"+ConceptManager.INSERT_IE_RELATIONS).toString(), + Neo4jRelationsConsumer.PARAM_ID_PROPERTY, "sourceIds"); + + engine.process(jCas); + engine.collectionProcessComplete(); + + GraphDatabaseService graphDb = neo4j.databaseManagementService().database(DEFAULT_DATABASE_NAME); + try (Transaction tx = graphDb.beginTx()) { + Node id11 = ConceptLookup.lookupSingleConceptBySourceId(tx, "id11"); + // There should be connections to 12 and 13. 
+ assertThat(id11.getRelationships(RelationshipType.withName("regulation"))).hasSize(2); + assertThat(id11.getRelationships(RelationshipType.withName("regulation"))).flatExtracting(r -> List.of((String[]) r.getProperty(PROP_DOC_IDS))).containsExactly("testdoc", "testdoc"); + assertThat(id11.getRelationships(RelationshipType.withName("regulation"))).extracting(r -> r.getOtherNode(id11).getProperty(PROP_SRC_IDS+0)).containsExactlyInAnyOrder("id12", "id13"); + + Node id13 = ConceptLookup.lookupSingleConceptBySourceId(tx, "id13"); + // There should be connections to 11 and 12. + assertThat(id13.getRelationships(RelationshipType.withName("regulation"))).hasSize(2); + assertThat(id13.getRelationships(RelationshipType.withName("regulation"))).flatExtracting(r -> List.of((String[]) r.getProperty(PROP_DOC_IDS))).containsExactly("testdoc", "testdoc"); + assertThat(id13.getRelationships(RelationshipType.withName("regulation"))).extracting(r -> r.getOtherNode(id13).getProperty(PROP_SRC_IDS+0)).containsExactlyInAnyOrder("id11", "id12"); + Node id22 = ConceptLookup.lookupSingleConceptBySourceId(tx, "id22"); + // There should be connections to 21 + assertThat(id22.getRelationships(RelationshipType.withName("regulation"))).hasSize(1); + assertThat(id22.getRelationships(RelationshipType.withName("regulation"))).flatExtracting(r -> List.of((String[]) r.getProperty(PROP_DOC_IDS))).containsExactly("testdoc"); + assertThat(id22.getRelationships(RelationshipType.withName("regulation"))).extracting(r -> r.getOtherNode(id22).getProperty(PROP_SRC_IDS+0)).containsExactlyInAnyOrder("id21"); + } } } diff --git a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java index f40f929b2..8cf7bc2e5 100644 --- 
a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java +++ b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java @@ -73,7 +73,7 @@ public void insertEventMentionsGlobalSource() throws Exception { * Adds a FlattenedRelation with three arguments. * @param jCas The CAS. */ - private void addFlattenedRelation1ToCas(JCas jCas) { + public static void addFlattenedRelation1ToCas(JCas jCas) { FlattenedRelation fr = new FlattenedRelation(jCas); EventMention rootEm = new EventMention(jCas); rootEm.setSpecificType("regulation"); @@ -111,7 +111,7 @@ private void addFlattenedRelation1ToCas(JCas jCas) { * Adds a FlattenedRelation with two arguments. * @param jCas The CAS. */ - private void addFlattenedRelation2ToCas(JCas jCas) { + public static void addFlattenedRelation2ToCas(JCas jCas) { FlattenedRelation fr = new FlattenedRelation(jCas); EventMention rootEm = new EventMention(jCas); rootEm.setSpecificType("regulation"); From e233713340150a84f1f5a3dfbdaa40a1b3208844 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 9 Jun 2020 10:43:49 +0200 Subject: [PATCH 006/269] Adding the component.meta file for the Neo4j Relations Consumer. 
--- jcore-neo4j-relations-consumer/component.meta | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 jcore-neo4j-relations-consumer/component.meta diff --git a/jcore-neo4j-relations-consumer/component.meta b/jcore-neo4j-relations-consumer/component.meta new file mode 100644 index 000000000..e988fe410 --- /dev/null +++ b/jcore-neo4j-relations-consumer/component.meta @@ -0,0 +1,20 @@ +{ + "categories": [ + "consumer" + ], + "description": "Writes EventMentions to Neo4j.", + "descriptors": [ + { + "category": "consumer", + "location": "de.julielab.jcore.consumer.neo4jrelations.desc.jcore-neo4j-relations-consumer" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-neo4j-relations-consumer", + "groupId": "de.julielab", + "version": "2.6.0-SNAPSHOT" + }, + "name": "JCoRe Neo4j Relations Consumer" +} From febb256a2090fcf0f17ed2f19b568e6299e8959e Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 9 Jun 2020 10:58:59 +0200 Subject: [PATCH 007/269] Fixed test bugs where the document ID was not set to the CAS. 
--- .../neo4jrelations/Neo4jRelationsConsumerTest.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java index 8cf7bc2e5..28ba51f74 100644 --- a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java +++ b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java @@ -6,6 +6,7 @@ import de.julielab.jcore.types.EventMention; import de.julielab.jcore.types.ResourceEntry; import de.julielab.jcore.types.ext.FlattenedRelation; +import de.julielab.jcore.types.pubmed.Header; import de.julielab.jcore.utility.JCoReTools; import de.julielab.neo4j.plugins.datarepresentation.ImportIERelation; import de.julielab.neo4j.plugins.datarepresentation.ImportIERelationArgument; @@ -30,6 +31,9 @@ public class Neo4jRelationsConsumerTest { @Test public void insertEventMentions() throws Exception { JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.extensions.jcore-semantics-mention-extension-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types"); + Header h = new Header(jCas); + h.setDocId("testdoc"); + h.addToIndexes(); Neo4jRelationsConsumer engine = new Neo4jRelationsConsumer(); engine.initialize(UimaContextFactory.createUimaContext(Neo4jRelationsConsumer.PARAM_URL, "", Neo4jRelationsConsumer.PARAM_ID_PROPERTY, "sourceIds")); addFlattenedRelation1ToCas(jCas); @@ -53,6 +57,9 @@ public void insertEventMentions() throws Exception { @Test public void insertEventMentionsGlobalSource() throws Exception { JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.extensions.jcore-semantics-mention-extension-types", 
"de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types"); + Header h = new Header(jCas); + h.setDocId("testdoc"); + h.addToIndexes(); Neo4jRelationsConsumer engine = new Neo4jRelationsConsumer(); engine.initialize(UimaContextFactory.createUimaContext(Neo4jRelationsConsumer.PARAM_URL, "", Neo4jRelationsConsumer.PARAM_ID_PROPERTY, "sourceIds", Neo4jRelationsConsumer.PARAM_SOURCE, "globalSource")); addFlattenedRelation1ToCas(jCas); From 5c4d0a021f81b55a4e935a356ac17330be7f3e6d Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 10 Jun 2020 13:57:17 +0200 Subject: [PATCH 008/269] Bug fixes --- .../checkpoint/DocumentReleaseCheckpoint.java | 13 +++++-- jcore-neo4j-relations-consumer/pom.xml | 5 +++ .../Neo4jRelationsConsumer.java | 38 ++++++++++++++++-- .../desc/jcore-neo4j-relations-consumer.xml | 39 ++++++++----------- ...Neo4jRelationsConsumerIntegrationTest.java | 2 +- .../jcore/consumer/xmi/XMIDBWriter.java | 1 - 6 files changed, 66 insertions(+), 32 deletions(-) diff --git a/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DocumentReleaseCheckpoint.java b/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DocumentReleaseCheckpoint.java index e67750ed5..cb94a8aa3 100644 --- a/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DocumentReleaseCheckpoint.java +++ b/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DocumentReleaseCheckpoint.java @@ -14,7 +14,7 @@ *

This is class is a synchronization point for JeDIS components to report documents as being completely finished * with processing.

*

Problem explanation: This synchronization is necessary because most database operating components work in batch mode for - * performance reasons. However, if multiple components use batching with might be out of sync due to different + * performance reasons. However, if multiple components use batching wich might be out of sync due to different * batch sizes and possibly other factors, one component may have sent a batch of document data to the database * while other components have not at a particular point in time. If at such a time point the pipeline crashes * or is manually interrupted, the actually written data is incoherent in the sense that some components have sent @@ -41,6 +41,7 @@ public class DocumentReleaseCheckpoint { private static DocumentReleaseCheckpoint checkpoint; private Multiset releasedDocuments; private Set registeredComponents; + private long lastwarning = 1000; private DocumentReleaseCheckpoint() { releasedDocuments = HashMultiset.create(); @@ -99,13 +100,17 @@ public Set getReleasedDocumentIds() { // Get all documents released by all components Set returnedIds; synchronized (releasedDocuments) { - returnedIds = this.releasedDocuments.entrySet().stream().filter(e -> e.getCount() == getNumberOfRegisteredComponents()).map(Multiset.Entry::getElement).collect(Collectors.toSet()); + returnedIds = this.releasedDocuments.elementSet().stream().filter(e -> this.releasedDocuments.count(e) == getNumberOfRegisteredComponents()).collect(Collectors.toSet()); // Remove the completely released documents from the pool of potentially not yet completely released documents. returnedIds.forEach(id -> this.releasedDocuments.remove(id, Integer.MAX_VALUE)); } log.debug("Returning {} documents released by all registered components. 
{} document IDs remain that have not yet been released by all registered components.", returnedIds.size(), this.releasedDocuments.size()); - if (this.releasedDocuments.size() > 1000) - log.warn("The number of document IDs that have not been released by all registered components has grown to {}. If it does not increase again, there is likely an errorneous component which does not release its documents.", releasedDocuments.size()); + if (this.releasedDocuments.size() > lastwarning) { + log.warn("The number of document IDs that have not been released by all registered components has grown to {}. If it does not decrease again, there is likely an errorneous component which does not release its documents. Currently registered components: {}", releasedDocuments.size(), registeredComponents); + lastwarning *= 2; + } else if (this.releasedDocuments.size() < 50) { + lastwarning = 1000; + } return returnedIds; } diff --git a/jcore-neo4j-relations-consumer/pom.xml b/jcore-neo4j-relations-consumer/pom.xml index b7a2bf83c..6b0d0060c 100644 --- a/jcore-neo4j-relations-consumer/pom.xml +++ b/jcore-neo4j-relations-consumer/pom.xml @@ -39,6 +39,11 @@ jcore-utilities ${jcore-utilities-version} + + de.julielab + jcore-db-checkpoint-ae + 2.6.0-SNAPSHOT + org.neo4j.test neo4j-harness diff --git a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java index a3653b18a..2ad273897 100644 --- a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java +++ b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java @@ -6,9 +6,12 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.HashMultiset; import com.google.common.collect.Multiset; +import 
de.julielab.jcore.ae.checkpoint.DocumentId; +import de.julielab.jcore.ae.checkpoint.DocumentReleaseCheckpoint; import de.julielab.jcore.types.ArgumentMention; import de.julielab.jcore.types.ConceptMention; import de.julielab.jcore.types.ResourceEntry; +import de.julielab.jcore.types.ext.DBProcessingMetaData; import de.julielab.jcore.types.ext.FlattenedRelation; import de.julielab.jcore.utility.JCoReTools; import de.julielab.neo4j.plugins.datarepresentation.ImportIERelation; @@ -16,6 +19,7 @@ import de.julielab.neo4j.plugins.datarepresentation.ImportIERelationDocument; import de.julielab.neo4j.plugins.datarepresentation.ImportIETypedRelations; import de.julielab.neo4j.plugins.datarepresentation.constants.ImportIERelations; +import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.IOUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; @@ -23,6 +27,7 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.resource.ResourceInitializationException; @@ -48,6 +53,8 @@ public class Neo4jRelationsConsumer extends JCasAnnotator_ImplBase { public static final String PARAM_URL = "URL"; public static final String PARAM_ID_PROPERTY = "IdProperty"; public static final String PARAM_SOURCE = "ConceptSource"; + public static final String PARAM_NEO4J_USER = "Neo4jUser"; + public static final String PARAM_NEO4J_PASSWORD = "Neo4jPassword"; private final static Logger log = LoggerFactory.getLogger(Neo4jRelationsConsumer.class); @ConfigurationParameter(name = PARAM_URL, description = "The complete URL to the endpoint of the Neo4j server for relation insertion.") private String url; @@ -55,10 +62,16 @@ public class Neo4jRelationsConsumer extends 
JCasAnnotator_ImplBase { private String idProperty; @ConfigurationParameter(name = PARAM_SOURCE, mandatory = false, description = "Optional. Sets the global source for the concept IDs taken from the ResourceEntry instances of the relation arguments. This causes the 'source' feature of the ResourceEntry objects to be omitted and to globally use the specified source instead. This causes the Neo4j database plugin to resolve the provided argument IDs against the source specified here.") private String globalSource; + @ConfigurationParameter(name = PARAM_NEO4J_USER, mandatory = false, description = "Optional. The Neo4j server user name.") + private String neo4jUser; + @ConfigurationParameter(name = PARAM_NEO4J_PASSWORD, mandatory = false, description = "Optional. The Neo4j server password.") + private String neo4jPassword; private ImportIERelations importIERelations; private ObjectMapper om; + private Set documentIds; + /** * This method is called a single time by the framework at component * creation. Here, descriptor parameters are read and initial setup is done. 
@@ -68,10 +81,14 @@ public void initialize(final UimaContext aContext) throws ResourceInitialization url = (String) aContext.getConfigParameterValue(PARAM_URL); idProperty = (String) aContext.getConfigParameterValue(PARAM_ID_PROPERTY); globalSource = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_SOURCE)).orElse(null); + neo4jUser = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_NEO4J_USER)).orElse(null); + neo4jPassword = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_NEO4J_PASSWORD)).orElse(null); om = new ObjectMapper(); om.setSerializationInclusion(JsonInclude.Include.NON_NULL); om.setSerializationInclusion(JsonInclude.Include.NON_EMPTY); initImportRelations(); + DocumentReleaseCheckpoint.get().register(Neo4jRelationsConsumer.class.getCanonicalName()); + documentIds = new HashSet<>(); } private void initImportRelations() { @@ -84,14 +101,20 @@ private void initImportRelations() { */ @Override public void process(final JCas aJCas) { - importIERelations.addRelationDocument(convertRelations(aJCas)); + ImportIERelationDocument document = convertRelations(aJCas); + if (!document.getRelations().isEmpty()) + importIERelations.addRelationDocument(document); + + Optional metaOpt = JCasUtil.select(aJCas, DBProcessingMetaData.class).stream().findAny(); + documentIds.add(metaOpt.isPresent() ? 
new DocumentId(metaOpt.get()) : new DocumentId(JCoReTools.getDocId(aJCas))); } private ImportIERelationDocument convertRelations(JCas aJCas) { Map> relationCounts = getEquivalentRelationGroups(aJCas); ImportIERelationDocument relDoc = new ImportIERelationDocument(); relDoc.setDb(false); - relDoc.setName(JCoReTools.getDocId(aJCas)); + String docId = JCoReTools.getDocId(aJCas); + relDoc.setName(docId); ImportIETypedRelations typedRelations = new ImportIETypedRelations(); for (String relationType : relationCounts.keySet()) { Multiset unificationRelations = relationCounts.get(relationType); @@ -115,6 +138,7 @@ public void batchProcessComplete() throws AnalysisEngineProcessException { public void collectionProcessComplete() throws AnalysisEngineProcessException { super.collectionProcessComplete(); sendRelationsToNeo4j(); + DocumentReleaseCheckpoint.get().unregister(Neo4jRelationsConsumer.class.getCanonicalName()); } private void sendRelationsToNeo4j() throws AnalysisEngineProcessException { @@ -122,6 +146,11 @@ private void sendRelationsToNeo4j() throws AnalysisEngineProcessException { URL url = URI.create(this.url).toURL(); HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); urlConnection.addRequestProperty("Content-Type", "application/json"); + String authorizationToken = neo4jUser != null && neo4jPassword != null + ? 
"Basic " + Base64.encodeBase64URLSafeString((neo4jUser + ":" + neo4jPassword).getBytes()) + : null; + if (authorizationToken != null) + urlConnection.setRequestProperty("Authorization", authorizationToken); urlConnection.setRequestMethod(HttpMethod.POST); urlConnection.setDoOutput(true); try (OutputStream outputStream = urlConnection.getOutputStream()) { @@ -152,8 +181,11 @@ private void sendRelationsToNeo4j() throws AnalysisEngineProcessException { throw e; } importIERelations.clear(); + log.debug("Releasing {} document IDs that have successfully been sent to Neo4j", documentIds.size()); + DocumentReleaseCheckpoint.get().release(Neo4jRelationsConsumer.class.getCanonicalName(), documentIds.stream()); + documentIds.clear(); } catch (IOException e) { - log.error("Could not send relations to Neo4j", e); + log.error("Could not send relations to Neo4j endpoint {}", url, e); throw new AnalysisEngineProcessException(e); } } diff --git a/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml b/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml index 1119cc5ef..1ce50f4d9 100644 --- a/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml +++ b/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml @@ -31,34 +31,27 @@ false false + + Neo4jUser + Optional. The Neo4j server user name. + String + false + false + + + Neo4jPassword + Optional. The Neo4j server password. 
+ String + false + false + - - - - - - - - - - - - - - - - - - - - - - - + + diff --git a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java index e176fba41..9a1fc1905 100644 --- a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java +++ b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java @@ -66,7 +66,7 @@ public static void beforeClass() throws Exception { @Test public void insertEventMentions() throws Exception { - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.extensions.jcore-semantics-mention-extension-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types"); + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.extensions.jcore-semantics-mention-extension-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types"); Header h = new Header(jCas); h.setDocId("testdoc"); h.addToIndexes(); diff --git a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java index f639e58ae..380c0b232 100644 --- a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java +++ b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java @@ -122,7 +122,6 @@ public class XMIDBWriter extends JCasAnnotator_ImplBase { private static Map> binaryMappedFeatures = Collections.emptyMap(); private static Map> splitterResultMap; private static Map, 
CountDownLatch>>> xmiBufferItemsToProcess; - private static ReentrantLock missingMappingsGatheringLock; private static CountDownLatch missingMappingsGatheringLatch = new CountDownLatch(0); private static ReentrantLock mappingUpdateLock; private DataBaseConnector dbc; From c73befcef48271d83b3a4b23a67f33367607c094 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 12 Jun 2020 12:11:01 +0200 Subject: [PATCH 009/269] Letting the Neo4j consumer manually send documents to the database according to a new batch size parameter. --- .../db/desc/jcore-db-multiplier-reader.xml | 2 +- .../neo4jrelations/Neo4jRelationsConsumer.java | 16 +++++++++++++++- .../desc/jcore-neo4j-relations-consumer.xml | 16 +++++++++++++++- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/jcore-db-reader/src/main/resources/de/julielab/jcore/reader/db/desc/jcore-db-multiplier-reader.xml b/jcore-db-reader/src/main/resources/de/julielab/jcore/reader/db/desc/jcore-db-multiplier-reader.xml index 9637ab27d..593b1ef99 100644 --- a/jcore-db-reader/src/main/resources/de/julielab/jcore/reader/db/desc/jcore-db-multiplier-reader.xml +++ b/jcore-db-reader/src/main/resources/de/julielab/jcore/reader/db/desc/jcore-db-multiplier-reader.xml @@ -162,7 +162,7 @@ BatchSize - 50 + 100 diff --git a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java index 2ad273897..7fbd7b8ac 100644 --- a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java +++ b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java @@ -55,6 +55,7 @@ public class Neo4jRelationsConsumer extends JCasAnnotator_ImplBase { public static final String PARAM_SOURCE = "ConceptSource"; public static final String PARAM_NEO4J_USER = "Neo4jUser"; public 
static final String PARAM_NEO4J_PASSWORD = "Neo4jPassword"; + public static final String PARAM_WRITE_BATCH_SIZE = "WriteBatchSize"; private final static Logger log = LoggerFactory.getLogger(Neo4jRelationsConsumer.class); @ConfigurationParameter(name = PARAM_URL, description = "The complete URL to the endpoint of the Neo4j server for relation insertion.") private String url; @@ -66,12 +67,17 @@ public class Neo4jRelationsConsumer extends JCasAnnotator_ImplBase { private String neo4jUser; @ConfigurationParameter(name = PARAM_NEO4J_PASSWORD, mandatory = false, description = "Optional. The Neo4j server password.") private String neo4jPassword; + @ConfigurationParameter(name = PARAM_WRITE_BATCH_SIZE, mandatory = false, defaultValue = "50", description = + "The number of processed CASes after which the relation data should be flushed into the database. Defaults to 50.") + private int writeBatchSize; private ImportIERelations importIERelations; private ObjectMapper om; private Set documentIds; + private long docNum; + /** * This method is called a single time by the framework at component * creation. Here, descriptor parameters are read and initial setup is done. 
@@ -83,12 +89,14 @@ public void initialize(final UimaContext aContext) throws ResourceInitialization globalSource = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_SOURCE)).orElse(null); neo4jUser = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_NEO4J_USER)).orElse(null); neo4jPassword = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_NEO4J_PASSWORD)).orElse(null); + writeBatchSize = Optional.ofNullable((Integer) aContext.getConfigParameterValue(PARAM_WRITE_BATCH_SIZE)).orElse(50); om = new ObjectMapper(); om.setSerializationInclusion(JsonInclude.Include.NON_NULL); om.setSerializationInclusion(JsonInclude.Include.NON_EMPTY); initImportRelations(); DocumentReleaseCheckpoint.get().register(Neo4jRelationsConsumer.class.getCanonicalName()); documentIds = new HashSet<>(); + docNum = 0; } private void initImportRelations() { @@ -100,13 +108,18 @@ private void initImportRelations() { * is where the actual work happens. */ @Override - public void process(final JCas aJCas) { + public void process(final JCas aJCas) throws AnalysisEngineProcessException { ImportIERelationDocument document = convertRelations(aJCas); if (!document.getRelations().isEmpty()) importIERelations.addRelationDocument(document); Optional metaOpt = JCasUtil.select(aJCas, DBProcessingMetaData.class).stream().findAny(); documentIds.add(metaOpt.isPresent() ? 
new DocumentId(metaOpt.get()) : new DocumentId(JCoReTools.getDocId(aJCas))); + + if (documentIds.size() % writeBatchSize == 0) { + log.trace("Document nr {} processed, sending batch nr {} of size {} to database.", docNum, docNum / writeBatchSize, writeBatchSize); + batchProcessComplete(); + } } private ImportIERelationDocument convertRelations(JCas aJCas) { @@ -137,6 +150,7 @@ public void batchProcessComplete() throws AnalysisEngineProcessException { @Override public void collectionProcessComplete() throws AnalysisEngineProcessException { super.collectionProcessComplete(); + log.info("Collection processing finished."); sendRelationsToNeo4j(); DocumentReleaseCheckpoint.get().unregister(Neo4jRelationsConsumer.class.getCanonicalName()); } diff --git a/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml b/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml index 1ce50f4d9..bebaa2135 100644 --- a/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml +++ b/jcore-neo4j-relations-consumer/src/main/resources/de/julielab/jcore/consumer/neo4jrelations/desc/jcore-neo4j-relations-consumer.xml @@ -45,8 +45,22 @@ false false + + WriteBatchSize + The number of processed CASes after which the relation data should be flushed into the database. Defaults to 50. + Integer + false + false + - + + + WriteBatchSize + + 50 + + + From f33bc2ceb565920c155d9fa298608e9faaf7c179 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 12 Jun 2020 12:13:19 +0200 Subject: [PATCH 010/269] Adding a log message. 
--- .../jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java | 1 + 1 file changed, 1 insertion(+) diff --git a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java index 7fbd7b8ac..6734d7ea2 100644 --- a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java +++ b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java @@ -177,6 +177,7 @@ private void sendRelationsToNeo4j() throws AnalysisEngineProcessException { List documents = importIERelations.getDocuments(); g.writeFieldName(ImportIERelations.NAME_DOCUMENTS); g.writeStartArray(); + log.debug("Sending {} relation documents to Neo4j.", documents.size()); for (ImportIERelationDocument document : (Iterable) documents::iterator) { g.writeObject(document); } From 12154b8acd95d991787f791017bdf6d88785d58d Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 22 Jun 2020 10:46:17 +0200 Subject: [PATCH 011/269] The file reader can now search in arbitrarily deep subdirectory trees. 
--- .../jcore/reader/file/main/FileReader.java | 29 ++++++------------- .../Neo4jRelationsConsumer.java | 4 ++- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/jcore-file-reader/src/main/java/de/julielab/jcore/reader/file/main/FileReader.java b/jcore-file-reader/src/main/java/de/julielab/jcore/reader/file/main/FileReader.java index 925c91e5e..dee16f1d7 100644 --- a/jcore-file-reader/src/main/java/de/julielab/jcore/reader/file/main/FileReader.java +++ b/jcore-file-reader/src/main/java/de/julielab/jcore/reader/file/main/FileReader.java @@ -35,6 +35,9 @@ import org.apache.uima.util.ProgressImpl; import java.io.*; +import java.nio.file.FileVisitOption; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -404,26 +407,12 @@ public Progress[] getProgress() { return new Progress[]{new ProgressImpl(fileIndex, files.size(), Progress.ENTITIES)}; } - private String[] createFileListByType(File inputDirectory, final Set allowedExtensions) throws IOException { - String[] path = new File(inputDirectory.getPath()).list(); - - for (int i = 0; i < path.length; i++) { - File file = new File(inputDirectory.getAbsolutePath() + "/" + path[i]); - - if (!useSubDirs && file.isDirectory()) - continue; - - String CurrentExtension = path[i].substring(path[i].lastIndexOf('.') + 1); - if (allowedExtensions.isEmpty() || allowedExtensions.contains(CurrentExtension)) { - files.add(file); - } - - if (useSubDirs && file.isDirectory()) { - createFileListByType(file, allowedExtensions); - } - } - - return path; + private void createFileListByType(File inputDirectory, final Set allowedExtensions) throws IOException { + Files.walk(inputDirectory.toPath(), useSubDirs ? 
Integer.MAX_VALUE : 1, FileVisitOption.FOLLOW_LINKS) + .filter(p -> { if (allowedExtensions.isEmpty()) return true; for (String ext : allowedExtensions) if (p.toString().endsWith(ext)) return true; return false;}) + .map(Path::toFile) + .filter(File::isFile) + .forEach(files::add); } private String getFileName(File fi) { diff --git a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java index 6734d7ea2..7ff69f9f8 100644 --- a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java +++ b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java @@ -177,7 +177,7 @@ private void sendRelationsToNeo4j() throws AnalysisEngineProcessException { List documents = importIERelations.getDocuments(); g.writeFieldName(ImportIERelations.NAME_DOCUMENTS); g.writeStartArray(); - log.debug("Sending {} relation documents to Neo4j.", documents.size()); + log.debug("Converting {} relation documents to JSON.", documents.size()); for (ImportIERelationDocument document : (Iterable) documents::iterator) { g.writeObject(document); } @@ -212,6 +212,7 @@ private void sendRelationsToNeo4j() throws AnalysisEngineProcessException { * @return The grouped relations. */ private Map> getEquivalentRelationGroups(JCas aJCas) { + // Maps relation types to the complete relations. Map> relationCounts = new HashMap<>(); for (FlattenedRelation fr : aJCas.getAnnotationIndex(FlattenedRelation.type)) { Iterator cmIt = StreamSupport.stream(fr.getArguments().spliterator(), false) @@ -220,6 +221,7 @@ private Map> getEquivalentRelationGroups(J .map(ConceptMention.class::cast) .iterator(); Set unificationArgs = new HashSet<>(); + // Add all arguments to the relation object. So there could be 1, 2, 3 or even more arguments. 
while (cmIt.hasNext()) { ConceptMention cm = cmIt.next(); FSArray resourceEntryList = cm.getResourceEntryList(); From ba7f5cdc55072a3c32f4615610434f23f7d28080 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 23 Jun 2020 13:29:44 +0200 Subject: [PATCH 012/269] FileWriter: Not appending .txt suffix if already present. --- .../de/julielab/jcore/consumer/txt/SentenceTokenConsumer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-txt-consumer/src/main/java/de/julielab/jcore/consumer/txt/SentenceTokenConsumer.java b/jcore-txt-consumer/src/main/java/de/julielab/jcore/consumer/txt/SentenceTokenConsumer.java index ca3d51d29..a86921ec4 100644 --- a/jcore-txt-consumer/src/main/java/de/julielab/jcore/consumer/txt/SentenceTokenConsumer.java +++ b/jcore-txt-consumer/src/main/java/de/julielab/jcore/consumer/txt/SentenceTokenConsumer.java @@ -237,7 +237,7 @@ private void writeSentences2File(String fileId, List sentences) throws I OutputStream os = null; boolean zipContentWritten = false; try { - File outputFile = new File(directory.getCanonicalPath() + File.separator + fileId + ".txt" + (gzip ? ".gz" : "")); + File outputFile = new File(directory.getCanonicalPath() + File.separator + fileId + (fileId.endsWith(".txt") ? "" : ".txt") + (gzip ? ".gz" : "")); os = zip ? currentArchive : FileUtilities.getOutputStreamToFile(outputFile); if (zip) { // Initialize the ZIP output stream if necessary From b8a880f89d5fc3f866712493652e3540d176ffba Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 23 Jun 2020 14:15:18 +0200 Subject: [PATCH 013/269] Revoked the last change because this can influence document ID lookup. 
--- .../de/julielab/jcore/consumer/txt/SentenceTokenConsumer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-txt-consumer/src/main/java/de/julielab/jcore/consumer/txt/SentenceTokenConsumer.java b/jcore-txt-consumer/src/main/java/de/julielab/jcore/consumer/txt/SentenceTokenConsumer.java index a86921ec4..ca3d51d29 100644 --- a/jcore-txt-consumer/src/main/java/de/julielab/jcore/consumer/txt/SentenceTokenConsumer.java +++ b/jcore-txt-consumer/src/main/java/de/julielab/jcore/consumer/txt/SentenceTokenConsumer.java @@ -237,7 +237,7 @@ private void writeSentences2File(String fileId, List sentences) throws I OutputStream os = null; boolean zipContentWritten = false; try { - File outputFile = new File(directory.getCanonicalPath() + File.separator + fileId + (fileId.endsWith(".txt") ? "" : ".txt") + (gzip ? ".gz" : "")); + File outputFile = new File(directory.getCanonicalPath() + File.separator + fileId + ".txt" + (gzip ? ".gz" : "")); os = zip ? currentArchive : FileUtilities.getOutputStreamToFile(outputFile); if (zip) { // Initialize the ZIP output stream if necessary From 18de1e7dd0c6478b84f8536f060600762fa39515 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 30 Jul 2020 13:49:04 +0200 Subject: [PATCH 014/269] The Cord19 reader now prefers the abstract from the meta data. 
--- .../jcore/reader/cord19/Cord19Reader.java | 74 ++++++++++++------- .../cord19/Cord19MultiplierReaderTest.java | 5 +- 2 files changed, 48 insertions(+), 31 deletions(-) diff --git a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19Reader.java b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19Reader.java index 54a9f1d5c..176d0f6dd 100644 --- a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19Reader.java +++ b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19Reader.java @@ -121,31 +121,42 @@ private void addBody(JCas jCas, StringBuilder doctext, Cord19Document document) } private void addAbstract(JCas jCas, StringBuilder doctext, Cord19Document document) { - List sections = new ArrayList<>(document.getAbstr().size()); - int abstractBegin = doctext.length(); - // Stores the end of the last paragraph before the newline - int lastEnd = 0; - for (Paragraph p : document.getAbstr()) { - int paragraphBegin = doctext.length(); - AbstractSection as = new AbstractSection(jCas, paragraphBegin, doctext.length() + p.getText().length()); - doctext.append(p.getText()); - lastEnd = doctext.length(); - doctext.append(linesep); - AbstractSectionHeading asHeading = new AbstractSectionHeading(jCas); - asHeading.setTitleType("abstract"); - asHeading.setLabel(p.getSection()); - as.setAbstractSectionHeading(asHeading); - sections.add(as); - addReferences(p, Paragraph::getRefSpans, paragraphBegin, jCas); - addReferences(p, Paragraph::getEqSpans, paragraphBegin, jCas); - addReferences(p, Paragraph::getCiteSpans, paragraphBegin, jCas); - } - if (lastEnd - abstractBegin > 0) { - AbstractText abstractText = new AbstractText(jCas, abstractBegin, lastEnd); - abstractText.setAbstractType("main"); - abstractText.setStructuredAbstractParts(JCoReTools.addToFSArray(null, sections)); - abstractText.addToIndexes(); - doctext.append(linesep); + MetadataRecord metadataRecord = 
metadataIdMap.get(document.getPaperId()); + if (metadataRecord != null) { + String abstractText = metadataRecord.getAbstractText(); + if (abstractText != null && !abstractText.isBlank()) { + AbstractText abstractAnnotation = new AbstractText(jCas, doctext.length(),doctext.length() + abstractText.length()); + abstractAnnotation.setAbstractType("main"); + abstractAnnotation.addToIndexes(); + doctext.append(abstractText); + } + } else { + List sections = new ArrayList<>(document.getAbstr().size()); + int abstractBegin = doctext.length(); + // Stores the end of the last paragraph before the newline + int lastEnd = 0; + for (Paragraph p : document.getAbstr()) { + int paragraphBegin = doctext.length(); + AbstractSection as = new AbstractSection(jCas, paragraphBegin, doctext.length() + p.getText().length()); + doctext.append(p.getText()); + lastEnd = doctext.length(); + doctext.append(linesep); + AbstractSectionHeading asHeading = new AbstractSectionHeading(jCas); + asHeading.setTitleType("abstract"); + asHeading.setLabel(p.getSection()); + as.setAbstractSectionHeading(asHeading); + sections.add(as); + addReferences(p, Paragraph::getRefSpans, paragraphBegin, jCas); + addReferences(p, Paragraph::getEqSpans, paragraphBegin, jCas); + addReferences(p, Paragraph::getCiteSpans, paragraphBegin, jCas); + } + if (lastEnd - abstractBegin > 0) { + AbstractText abstractText = new AbstractText(jCas, abstractBegin, lastEnd); + abstractText.setAbstractType("main"); + abstractText.setStructuredAbstractParts(JCoReTools.addToFSArray(null, sections)); + abstractText.addToIndexes(); + doctext.append(linesep); + } } } @@ -164,7 +175,7 @@ private void addReferences(Paragraph p, Function> private void addTitle(JCas jCas, Cord19Document document, MetadataRecord metadataRecord, StringBuilder doctext) { if (metadataRecord != null) { String title = metadataRecord.getTitle(); - if (title != null) { + if (title != null && !title.isBlank()) { addTitle(jCas, title, doctext); } } else { @@ -221,9 
+232,10 @@ private void readMetaData(String metadataFile) { String cordUid = record.get("cord_uid"); String sha = record.get("sha"); String title = record.get("title"); + String abstractText = record.get("abstract"); String pmcid = record.get("pmcid"); String pmid = record.get("pubmed_id"); - MetadataRecord metadataRecord = new MetadataRecord(cordUid, sha, pmcid, pmid, title); + MetadataRecord metadataRecord = new MetadataRecord(cordUid, sha, pmcid, pmid, title, abstractText); for (String hash : metadataRecord.hashes) metadataIdMap.put(hash, metadataRecord); if (pmcid != null) @@ -244,13 +256,19 @@ private static class MetadataRecord { private final String pmid; private final String[] hashes; private final String title; + private String abstractText; - public MetadataRecord(String cordUid, String sha, String pmcid, String pmid, String title) { + public MetadataRecord(String cordUid, String sha, String pmcid, String pmid, String title, String abstractText) { this.cordUid = cordUid; this.pmcid = pmcid; this.pmid = pmid; this.title = title; this.hashes = Arrays.stream(sha.split(";")).map(String::trim).toArray(String[]::new); + this.abstractText = abstractText; + } + + public String getAbstractText() { + return abstractText; } public String getCordUid() { diff --git a/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReaderTest.java b/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReaderTest.java index b5922a816..f7a8e8fcf 100644 --- a/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReaderTest.java +++ b/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReaderTest.java @@ -63,9 +63,8 @@ private void checkSecondDocument(JCas cas) { assertThat(documentTitles.get(0)).extracting(Annotation::getCoveredText).isEqualTo("Recombinant M protein-based ELISA test for detection of antibodies to canine coronavirus"); AbstractText abstractText = 
JCasUtil.selectSingle(cas, AbstractText.class); - assertThat(abstractText.getCoveredText()).startsWith("The membrane (M) protein of canine"); - assertThat(abstractText.getCoveredText()).endsWith("antibodies to CCoV in dog sera. #"); - assertThat(abstractText.getStructuredAbstractParts()).hasSize(1); + assertThat(abstractText.getCoveredText()).startsWith("Abstract The membrane (M) protein of canine"); + assertThat(abstractText.getCoveredText()).endsWith("antibodies to CCoV in dog sera."); Collection paragraphs = JCasUtil.select(cas, Paragraph.class); assertThat(paragraphs).hasSize(19); From cda27d66c54ac20918867af9fae3baa5ef12dabb Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 31 Jul 2020 07:43:33 +0200 Subject: [PATCH 015/269] Merging an if-condition for correct branching. --- .../julielab/jcore/reader/cord19/Cord19Reader.java | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19Reader.java b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19Reader.java index 176d0f6dd..60939db2b 100644 --- a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19Reader.java +++ b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19Reader.java @@ -122,14 +122,12 @@ private void addBody(JCas jCas, StringBuilder doctext, Cord19Document document) private void addAbstract(JCas jCas, StringBuilder doctext, Cord19Document document) { MetadataRecord metadataRecord = metadataIdMap.get(document.getPaperId()); - if (metadataRecord != null) { + if (metadataRecord != null && metadataRecord.getAbstractText() != null && !metadataRecord.getAbstractText().isBlank()) { String abstractText = metadataRecord.getAbstractText(); - if (abstractText != null && !abstractText.isBlank()) { - AbstractText abstractAnnotation = new AbstractText(jCas, doctext.length(),doctext.length() + abstractText.length()); - 
abstractAnnotation.setAbstractType("main"); - abstractAnnotation.addToIndexes(); - doctext.append(abstractText); - } + AbstractText abstractAnnotation = new AbstractText(jCas, doctext.length(), doctext.length() + abstractText.length()); + abstractAnnotation.setAbstractType("main"); + abstractAnnotation.addToIndexes(); + doctext.append(abstractText); } else { List sections = new ArrayList<>(document.getAbstr().size()); int abstractBegin = doctext.length(); From 02ba4c09236d27e5192a5fe5254959895ee9b356 Mon Sep 17 00:00:00 2001 From: khituras Date: Sun, 2 Aug 2020 16:20:22 +0200 Subject: [PATCH 016/269] NPE issue with the Cord19MultiplierReader, adding an error message for debugging. --- .../jcore/reader/cord19/Cord19MultiplierReader.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReader.java b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReader.java index 5789d935b..3b8b9ff35 100644 --- a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReader.java +++ b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReader.java @@ -73,7 +73,11 @@ public void getNext(JCas jCas) throws CollectionException { Path p = currentFileBatch.get(currentBatchIndex); if (p != Cord19FileVisitor.END) { JCoReURI uri = new JCoReURI(jCas); - uri.setUri(p.toUri().toString()); + try { + uri.setUri(p.toUri().toString()); + } catch (NullPointerException e) { + log.error("Could not retrieve URI string for path {}, resolved URI {}", p, p!= null ? p.toUri() : ""); + } uri.addToIndexes(); ++completed; } From ccd03baf3cd61b9805ce081d580204adaafe6f7c Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 5 Aug 2020 10:49:12 +0200 Subject: [PATCH 017/269] Lingpipe Gaz: Fixed an issue where lowercasing could break the offset mapping. 
Special characters would get torn apart leading to difficult to track behaviour in the string normalization process. The solution is to use the transliterator for lowercasing. --- jcore-lingpipegazetteer-ae/pom.xml | 115 +- .../uima/GazetteerAnnotator.java | 1424 ++--- .../utils/StringNormalizerForChunking.java | 405 +- .../StringNormalizerForChunkingTest.java | 67 +- .../uima/GazetteerAnnotatorTest.java | 36 + .../src/test/resources/pehc.dict | 2 + .../src/test/resources/unused/bio_text.xmi | 3 - .../src/test/resources/unused/tmp.txt | 4878 ----------------- 8 files changed, 1055 insertions(+), 5875 deletions(-) create mode 100644 jcore-lingpipegazetteer-ae/src/test/resources/pehc.dict delete mode 100644 jcore-lingpipegazetteer-ae/src/test/resources/unused/bio_text.xmi delete mode 100644 jcore-lingpipegazetteer-ae/src/test/resources/unused/tmp.txt diff --git a/jcore-lingpipegazetteer-ae/pom.xml b/jcore-lingpipegazetteer-ae/pom.xml index 080a61539..686f9ae80 100644 --- a/jcore-lingpipegazetteer-ae/pom.xml +++ b/jcore-lingpipegazetteer-ae/pom.xml @@ -1,68 +1,77 @@ - + - 4.0.0 - jcore-lingpipe-gazetteer-ae - jar - JCoRe Lingpipe Gazetteer AE - Basically used as NE tagger based on Lingpipe's dictionary-lookup tagger. + 4.0.0 + jcore-lingpipe-gazetteer-ae + jar + JCoRe Lingpipe Gazetteer AE + Basically used as NE tagger based on Lingpipe's dictionary-lookup tagger. 
- - de.julielab - jcore-base - 2.6.0-SNAPSHOT - + + de.julielab + jcore-base + 2.6.0-SNAPSHOT + - - - de.julielab - jcore-descriptor-creator - + + + de.julielab + jcore-descriptor-creator + de.julielab jcore-types ${jcore-types-version} - - org.slf4j - slf4j-api - - - de.julielab - jcore-utilities - ${jcore-utilities-version} - - - ch.qos.logback - logback-classic - provided - - - com.ibm.icu - icu4j - 4.8.1.1 - - - de.julielab - aliasi-lingpipe - 4.1.2-JL1.0 - - - org.apache.commons - commons-lang3 - 3.4 - - junitjunit - - JULIE Lab, Germany - http://www.julielab.de - - + + org.slf4j + slf4j-api + + + de.julielab + jcore-utilities + ${jcore-utilities-version} + + + ch.qos.logback + logback-classic + provided + + + com.ibm.icu + icu4j + 4.8.1.1 + + + de.julielab + aliasi-lingpipe + 4.1.2-JL1.0 + + + org.apache.commons + commons-lang3 + 3.4 + + + org.assertj + assertj-core + + + junit + junit + + + + JULIE Lab, Germany + http://www.julielab.de + + GNU Affero General Public License, Version 3.0 http://www.gnu.org/licenses/agpl-3.0.en.html - https://github.com/JULIELab/jcore-base/tree/master/jcore-lingpipe-gazetteer-ae - + https://github.com/JULIELab/jcore-base/tree/master/jcore-lingpipe-gazetteer-ae + diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java index 6ddd3b58a..dd0c68c20 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java @@ -1,21 +1,20 @@ -/** - * +/** * Copyright (c) 2015, JULIE Lab. - * + *

* Author: tomanek, jwermter - * - * - * Creation date: Jan 14, 2008 - * + *

+ *

+ * Creation date: Jan 14, 2008 + *

* A entity tagger based on a dictionary lookup. Lingpipe's gazetteer is used. - * - * There are two modes: exact matching (only terms which map exactly to - * those specified in dictionary are found). Approximate matching (by means of - * weighted levenstein distance, approximate matches are found.) - * - * As approximate matching results in concurring matches on overlapping spans, I + *

+ * There are two modes: exact matching (only terms which map exactly to + * those specified in dictionary are found). Approximate matching (by means of + * weighted levenstein distance, approximate matches are found.) + *

+ * As approximate matching results in concurring matches on overlapping spans, I * added a mechanism to resolve this according to this rules: in overlapping matches - * the one with the best (here: lowest) score is taken, if more than one chunk has the + * the one with the best (here: lowest) score is taken, if more than one chunk has the * same score, the one with the longest span is chosen. **/ package de.julielab.jcore.ae.lingpipegazetteer.uima; @@ -57,702 +56,707 @@ public class GazetteerAnnotator extends JCasAnnotator_ImplBase { - private static final String COMPONENT_ID = GazetteerAnnotator.class.getCanonicalName(); - private static final Logger LOGGER = LoggerFactory.getLogger(GazetteerAnnotator.class); - public static final String CHUNKER_RESOURCE_NAME = "DictionaryChunkerProvider"; - // public final static String PARAM_USE_APPROXIMATE_MATCHING = - // "UseApproximateMatching"; - public final static String PARAM_CHECK_ACRONYMS = "CheckAcronyms"; - public final static String PARAM_OUTPUT_TYPE = "OutputType"; - /** - * Only required to set to false as an annotator parameter when using - * approximate matching and the ChunkerProvider is set to CaseSensitive false. - * That is because the approximate chunker is always case sensitive. - */ - // public final static String PARAM_CASE_SENSITIVE = "CaseSensitive"; - private static final String PARAM_USE_MANTRA_MODE = "MantraMode"; - /** - * Parameter to indicate whether text - CAS document text for this class - - * should be normalized by completely removing dashes, parenthesis, genitive 's - * and perhaps more. This is meant to replace the generation of term variants - * and cannot be used together with variation generation. If this is switched on - * here, it must also be switched on in the external resource configuration for - * the ChunkerProvider! Can only be used with alternative ChunkerProviderImplAlt - * implementation. 
- */ - // public final static String PARAM_NORMALIZE_TEXT = "NormalizeText"; - /** - * Parameter to indicate whether text - CAS document text for this class - - * should be transliterated, i.e. whether accents and other character variations - * should be stripped. If this is switched on here, it must also be switched on - * in the external resource configuration for the ChunkerProvider! Can only be - * used with alternative ChunkerProviderImplAlt implementation. - */ - // public final static String PARAM_TRANSLITERATE_TEXT = - // "TransliterateText"; - - @ConfigurationParameter(name = PARAM_USE_MANTRA_MODE, defaultValue = "false") - private boolean mantraMode = false; - - // needs to be true because of chunker injection: - @ConfigurationParameter(name = PARAM_CHECK_ACRONYMS, defaultValue = "true") - private boolean checkAcronyms = true; - @ConfigurationParameter(name = PARAM_OUTPUT_TYPE) - private String outputType = null; - - @ExternalResource(key = CHUNKER_RESOURCE_NAME, mandatory = true) - private ChunkerProvider provider; - /** - * Removes diacritics and does lower casing - */ - private Transliterator transliterator; - private Chunker gazetteer = null; - private TokenizerFactory normalizationTokenFactory; - private Set stopWords; - - // TODO for debug only - private static int initializeCount = 0; - - public void initialize(UimaContext aContext) throws ResourceInitializationException { - LOGGER.info("calls to initialize: " + initializeCount); - - super.initialize(aContext); - LOGGER.info("initialize() - initializing GazetteerAnnotator..."); - - try { - provider = (ChunkerProvider) getContext().getResourceObject(CHUNKER_RESOURCE_NAME); - gazetteer = provider.getChunker(); + public static final String CHUNKER_RESOURCE_NAME = "DictionaryChunkerProvider"; + // public final static String PARAM_USE_APPROXIMATE_MATCHING = + // "UseApproximateMatching"; + public final static String PARAM_CHECK_ACRONYMS = "CheckAcronyms"; + public final static String PARAM_OUTPUT_TYPE = 
"OutputType"; + private static final String COMPONENT_ID = GazetteerAnnotator.class.getCanonicalName(); + private static final Logger LOGGER = LoggerFactory.getLogger(GazetteerAnnotator.class); + /** + * Only required to set to false as an annotator parameter when using + * approximate matching and the ChunkerProvider is set to CaseSensitive false. + * That is because the approximate chunker is always case sensitive. + */ + // public final static String PARAM_CASE_SENSITIVE = "CaseSensitive"; + private static final String PARAM_USE_MANTRA_MODE = "MantraMode"; + /** + * Parameter to indicate whether text - CAS document text for this class - + * should be normalized by completely removing dashes, parenthesis, genitive 's + * and perhaps more. This is meant to replace the generation of term variants + * and cannot be used together with variation generation. If this is switched on + * here, it must also be switched on in the external resource configuration for + * the ChunkerProvider! Can only be used with alternative ChunkerProviderImplAlt + * implementation. + */ + // public final static String PARAM_NORMALIZE_TEXT = "NormalizeText"; + // TODO for debug only + private static int initializeCount = 0; + /** + * Parameter to indicate whether text - CAS document text for this class - + * should be transliterated, i.e. whether accents and other character variations + * should be stripped. If this is switched on here, it must also be switched on + * in the external resource configuration for the ChunkerProvider! Can only be + * used with alternative ChunkerProviderImplAlt implementation. 
+ */ + // public final static String PARAM_TRANSLITERATE_TEXT = + // "TransliterateText"; + + @ConfigurationParameter(name = PARAM_USE_MANTRA_MODE, defaultValue = "false") + private boolean mantraMode = false; + // needs to be true because of chunker injection: + @ConfigurationParameter(name = PARAM_CHECK_ACRONYMS, defaultValue = "true") + private boolean checkAcronyms = true; + @ConfigurationParameter(name = PARAM_OUTPUT_TYPE) + private String outputType = null; + @ExternalResource(key = CHUNKER_RESOURCE_NAME, mandatory = true) + private ChunkerProvider provider; + /** + * Removes diacritics and does lower casing + */ + private Transliterator transliterator; + private Chunker gazetteer = null; + private TokenizerFactory normalizationTokenFactory; + private Set stopWords; + + static boolean filterParenthesis(String chunkText) { + Stack parenthesisStack = new Stack<>(); + // Map pMap = new HashMap<>(); + for (int i = 0; i < chunkText.length(); i++) { + char current = chunkText.charAt(i); + if (isParentheses(current)) { + if (isOpenedParentheses(current)) { + parenthesisStack.add(current); + } else { + if (parenthesisStack.isEmpty()) + return true; + if (!isParenthesisCounterpart(parenthesisStack.pop(), current)) + return true; + } + } + } + if (!parenthesisStack.isEmpty()) + return true; + return false; + } + + private static boolean isParenthesisCounterpart(Character char1, Character char2) { + ParenthesisType char1ParenthesisType = getParenthesisType(char2); + ParenthesisType char2ParenthesisType = getParenthesisType(char1); + if (char1ParenthesisType == ParenthesisType.NONE || char2ParenthesisType == ParenthesisType.NONE) + throw new IllegalArgumentException("The two characters '" + char1 + "' and '" + char2 + + "' were given in order to determine whether they are compatible parenthesis counterparts, but at least one of those characters is no parentheses."); + return char1ParenthesisType.equals(char2ParenthesisType); + } + + static ParenthesisType 
getParenthesisType(char current) { + switch (current) { + case '(': + case ')': + return ParenthesisType.ROUND; + case '[': + case ']': + return ParenthesisType.BRACKET; + case '{': + case '}': + return ParenthesisType.CURLY; + default: + return ParenthesisType.NONE; + } + } + + static boolean isParentheses(char current) { + return isOpenedParentheses(current) || isClosedParentheses(current); + } + + static boolean isOpenedParentheses(char current) { + switch (current) { + case '(': + case '[': + case '{': + return true; + default: + return false; + } + } + + static boolean isClosedParentheses(char current) { + switch (current) { + case ')': + case ']': + case '}': + return true; + default: + return false; + } + } + + static List groupOverlappingChunks(List chunkList, String chunkedText) { + // sort chunkList so the grouping works as intended + Collections.sort(chunkList, new Comparator() { + + @Override + public int compare(Chunk o1, Chunk o2) { + return o1.start() - o2.start(); + } + + }); + // group overlapping chunks + List overlappingChunks = new ArrayList(); + for (Chunk chunk : chunkList) { + // for debugging + // System.out.println("chunking.add(ChunkFactory.createChunk(" + + // chunk.start() + ", " + chunk.end() + + // ", 0d));"); + boolean added = false; + for (OverlappingChunk over : overlappingChunks) { + if (over.isOverlappingSpan(chunk.start(), chunk.end())) { + over.addChunk(chunk.start(), chunk.end(), chunk); + added = true; + } + } + if (!added) { + overlappingChunks.add(new OverlappingChunk(chunk.start(), chunk.end(), chunk, chunkedText)); + added = true; + } + } + return overlappingChunks; + } + + // enum ParenthesesType { + // ROUND_CLOSED { + // @Override + // boolean isOpen() { + // return false; + // } + // + // }, + // BRACKET_CLOSED { + // @Override + // boolean isOpen() { + // return false; + // } + // }, + // CURLY_CLOSED { + // @Override + // boolean isOpen() { + // return false; + // } + // + // }, + // ROUND_OPENED { + // @Override + 
// boolean isOpen() { + // return true; + // } + // }, + // BRACKET_OPENED { + // @Override + // boolean isOpen() { + // return true; + // } + // }, + // CURLY_OPENED { + // @Override + // boolean isOpen() { + // return true; + // } + // }; + // abstract boolean isOpen(); + // + // boolean isClose() { + // return !isOpen(); + // }; + // } + + public void initialize(UimaContext aContext) throws ResourceInitializationException { + LOGGER.info("calls to initialize: " + initializeCount); + + super.initialize(aContext); + LOGGER.info("initialize() - initializing GazetteerAnnotator..."); + + try { + provider = (ChunkerProvider) getContext().getResourceObject(CHUNKER_RESOURCE_NAME); + gazetteer = provider.getChunker(); // stopWords = provider.getStopWords(); - String[] stopwordArray = { "a", "about", "above", "across", "after", "afterwards", "again", "against", - "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", - "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", - "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", - "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", - "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", - "cant", "co", "computer", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", - "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", - "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", - "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", - "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", - "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", - "herself", "high", "him", "himself", "his", "how", 
"however", "hundred", "i", "ie", "if", "in", - "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", - "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", - "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", - "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", - "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", - "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", - "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", - "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", - "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", - "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", - "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", - "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though", "three", - "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", - "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", - "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", - "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", - "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", - "you", "your", "yours", "yourself", "yourselves", }; - stopWords = new HashSet<>(); - for (String sw : stopwordArray) - stopWords.add(sw); - } catch (ResourceAccessException e) { - LOGGER.error("Exception while initializing", e); - } - - // check acronyms - checkAcronyms = (Boolean) 
aContext.getConfigParameterValue(PARAM_CHECK_ACRONYMS); - LOGGER.info( - "Check for acronyms (found dictionary entries that are abbreviations are only accepted if their long form is an abbreviation of the same type, too): {}", - checkAcronyms); - // filter stop words - - Boolean normalizeBoolean = provider.getNormalize();// (Boolean) - // aContext.getConfigParameterValue(PARAM_NORMALIZE_TEXT); - if (normalizeBoolean) { - normalizationTokenFactory = new IndoEuropeanTokenizerFactory(); - } - LOGGER.info("Normalize CAS document text (i.e. do stemming and remove possessive 's): {}", provider.getNormalize()); - - Boolean transliterateBoolean = provider.getTransliterate();// (Boolean) - // aContext.getConfigParameterValue(PARAM_TRANSLITERATE_TEXT); - if (transliterateBoolean) { - transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower"); - } - LOGGER.info("Transliterate CAS document text (i.e. transform accented characters to their base forms): {}", - provider.getTransliterate()); - - // define output level - outputType = (String) aContext.getConfigParameterValue(PARAM_OUTPUT_TYPE); - if (outputType == null) { - LOGGER.error("initialize() - output type not specified."); - throw new ResourceInitializationException(); - } - - mantraMode = aContext.getConfigParameterValue(PARAM_USE_MANTRA_MODE) != null - ? (Boolean) aContext.getConfigParameterValue(PARAM_USE_MANTRA_MODE) - : false; - } - - /** - * process the CAS, there are two subroutines: one for exact and one for - * approximate matching. - */ - public void process(JCas aJCas) throws AnalysisEngineProcessException { - if (gazetteer == null) - throw new IllegalStateException("The actual gazetteer object is null. 
Check previous log messages pointing to the error (most probably the dictionary file could not be found)."); - String docText = aJCas.getDocumentText(); - if (docText == null || docText.length() == 0) - return; - if (provider.getUseApproximateMatching() && !provider.getTransliterate() && !provider.getCaseSensitive()) - docText = docText.toLowerCase(); - NormalizedString normalizedDocText = null; - if (provider.getNormalize()) { - normalizedDocText = StringNormalizerForChunking.normalizeString(docText, normalizationTokenFactory, - transliterator); - } - - IndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator(); - JCoReHashMapAnnotationIndex conceptMentionIndex = new JCoReHashMapAnnotationIndex<>( - longOffsetTermGenerator, longOffsetTermGenerator, aJCas, ConceptMention.type); - JCoReHashMapAnnotationIndex abbreviationIndex = new JCoReHashMapAnnotationIndex<>( - longOffsetTermGenerator, longOffsetTermGenerator, aJCas, Abbreviation.type); - - LOGGER.debug("Performing actual Gazetteer annotation..."); - Chunking chunking; - if (provider.getNormalize()) - chunking = gazetteer.chunk(normalizedDocText.string); - else - chunking = gazetteer.chunk(docText); - LOGGER.debug("Gazetteer annotation done."); - if (provider.getUseApproximateMatching()) { - /* - * handle matches found by approx matching: this means especially overlapping - * matches with different scores (doesn't happen with exact matches) - */ - List chunkList = filterChunking(chunking); - List overlappingChunks = groupOverlappingChunks(chunkList, - chunking.charSequence().toString()); - // now add the best chunk of all overlappingChunks to the CAS - LOGGER.debug("all overlapping chunks:\n"); - // Set bestChunksSet = new HashSet<>(); - for (OverlappingChunk overlappingChunk : overlappingChunks) { - // show chunks - LOGGER.debug(overlappingChunk.toStringAll()); - List bestChunks = overlappingChunk.getBestChunks(); - LOGGER.debug("Found {} best chunks.", bestChunks.size()); - for (int i 
= 0; i < bestChunks.size(); i++) { - Chunk bestChunk = bestChunks.get(i); - LOGGER.debug("Nr. " + i + " best chunk: " + bestChunk.start() + " - " + bestChunk.end() + ": " - + bestChunk.score() + " ; type: " + bestChunk.type()); - // TODO this check and the corresponding set may be removed - // when this exception hasn't been thrown - // in a - // while. Its currently just to be sure, this should not - // happen any more since the chunks are sorted - // by - // offset in the grouping method. - // if (bestChunksSet.contains(bestChunk)) { - // throw new IllegalStateException("Duplicate best chunk: " + bestChunk); - // } - // bestChunksSet.add(bestChunk); - // add 2 cas - add2Cas(aJCas, bestChunk, normalizedDocText, conceptMentionIndex, abbreviationIndex); - } - } - // for (Chunk chunk : chunking.chunkSet()) { - // add2Cas(aJCas, chunk, normalizedDocText); - // } - } else { - for (Chunk chunk : chunking.chunkSet()) { - add2Cas(aJCas, chunk, normalizedDocText, conceptMentionIndex, abbreviationIndex); - } - } - if (checkAcronyms && !mantraMode) { - LOGGER.debug("process() - checking acronyms"); - annotateAcronymsWithFullFormEntity(aJCas, conceptMentionIndex); - } - } - - private List filterChunking(Chunking chunking) { - // ChunkingImpl newChunking = new ChunkingImpl(chunking.charSequence()); - List newChunking = new ArrayList<>(chunking.chunkSet().size()); - for (Chunk chunk : chunking.chunkSet()) { - String chunkText = chunking.charSequence().subSequence(chunk.start(), chunk.end()).toString(); - if (filterParenthesis(chunkText)) - continue; - if (filterPunctuationArtifacts(chunkText)) - continue; - if (filterStopwords(chunkText)) - continue; - newChunking.add(chunk); - } - return newChunking; - } - - private boolean filterPunctuationArtifacts(String chunkText) { - if (chunkText.startsWith("-")) - return true; - if (chunkText.endsWith("-")) - return true; - return false; - } - - private boolean filterStopwords(String chunkText) { - if 
(stopWords.contains(chunkText.toLowerCase())) - return true; - if (chunkText.contains(" ")) { - String[] words = chunkText.split(" "); - int stopWordCounter = 0; - for (String word : words) { - if (stopWords.contains(word.toLowerCase())) - stopWordCounter++; - } - if (Math.ceil(words.length / 2.0) <= stopWordCounter) { - LOGGER.debug("Filtering due to high stop word occurrences: {}", chunkText); - return true; - } - } - return false; - } - - static boolean filterParenthesis(String chunkText) { - Stack parenthesisStack = new Stack<>(); - // Map pMap = new HashMap<>(); - for (int i = 0; i < chunkText.length(); i++) { - char current = chunkText.charAt(i); - if (isParentheses(current)) { - if (isOpenedParentheses(current)) { - parenthesisStack.add(current); - } else { - if (parenthesisStack.isEmpty()) - return true; - if (!isParenthesisCounterpart(parenthesisStack.pop(), current)) - return true; - } - } - } - if (!parenthesisStack.isEmpty()) - return true; - return false; - } - - private static boolean isParenthesisCounterpart(Character char1, Character char2) { - ParenthesisType char1ParenthesisType = getParenthesisType(char2); - ParenthesisType char2ParenthesisType = getParenthesisType(char1); - if (char1ParenthesisType == ParenthesisType.NONE || char2ParenthesisType == ParenthesisType.NONE) - throw new IllegalArgumentException("The two characters '" + char1 + "' and '" + char2 - + "' were given in order to determine whether they are compatible parenthesis counterparts, but at least one of those characters is no parentheses."); - return char1ParenthesisType.equals(char2ParenthesisType); - } - - // enum ParenthesesType { - // ROUND_CLOSED { - // @Override - // boolean isOpen() { - // return false; - // } - // - // }, - // BRACKET_CLOSED { - // @Override - // boolean isOpen() { - // return false; - // } - // }, - // CURLY_CLOSED { - // @Override - // boolean isOpen() { - // return false; - // } - // - // }, - // ROUND_OPENED { - // @Override - // boolean isOpen() { - 
// return true; - // } - // }, - // BRACKET_OPENED { - // @Override - // boolean isOpen() { - // return true; - // } - // }, - // CURLY_OPENED { - // @Override - // boolean isOpen() { - // return true; - // } - // }; - // abstract boolean isOpen(); - // - // boolean isClose() { - // return !isOpen(); - // }; - // } - - enum ParenthesisType { - ROUND, BRACKET, CURLY, NONE - } - - static ParenthesisType getParenthesisType(char current) { - switch (current) { - case '(': - case ')': - return ParenthesisType.ROUND; - case '[': - case ']': - return ParenthesisType.BRACKET; - case '{': - case '}': - return ParenthesisType.CURLY; - default: - return ParenthesisType.NONE; - } - } - - static boolean isParentheses(char current) { - return isOpenedParentheses(current) || isClosedParentheses(current); - } - - static boolean isOpenedParentheses(char current) { - switch (current) { - case '(': - case '[': - case '{': - return true; - default: - return false; - } - } - - static boolean isClosedParentheses(char current) { - switch (current) { - case ')': - case ']': - case '}': - return true; - default: - return false; - } - } - - static List groupOverlappingChunks(List chunkList, String chunkedText) { - // sort chunkList so the grouping works as intended - Collections.sort(chunkList, new Comparator() { - - @Override - public int compare(Chunk o1, Chunk o2) { - return o1.start() - o2.start(); - } - - }); - // group overlapping chunks - List overlappingChunks = new ArrayList(); - for (Chunk chunk : chunkList) { - // for debugging - // System.out.println("chunking.add(ChunkFactory.createChunk(" + - // chunk.start() + ", " + chunk.end() + - // ", 0d));"); - boolean added = false; - for (OverlappingChunk over : overlappingChunks) { - if (over.isOverlappingSpan(chunk.start(), chunk.end())) { - over.addChunk(chunk.start(), chunk.end(), chunk); - added = true; - } - } - if (!added) { - overlappingChunks.add(new OverlappingChunk(chunk.start(), chunk.end(), chunk, chunkedText)); - added = 
true; - } - } - return overlappingChunks; - } - - // ------------ INFO .......... - // String text = aJCas.getDocumentText(); - // int start = chunk.start(); - // int end = chunk.end(); - // String type = chunk.type(); - // double score = chunk.score(); - // String phrase = text.substring(start, end); - // System.out.println(" found phrase=|" + phrase + "|" - // + " start=" + start + " end=" + end + " type=" + type - // + " score=" + score); - // ------------ INFO .......... - /** - * checks whether a chunk (= dictionary match) is an acronym. If yes, checks - * whether respective full form (obtained via abbr textReference) is - * ConceptMention and has same specificType as chunk If these conditions are not - * fulfilled, no entity annotation will be made. - * - * @param abbreviationIndex - * @param conceptMentionIndex - */ - private boolean isAcronymWithSameFullFormSpecificType(JCas aJCas, Chunk chunk, NormalizedString normalizedDocText, - JCoReHashMapAnnotationIndex conceptMentionIndex, - JCoReHashMapAnnotationIndex abbreviationIndex) { - // Annotation anno; - int start; - int end; - if (provider.getNormalize()) { - try { - start = normalizedDocText.getOriginalOffset(chunk.start()); - end = normalizedDocText.getOriginalOffset(chunk.end()); - } catch (Exception e) { - System.out.println("Text: " + normalizedDocText); - System.out.println("Chunk: " + chunk); - System.out.println("Chunk end: " + chunk.end()); - System.out - .println("Normalized Text: " + normalizedDocText.string.substring(chunk.start(), chunk.end())); - throw e; - } - // anno = new Annotation(aJCas, start, end); - } else { - start = chunk.start(); - end = chunk.end(); - } - - LongOffsetIndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator(); - // Retrieves potential abbr annotation - Abbreviation abbr = abbreviationIndex.getFirst(longOffsetTermGenerator.forOffsets(start, end)); - // check whether it's an abbr - String chunktext = null; - if (LOGGER.isDebugEnabled()) - 
chunktext = aJCas.getDocumentText().substring(start, end); - if (abbr == null) { - LOGGER.debug("{} chunk \"{}\" is not an abbreviation\n", chunk, chunktext); - return true; - } - // checks whether respective full form is ConceptMention - AbbreviationLongform textRef = abbr.getTextReference(); - ConceptMention em = conceptMentionIndex.getFirst(textRef); - if (em == null) { - LOGGER.debug( - chunk + " chunk \"{}\" is an abbreviation but respective full \"{}\" form is no ConceptMention\n", - chunktext, textRef.getCoveredText()); - return false; - } - - // checks whether full form annotation matches the type to be annotated - // here - String emType = em.getClass().getCanonicalName(); - if (emType.equals(outputType)) { - LOGGER.debug(chunk - + " chunk \"{}\" is an abbreviation and respective full form \"{}\" is ConceptMention with same type as OutputType\n", - chunktext, em.getCoveredText()); - return true; - } - - LOGGER.debug(chunk - + " chunk \"{}\" is an abbreviation but respective full form \"{}\" is ConceptMention without the correct OutputType (is: {}; OutputType: {})\n", - new Object[] { chunktext, em.getCoveredText(), emType, outputType }); - return false; - } - - /** - * adds a chunk as an annotation to the CAS - * - * @param normalizedDocText - * @param abbreviationIndex - * @param conceptMentionIndex - */ - private void add2Cas(JCas aJCas, Chunk chunk, NormalizedString normalizedDocText, - JCoReHashMapAnnotationIndex conceptMentionIndex, - JCoReHashMapAnnotationIndex abbreviationIndex) throws AnalysisEngineProcessException { - // System.out.println("CHUNK: start=" + chunk.start() + " end=" + - // chunk.end()); - // if checkAcronyms, then check acronyms for compliant full forms (= - // with same specificType) - if (checkAcronyms && !isAcronymWithSameFullFormSpecificType(aJCas, chunk, normalizedDocText, - conceptMentionIndex, abbreviationIndex)) { - return; - } - - int start = provider.getNormalize() ? 
normalizedDocText.getOriginalOffset(chunk.start()) : chunk.start(); - int end = provider.getNormalize() ? normalizedDocText.getOriginalOffset(chunk.end()) : chunk.end(); - - try { - if (mantraMode) { - // the "type" string is used to transport all data needed for - // the MAN-XML format - for (String term : chunk.type().split("@@TERM@@")) { - // @@ is used to separate source, cui, type(s) and group (in - // this order!) - String[] info = term.split("@@"); - Entity newEntity = (Entity) JCoReAnnotationTools.getAnnotationByClassName(aJCas, - "de.julielab.jcore.types.mantra.Entity"); - newEntity.setBegin(start); - newEntity.setEnd(end); - newEntity.setComponentId(COMPONENT_ID); - newEntity.setConfidence(chunk.score() + ""); - - // mantra specific - newEntity.setSource(info[0]); - newEntity.setCui(info[1]); - newEntity.setSemanticType(info[2]); - newEntity.setSemanticGroup(info[3]); - - newEntity.addToIndexes(); - } - } else { - ConceptMention newEntity = (ConceptMention) JCoReAnnotationTools.getAnnotationByClassName(aJCas, - outputType); - newEntity.setBegin(start); - newEntity.setEnd(end); - - // String entityText = newEntity.getCoveredText(); - // if (stopWords.contains(entityText.toLowerCase())) - // return; - // if (entityText.contains(" ")) { - // String[] words = entityText.split(" "); - // int stopWordCounter = 0; - // for (String word : words) { - // if (stopWords.contains(word.toLowerCase())) - // stopWordCounter++; - // } - // if (words.length == stopWordCounter) - // return; - // } - - newEntity.setSpecificType(chunk.type()); - newEntity.setComponentId(COMPONENT_ID); - newEntity.setConfidence(chunk.score() + ""); - newEntity.addToIndexes(); - - conceptMentionIndex.index(newEntity); - } - } catch (Exception e) { - LOGGER.error("process() - could not generate output type: " + e.getMessage()); - e.printStackTrace(); - throw new AnalysisEngineProcessException(e); - } - } - - private void annotateAcronymsWithFullFormEntity(JCas aJCas, - 
JCoReHashMapAnnotationIndex conceptMentionIndex) - throws AnalysisEngineProcessException { - - JFSIndexRepository indexes = aJCas.getJFSIndexRepository(); - FSIterator abbrevIter = indexes.getAnnotationIndex(Abbreviation.type).iterator(); - IndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator(); - - // loop over all abbreviations - while (abbrevIter.hasNext()) { - Abbreviation abbrev = (Abbreviation) abbrevIter.next(); - AbbreviationLongform fullFormAnnotation = abbrev.getTextReference(); - LOGGER.debug("annotateAcronymsWithFullFormEntity() - checking abbreviation: " + abbrev.getCoveredText()); - ConceptMention emFullform = null;// AnnotationRetrieval.getMatchingAnnotation(aJCas, fullFormAnnotation, - // ConceptMention.class); - emFullform = conceptMentionIndex.getFirst(fullFormAnnotation); - - // The following code was once introduced for gene tagging. There, - // the acronym fullforms sometimes miss minor parts of an annotated - // gene, leading to non-annotated acronyms that would have been - // correct. - // However, for general-purpose concept recognition this approach - // can be quite harmful. Example: "Anaphase-promoting complex (APC)" - // where only "anaphase" is recognized as concept. Now, "APC" would - // be annotated as an acronym for "anaphase". Here, a better - // recognition of the abbreviation span is required. - // ConceptMention emFullform = null; - // List conceptsInFullform = - // JCoReAnnotationTools.getIncludedAnnotations(aJCas, - // fullFormAnnotation, - // ConceptMention.class); - // if (conceptsInFullform.size() == 1) { - // emFullform = conceptsInFullform.get(0); - // LOGGER.debug("Found a single ConceptMention included in the full - // form: {}", emFullform.getCoveredText()); - // } else if (conceptsInFullform.size() > 1) { - // // If there are multiple ConceptMentions found in the full form, - // take that largest right-most candidate. 
- // int maxSize = -1; - // for (ConceptMention em : conceptsInFullform) { - // int emSize = em.getEnd() - em.getBegin(); - // if (emSize > maxSize) { - // emFullform = em; - // maxSize = emSize; - // } - // } - // LOGGER.debug("Found multiple ConceptMentions included in the full - // form \"{}\", returning the longest.", - // fullFormAnnotation.getCoveredText()); - // if (LOGGER.isTraceEnabled()) { - // LOGGER.trace("All found ConceptMentions:"); - // for (ConceptMention cm : conceptsInFullform) { - // LOGGER.trace("Text: {}; offsets: {}-{}", - // new Object[] { cm.getCoveredText(), cm.getBegin(), cm.getEnd() - // }); - // } - // } - // } else { - // LOGGER.debug("No ConceptMention in the span of acronym fullform - // \"{}\" found.", - // fullFormAnnotation.getCoveredText()); - // } - - String type = null; - if (emFullform != null) - type = emFullform.getClass().getCanonicalName(); - - ConceptMention emAcronym = null;// AnnotationRetrieval.getMatchingAnnotation(aJCas, abbrev, - // ConceptMention.class); - emAcronym = conceptMentionIndex.getFirst(abbrev); - // This is really slow, really a pain with full texts. - // It was originally introduced to push recall for gene recognition. - // So now we will lose (a bit) of recognition performance there. 
- // ConceptMention emAcronym = - // JCoReAnnotationTools.getPartiallyOverlappingAnnotation(aJCas, - // abbrev, - // ConceptMention.class); - - // if type of the entity is equal to the output type for this - // annotator - if (type != null && type.equals(outputType)) { - if (emFullform == null) { - LOGGER.debug( - "annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no ConceptMention\n"); - continue; - } - if (emFullform.getComponentId() != null && emFullform.getComponentId().equals(COMPONENT_ID) - && (emAcronym == null - || !emAcronym.getClass().getName().equals(emFullform.getClass().getName()))) { - - try { - LOGGER.debug("annotateAcronymsWithFullFormEntity() - fullform of abbreviation (" - + abbrev.getCoveredText() + " [begin=" + abbrev.getBegin() + "; end=" + abbrev.getEnd() - + "]) has ConceptMention: " + emFullform.toString()); - ConceptMention newEntityOnAcronym = (ConceptMention) JCoReAnnotationTools - .getAnnotationByClassName(aJCas, outputType); - newEntityOnAcronym.setBegin(abbrev.getBegin()); - newEntityOnAcronym.setEnd(abbrev.getEnd()); - newEntityOnAcronym.setTextualRepresentation(newEntityOnAcronym.getCoveredText()); - newEntityOnAcronym.setSpecificType(emFullform.getSpecificType()); - newEntityOnAcronym.setComponentId(COMPONENT_ID + "+acronym"); - newEntityOnAcronym.setConfidence(emFullform.getConfidence() + ""); - newEntityOnAcronym.addToIndexes(); - - } catch (Exception e) { - LOGGER.error("process() - could not generate output type: " + e.getMessage()); - e.printStackTrace(); - throw new AnalysisEngineProcessException(AnalysisEngineProcessException.ANNOTATOR_EXCEPTION, - null); - } - - } else { - if (emAcronym == null) - LOGGER.debug("annotateAcronymsWithFullFormEntity() - emAcronym != null"); - else if (emAcronym.getClass().getName().equals(emFullform.getClass().getName())) - LOGGER.debug("annotateAcronymsWithFullFormEntity() - emAcroType=" - + emAcronym.getClass().getCanonicalName() + " == emFullformType=" - + 
emFullform.getClass().getCanonicalName()); - } - - } - } - } + String[] stopwordArray = {"a", "about", "above", "across", "after", "afterwards", "again", "against", + "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", + "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", + "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", + "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", + "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", + "cant", "co", "computer", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", + "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", + "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", + "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", + "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", + "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", + "herself", "high", "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in", + "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", + "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", + "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", + "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", + "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", + "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", + "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", 
"seem", "seemed", + "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", + "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", + "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", + "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", + "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though", "three", + "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", + "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", + "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", + "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", + "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", + "you", "your", "yours", "yourself", "yourselves",}; + stopWords = new HashSet<>(); + for (String sw : stopwordArray) + stopWords.add(sw); + } catch (ResourceAccessException e) { + LOGGER.error("Exception while initializing", e); + } + + // check acronyms + checkAcronyms = (Boolean) aContext.getConfigParameterValue(PARAM_CHECK_ACRONYMS); + LOGGER.info( + "Check for acronyms (found dictionary entries that are abbreviations are only accepted if their long form is an abbreviation of the same type, too): {}", + checkAcronyms); + // filter stop words + + Boolean normalizeBoolean = provider.getNormalize();// (Boolean) + // aContext.getConfigParameterValue(PARAM_NORMALIZE_TEXT); + if (normalizeBoolean) { + normalizationTokenFactory = new IndoEuropeanTokenizerFactory(); + } + LOGGER.info("Normalize CAS document text (i.e. 
do stemming and remove possessive 's): {}", provider.getNormalize()); + + Boolean transliterateBoolean = provider.getTransliterate();// (Boolean) + // aContext.getConfigParameterValue(PARAM_TRANSLITERATE_TEXT); + if (transliterateBoolean || !provider.getCaseSensitive()) { + transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower"); + } + LOGGER.info("Transliterate CAS document text (i.e. transform accented characters to their base forms): {}", + provider.getTransliterate()); + + // define output level + outputType = (String) aContext.getConfigParameterValue(PARAM_OUTPUT_TYPE); + if (outputType == null) { + LOGGER.error("initialize() - output type not specified."); + throw new ResourceInitializationException(); + } + + mantraMode = aContext.getConfigParameterValue(PARAM_USE_MANTRA_MODE) != null + ? (Boolean) aContext.getConfigParameterValue(PARAM_USE_MANTRA_MODE) + : false; + } + + /** + * process the CAS, there are two subroutines: one for exact and one for + * approximate matching. + */ + public void process(JCas aJCas) throws AnalysisEngineProcessException { + if (gazetteer == null) + throw new IllegalStateException("The actual gazetteer object is null. 
Check previous log messages pointing to the error (most probably the dictionary file could not be found)."); + String docText = aJCas.getDocumentText(); + if (docText == null || docText.length() == 0) + return; + if (provider.getUseApproximateMatching() && !provider.getTransliterate() && !provider.getCaseSensitive()) + // We use the transliterator because it does lowercasing and also solves issues that could arise due + // to the normal docText.toLowerCase() call which would break special characters sometimes + docText = transliterator.transform(docText); + NormalizedString normalizedDocText = null; + if (provider.getNormalize()) { + normalizedDocText = StringNormalizerForChunking.normalizeString(docText, normalizationTokenFactory, + transliterator); + System.out.println(normalizedDocText.getOffsetMap()); + System.out.println(normalizedDocText.string); + } + + IndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator(); + JCoReHashMapAnnotationIndex conceptMentionIndex = new JCoReHashMapAnnotationIndex<>( + longOffsetTermGenerator, longOffsetTermGenerator, aJCas, ConceptMention.type); + JCoReHashMapAnnotationIndex abbreviationIndex = new JCoReHashMapAnnotationIndex<>( + longOffsetTermGenerator, longOffsetTermGenerator, aJCas, Abbreviation.type); + + LOGGER.debug("Performing actual Gazetteer annotation..."); + Chunking chunking; + if (provider.getNormalize()) + chunking = gazetteer.chunk(normalizedDocText.string); + else + chunking = gazetteer.chunk(docText); + LOGGER.debug("Gazetteer annotation done."); + if (provider.getUseApproximateMatching()) { + /* + * handle matches found by approx matching: this means especially overlapping + * matches with different scores (doesn't happen with exact matches) + */ + List chunkList = filterChunking(chunking); + List overlappingChunks = groupOverlappingChunks(chunkList, + chunking.charSequence().toString()); + // now add the best chunk of all overlappingChunks to the CAS + LOGGER.debug("all 
overlapping chunks:\n"); + // Set bestChunksSet = new HashSet<>(); + for (OverlappingChunk overlappingChunk : overlappingChunks) { + // show chunks + LOGGER.debug(overlappingChunk.toStringAll()); + List bestChunks = overlappingChunk.getBestChunks(); + LOGGER.debug("Found {} best chunks.", bestChunks.size()); + for (int i = 0; i < bestChunks.size(); i++) { + Chunk bestChunk = bestChunks.get(i); + if (LOGGER.isDebugEnabled()) { + String chunkText = provider.getNormalize() ? normalizedDocText.string.substring(bestChunk.start(), bestChunk.end()) : aJCas.getDocumentText().substring(bestChunk.start(), bestChunk.end()); + LOGGER.debug("Nr. " + i + " best chunk: " + bestChunk.start() + " - " + bestChunk.end() + ": " + + bestChunk.score() + " ; type: " + bestChunk.type() + " ; text: " + chunkText); + } + // TODO this check and the corresponding set may be removed + // when this exception hasn't been thrown + // in a + // while. Its currently just to be sure, this should not + // happen any more since the chunks are sorted + // by + // offset in the grouping method. 
+ // if (bestChunksSet.contains(bestChunk)) { + // throw new IllegalStateException("Duplicate best chunk: " + bestChunk); + // } + // bestChunksSet.add(bestChunk); + // add 2 cas + add2Cas(aJCas, bestChunk, normalizedDocText, conceptMentionIndex, abbreviationIndex); + } + } + // for (Chunk chunk : chunking.chunkSet()) { + // add2Cas(aJCas, chunk, normalizedDocText); + // } + } else { + for (Chunk chunk : chunking.chunkSet()) { + add2Cas(aJCas, chunk, normalizedDocText, conceptMentionIndex, abbreviationIndex); + } + } + if (checkAcronyms && !mantraMode) { + LOGGER.debug("process() - checking acronyms"); + annotateAcronymsWithFullFormEntity(aJCas, conceptMentionIndex); + } + } + + private List filterChunking(Chunking chunking) { + // ChunkingImpl newChunking = new ChunkingImpl(chunking.charSequence()); + List newChunking = new ArrayList<>(chunking.chunkSet().size()); + for (Chunk chunk : chunking.chunkSet()) { + String chunkText = chunking.charSequence().subSequence(chunk.start(), chunk.end()).toString(); + if (filterParenthesis(chunkText)) + continue; + if (filterPunctuationArtifacts(chunkText)) + continue; + if (filterStopwords(chunkText)) + continue; + newChunking.add(chunk); + } + return newChunking; + } + + private boolean filterPunctuationArtifacts(String chunkText) { + if (chunkText.startsWith("-")) + return true; + if (chunkText.endsWith("-")) + return true; + return false; + } + + private boolean filterStopwords(String chunkText) { + if (stopWords.contains(chunkText.toLowerCase())) + return true; + if (chunkText.contains(" ")) { + String[] words = chunkText.split(" "); + int stopWordCounter = 0; + for (String word : words) { + if (stopWords.contains(word.toLowerCase())) + stopWordCounter++; + } + if (Math.ceil(words.length / 2.0) <= stopWordCounter) { + LOGGER.debug("Filtering due to high stop word occurrences: {}", chunkText); + return true; + } + } + return false; + } + + /** + * checks whether a chunk (= dictionary match) is an acronym. 
If yes, checks + * whether respective full form (obtained via abbr textReference) is + * ConceptMention and has same specificType as chunk If these conditions are not + * fulfilled, no entity annotation will be made. + * + * @param abbreviationIndex + * @param conceptMentionIndex + */ + private boolean isAcronymWithSameFullFormSpecificType(JCas aJCas, Chunk chunk, NormalizedString normalizedDocText, + JCoReHashMapAnnotationIndex conceptMentionIndex, + JCoReHashMapAnnotationIndex abbreviationIndex) { + // Annotation anno; + int start; + int end; + if (provider.getNormalize()) { + try { + start = normalizedDocText.getOriginalOffset(chunk.start()); + end = normalizedDocText.getOriginalOffset(chunk.end()); + } catch (Exception e) { + System.out.println("Text: " + normalizedDocText); + System.out.println("Chunk: " + chunk); + System.out.println("Chunk end: " + chunk.end()); + System.out + .println("Normalized Text: " + normalizedDocText.string.substring(chunk.start(), chunk.end())); + throw e; + } + // anno = new Annotation(aJCas, start, end); + } else { + start = chunk.start(); + end = chunk.end(); + } + + LongOffsetIndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator(); + // Retrieves potential abbr annotation + Abbreviation abbr = abbreviationIndex.getFirst(longOffsetTermGenerator.forOffsets(start, end)); + // check whether it's an abbr + String chunktext = null; + if (LOGGER.isDebugEnabled()) + chunktext = aJCas.getDocumentText().substring(start, end); + if (abbr == null) { + LOGGER.debug("{} chunk \"{}\" is not an abbreviation\n", chunk, chunktext); + return true; + } + // checks whether respective full form is ConceptMention + AbbreviationLongform textRef = abbr.getTextReference(); + ConceptMention em = conceptMentionIndex.getFirst(textRef); + if (em == null) { + LOGGER.debug( + chunk + " chunk \"{}\" is an abbreviation but respective full \"{}\" form is no ConceptMention\n", + chunktext, textRef.getCoveredText()); + return false; + 
} + + // checks whether full form annotation matches the type to be annotated + // here + String emType = em.getClass().getCanonicalName(); + if (emType.equals(outputType)) { + LOGGER.debug(chunk + + " chunk \"{}\" is an abbreviation and respective full form \"{}\" is ConceptMention with same type as OutputType\n", + chunktext, em.getCoveredText()); + return true; + } + + LOGGER.debug(chunk + + " chunk \"{}\" is an abbreviation but respective full form \"{}\" is ConceptMention without the correct OutputType (is: {}; OutputType: {})\n", + new Object[]{chunktext, em.getCoveredText(), emType, outputType}); + return false; + } + + // ------------ INFO .......... + // String text = aJCas.getDocumentText(); + // int start = chunk.start(); + // int end = chunk.end(); + // String type = chunk.type(); + // double score = chunk.score(); + // String phrase = text.substring(start, end); + // System.out.println(" found phrase=|" + phrase + "|" + // + " start=" + start + " end=" + end + " type=" + type + // + " score=" + score); + // ------------ INFO .......... + + /** + * adds a chunk as an annotation to the CAS + * + * @param normalizedDocText + * @param abbreviationIndex + * @param conceptMentionIndex + */ + private void add2Cas(JCas aJCas, Chunk chunk, NormalizedString normalizedDocText, + JCoReHashMapAnnotationIndex conceptMentionIndex, + JCoReHashMapAnnotationIndex abbreviationIndex) throws AnalysisEngineProcessException { + // System.out.println("CHUNK: start=" + chunk.start() + " end=" + + // chunk.end()); + // if checkAcronyms, then check acronyms for compliant full forms (= + // with same specificType) + if (checkAcronyms && !isAcronymWithSameFullFormSpecificType(aJCas, chunk, normalizedDocText, + conceptMentionIndex, abbreviationIndex)) { + return; + } + + int start = provider.getNormalize() ? normalizedDocText.getOriginalOffset(chunk.start()) : chunk.start(); + int end = provider.getNormalize() ? 
normalizedDocText.getOriginalOffset(chunk.end()) : chunk.end(); + + try { + if (mantraMode) { + // the "type" string is used to transport all data needed for + // the MAN-XML format + for (String term : chunk.type().split("@@TERM@@")) { + // @@ is used to separate source, cui, type(s) and group (in + // this order!) + String[] info = term.split("@@"); + Entity newEntity = (Entity) JCoReAnnotationTools.getAnnotationByClassName(aJCas, + "de.julielab.jcore.types.mantra.Entity"); + newEntity.setBegin(start); + newEntity.setEnd(end); + newEntity.setComponentId(COMPONENT_ID); + newEntity.setConfidence(chunk.score() + ""); + + // mantra specific + newEntity.setSource(info[0]); + newEntity.setCui(info[1]); + newEntity.setSemanticType(info[2]); + newEntity.setSemanticGroup(info[3]); + + newEntity.addToIndexes(); + } + } else { + ConceptMention newEntity = (ConceptMention) JCoReAnnotationTools.getAnnotationByClassName(aJCas, + outputType); + newEntity.setBegin(start); + newEntity.setEnd(end); + + // String entityText = newEntity.getCoveredText(); + // if (stopWords.contains(entityText.toLowerCase())) + // return; + // if (entityText.contains(" ")) { + // String[] words = entityText.split(" "); + // int stopWordCounter = 0; + // for (String word : words) { + // if (stopWords.contains(word.toLowerCase())) + // stopWordCounter++; + // } + // if (words.length == stopWordCounter) + // return; + // } + + newEntity.setSpecificType(chunk.type()); + newEntity.setComponentId(COMPONENT_ID); + newEntity.setConfidence(chunk.score() + ""); + newEntity.addToIndexes(); + + conceptMentionIndex.index(newEntity); + } + } catch (Exception e) { + LOGGER.error("process() - could not generate output type: " + e.getMessage()); + e.printStackTrace(); + throw new AnalysisEngineProcessException(e); + } + } + + private void annotateAcronymsWithFullFormEntity(JCas aJCas, + JCoReHashMapAnnotationIndex conceptMentionIndex) + throws AnalysisEngineProcessException { + + JFSIndexRepository indexes = 
aJCas.getJFSIndexRepository(); + FSIterator abbrevIter = indexes.getAnnotationIndex(Abbreviation.type).iterator(); + IndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator(); + + // loop over all abbreviations + while (abbrevIter.hasNext()) { + Abbreviation abbrev = (Abbreviation) abbrevIter.next(); + AbbreviationLongform fullFormAnnotation = abbrev.getTextReference(); + LOGGER.debug("annotateAcronymsWithFullFormEntity() - checking abbreviation: " + abbrev.getCoveredText()); + ConceptMention emFullform = null;// AnnotationRetrieval.getMatchingAnnotation(aJCas, fullFormAnnotation, + // ConceptMention.class); + emFullform = conceptMentionIndex.getFirst(fullFormAnnotation); + + // The following code was once introduced for gene tagging. There, + // the acronym fullforms sometimes miss minor parts of an annotated + // gene, leading to non-annotated acronyms that would have been + // correct. + // However, for general-purpose concept recognition this approach + // can be quite harmful. Example: "Anaphase-promoting complex (APC)" + // where only "anaphase" is recognized as concept. Now, "APC" would + // be annotated as an acronym for "anaphase". Here, a better + // recognition of the abbreviation span is required. + // ConceptMention emFullform = null; + // List conceptsInFullform = + // JCoReAnnotationTools.getIncludedAnnotations(aJCas, + // fullFormAnnotation, + // ConceptMention.class); + // if (conceptsInFullform.size() == 1) { + // emFullform = conceptsInFullform.get(0); + // LOGGER.debug("Found a single ConceptMention included in the full + // form: {}", emFullform.getCoveredText()); + // } else if (conceptsInFullform.size() > 1) { + // // If there are multiple ConceptMentions found in the full form, + // take that largest right-most candidate. 
+ // int maxSize = -1; + // for (ConceptMention em : conceptsInFullform) { + // int emSize = em.getEnd() - em.getBegin(); + // if (emSize > maxSize) { + // emFullform = em; + // maxSize = emSize; + // } + // } + // LOGGER.debug("Found multiple ConceptMentions included in the full + // form \"{}\", returning the longest.", + // fullFormAnnotation.getCoveredText()); + // if (LOGGER.isTraceEnabled()) { + // LOGGER.trace("All found ConceptMentions:"); + // for (ConceptMention cm : conceptsInFullform) { + // LOGGER.trace("Text: {}; offsets: {}-{}", + // new Object[] { cm.getCoveredText(), cm.getBegin(), cm.getEnd() + // }); + // } + // } + // } else { + // LOGGER.debug("No ConceptMention in the span of acronym fullform + // \"{}\" found.", + // fullFormAnnotation.getCoveredText()); + // } + + String type = null; + if (emFullform != null) + type = emFullform.getClass().getCanonicalName(); + + ConceptMention emAcronym = null;// AnnotationRetrieval.getMatchingAnnotation(aJCas, abbrev, + // ConceptMention.class); + emAcronym = conceptMentionIndex.getFirst(abbrev); + // This is really slow, really a pain with full texts. + // It was originally introduced to push recall for gene recognition. + // So now we will lose (a bit) of recognition performance there. 
+ // ConceptMention emAcronym = + // JCoReAnnotationTools.getPartiallyOverlappingAnnotation(aJCas, + // abbrev, + // ConceptMention.class); + + // if type of the entity is equal to the output type for this + // annotator + if (type != null && type.equals(outputType)) { + if (emFullform == null) { + LOGGER.debug( + "annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no ConceptMention\n"); + continue; + } + if (emFullform.getComponentId() != null && emFullform.getComponentId().equals(COMPONENT_ID) + && (emAcronym == null + || !emAcronym.getClass().getName().equals(emFullform.getClass().getName()))) { + + try { + LOGGER.debug("annotateAcronymsWithFullFormEntity() - fullform of abbreviation (" + + abbrev.getCoveredText() + " [begin=" + abbrev.getBegin() + "; end=" + abbrev.getEnd() + + "]) has ConceptMention: " + emFullform.toString()); + ConceptMention newEntityOnAcronym = (ConceptMention) JCoReAnnotationTools + .getAnnotationByClassName(aJCas, outputType); + newEntityOnAcronym.setBegin(abbrev.getBegin()); + newEntityOnAcronym.setEnd(abbrev.getEnd()); + newEntityOnAcronym.setTextualRepresentation(newEntityOnAcronym.getCoveredText()); + newEntityOnAcronym.setSpecificType(emFullform.getSpecificType()); + newEntityOnAcronym.setComponentId(COMPONENT_ID + "+acronym"); + newEntityOnAcronym.setConfidence(emFullform.getConfidence() + ""); + newEntityOnAcronym.addToIndexes(); + + } catch (Exception e) { + LOGGER.error("process() - could not generate output type: " + e.getMessage()); + e.printStackTrace(); + throw new AnalysisEngineProcessException(AnalysisEngineProcessException.ANNOTATOR_EXCEPTION, + null); + } + + } else { + if (emAcronym == null) + LOGGER.debug("annotateAcronymsWithFullFormEntity() - emAcronym != null"); + else if (emAcronym.getClass().getName().equals(emFullform.getClass().getName())) + LOGGER.debug("annotateAcronymsWithFullFormEntity() - emAcroType=" + + emAcronym.getClass().getCanonicalName() + " == emFullformType=" + + 
emFullform.getClass().getCanonicalName()); + } + + } + } + } + + enum ParenthesisType { + ROUND, BRACKET, CURLY, NONE + } } diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java index 2cffe9bde..e51c41eb9 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java @@ -1,4 +1,3 @@ - package de.julielab.jcore.ae.lingpipegazetteer.utils; import com.aliasi.tokenizer.Tokenizer; @@ -9,205 +8,207 @@ public class StringNormalizerForChunking { - public enum Mode { - /** - * Punctuation characters are deleted completely, shrinking the string. - */ - DELETE, - /** Punctuation characters are replaced by white spaces. 
*/ - REPLACE - } - - private static Set charsToDelete = new HashSet<>(); - static { - charsToDelete.add('-'); - charsToDelete.add('+'); - charsToDelete.add(','); - charsToDelete.add('.'); - charsToDelete.add(':'); - charsToDelete.add(';'); - charsToDelete.add('?'); - charsToDelete.add('!'); - charsToDelete.add('*'); - charsToDelete.add('§'); - charsToDelete.add('$'); - charsToDelete.add('%'); - charsToDelete.add('&'); - charsToDelete.add('/'); - charsToDelete.add('\\'); - charsToDelete.add('('); - charsToDelete.add(')'); - charsToDelete.add('<'); - charsToDelete.add('>'); - charsToDelete.add('['); - charsToDelete.add(']'); - charsToDelete.add('='); - charsToDelete.add('\''); - charsToDelete.add('`'); - charsToDelete.add('´'); - charsToDelete.add('"'); - charsToDelete.add('#'); - } - - public static class NormalizedString { - public String string; - private Map offsetMap = new HashMap<>(); - - public Map getOffsetMap() { - return offsetMap; - } - - private TreeSet normalizedOffsetSet; - - public Integer getOriginalOffset(int normalizedOffset) { - Integer originalOffset = offsetMap.get(normalizedOffset); - if (originalOffset == null) { - originalOffset = deriveOriginalOffset(normalizedOffset); - offsetMap.put(normalizedOffset, originalOffset); - } - return originalOffset; - } - - private Integer deriveOriginalOffset(int normalizedOffset) { - if (normalizedOffsetSet == null) - normalizedOffsetSet = new TreeSet<>(offsetMap.keySet()); - Integer previousNormalizedOffset = normalizedOffsetSet.floor(normalizedOffset); - Integer originalPreviousOffset = offsetMap.get(previousNormalizedOffset); - int offsetShift = Math.abs(originalPreviousOffset - previousNormalizedOffset); - // Typically, the normalized string will be shorter than the - // original, thus the original offset would be larger. 
- if (originalPreviousOffset > previousNormalizedOffset) - return normalizedOffset + offsetShift; - // But if, for some reason, the normalized string is longer than the - // original, we would have to subtract the difference from the - // normalized offset. - return normalizedOffset - offsetShift; - } - } - - /** - * This method was meant for text normalization by just deleting punctuation - * characters. However, the approach turned out to be suboptimal in cases - * where a dictionary entry would be "SHP-1" and the text form would be "SHP - * 1". That is, when in the text there is just a whitespace where there is a - * punctuation character in the dictionary, we won't recognize the - * dictionary entry. Thus, a different normalization was developed, namely - * in the other normalization method. It is supposed to be used together - * with an approximate chunker. - * - * @param str - * @return - */ - public static NormalizedString normalizeString(String str) { - NormalizedString ns = new NormalizedString(); - StringBuilder sb = new StringBuilder(); - int deletedChars = 0; - - for (int i = 0; i < str.length(); i++) { - char c = str.charAt(i); - if (charsToDelete.contains(c)) { - deletedChars++; - // switch (mode) { - // case REPLACE: sb.append(" "); break; - // case DELETE: deletedChars++; break; - // } - } else { - sb.append(c); - } - int newOffset = Math.max(0, i - deletedChars); - if (null == ns.offsetMap.get(newOffset)) - ns.offsetMap.put(newOffset, i); - } - ns.string = sb.toString(); - return ns; - } - - /** - * This normalization method uses a given TokenizerFactory (could also be a - * PorterStemmerTokenizerFactory for stemming) and additionally removes - * possessive 's constructions. Dashes and other punctuation is left - * untouched. By using an approximate chunker, one can also handle - * punctuation. 
- * - * @param str - * @param tokenizerFactory - * @return - */ - public static NormalizedString normalizeString(String str, TokenizerFactory tokenizerFactory, - Transliterator transliterator) { - // boolean stemming = tokenizerFactory instanceof - // PorterStemmerTokenizerFactory; - - NormalizedString ns = new NormalizedString(); - - char[] strChars = str.toCharArray(); - Tokenizer tokenizer = tokenizerFactory.tokenizer(strChars, 0, strChars.length); - StringBuilder sb = new StringBuilder(); - ArrayDeque tokenS = new ArrayDeque<>(); - Map deleteCandidateOffsetMap = new HashMap<>(); - // According to the lingpipe API documentation, one starts with the next - // whitespace. - sb.append(tokenizer.nextWhitespace()); - ns.offsetMap.put(0, 0); - String token; - while ((token = tokenizer.nextToken()) != null) { - // Handle possessive 's (like Parkinson's). It will be deleted. In - // case we have accidentally deleted some - // tokens, those are stored in the stack and their offsets are - // stored, too. In case it was an error, the - // tokens are later added again in the "else" path. 
- if (token.equals("'")) { - int newStartOffset = sb.length() + sumOfStack(tokenS); - int newEndOffset = sb.length() + sumOfStack(tokenS) + token.length(); - deleteCandidateOffsetMap.put(newStartOffset, tokenizer.lastTokenStartPosition()); - deleteCandidateOffsetMap.put(newEndOffset, tokenizer.lastTokenEndPosition()); - tokenS.push(token + tokenizer.nextWhitespace()); - } else if (token.equals("s") && tokenS.size() == 1) { - int newStartOffset = sb.length() + sumOfStack(tokenS); - int newEndOffset = sb.length() + sumOfStack(tokenS) + token.length(); - deleteCandidateOffsetMap.put(newStartOffset, tokenizer.lastTokenStartPosition()); - deleteCandidateOffsetMap.put(newEndOffset, tokenizer.lastTokenEndPosition()); - tokenS.push(token); - String ws = tokenizer.nextWhitespace(); - if (ws.length() > 0) { - sb.append(ws); - tokenS.clear(); - deleteCandidateOffsetMap.clear(); - } - } else { - if (!tokenS.isEmpty()) { - for (String s : tokenS) { - sb.append(s); - } - tokenS.clear(); - ns.offsetMap.putAll(deleteCandidateOffsetMap); - deleteCandidateOffsetMap.clear(); - } - if (transliterator != null) - token = transliterator.transform(token); - // plural s, only when no stemming is done - // if (!stemming && token.endsWith("s")) - // token = token.substring(0, token.length() - 1); - sb.append(token); - int newStartOffset = sb.length() - token.length(); - int newEndOffset = sb.length(); - ns.offsetMap.put(newStartOffset, tokenizer.lastTokenStartPosition()); - ns.offsetMap.put(newEndOffset, tokenizer.lastTokenEndPosition()); - sb.append(tokenizer.nextWhitespace()); - } - } - ns.string = sb.toString(); - return ns; - } - - private static int sumOfStack(Deque stack) { - int sum = 0; - for (String i : stack) - sum += i.length(); - return sum; - } - - public static NormalizedString normalizeString(String str, TokenizerFactory tokenizerFactory) { - return normalizeString(str, tokenizerFactory, null); - } + private static Set charsToDelete = new HashSet<>(); + + static { + 
charsToDelete.add('-'); + charsToDelete.add('+'); + charsToDelete.add(','); + charsToDelete.add('.'); + charsToDelete.add(':'); + charsToDelete.add(';'); + charsToDelete.add('?'); + charsToDelete.add('!'); + charsToDelete.add('*'); + charsToDelete.add('§'); + charsToDelete.add('$'); + charsToDelete.add('%'); + charsToDelete.add('&'); + charsToDelete.add('/'); + charsToDelete.add('\\'); + charsToDelete.add('('); + charsToDelete.add(')'); + charsToDelete.add('<'); + charsToDelete.add('>'); + charsToDelete.add('['); + charsToDelete.add(']'); + charsToDelete.add('='); + charsToDelete.add('\''); + charsToDelete.add('`'); + charsToDelete.add('´'); + charsToDelete.add('"'); + charsToDelete.add('#'); + } + + /** + * This method was meant for text normalization by just deleting punctuation + * characters. However, the approach turned out to be suboptimal in cases + * where a dictionary entry would be "SHP-1" and the text form would be "SHP + * 1". That is, when in the text there is just a whitespace where there is a + * punctuation character in the dictionary, we won't recognize the + * dictionary entry. Thus, a different normalization was developed, namely + * in the other normalization method. It is supposed to be used together + * with an approximate chunker. 
+ * + * @param str + * @return + */ + public static NormalizedString normalizeString(String str) { + NormalizedString ns = new NormalizedString(); + StringBuilder sb = new StringBuilder(); + int deletedChars = 0; + + for (int i = 0; i < str.length(); i++) { + char c = str.charAt(i); + if (charsToDelete.contains(c)) { + deletedChars++; + // switch (mode) { + // case REPLACE: sb.append(" "); break; + // case DELETE: deletedChars++; break; + // } + } else { + sb.append(c); + } + int newOffset = Math.max(0, i - deletedChars); + if (null == ns.offsetMap.get(newOffset)) + ns.offsetMap.put(newOffset, i); + } + ns.string = sb.toString(); + return ns; + } + + /** + * This normalization method uses a given TokenizerFactory (could also be a + * PorterStemmerTokenizerFactory for stemming) and additionally removes + * possessive 's constructions. Dashes and other punctuation is left + * untouched. By using an approximate chunker, one can also handle + * punctuation. + * + * @param str + * @param tokenizerFactory + * @return + */ + public static NormalizedString normalizeString(String str, TokenizerFactory tokenizerFactory, + Transliterator transliterator) { + // boolean stemming = tokenizerFactory instanceof + // PorterStemmerTokenizerFactory; + + NormalizedString ns = new NormalizedString(); + + char[] strChars = str.toCharArray(); + Tokenizer tokenizer = tokenizerFactory.tokenizer(strChars, 0, strChars.length); + StringBuilder sb = new StringBuilder(); + ArrayDeque tokenS = new ArrayDeque<>(); + Map deleteCandidateOffsetMap = new HashMap<>(); + // According to the lingpipe API documentation, one starts with the next + // whitespace. + sb.append(tokenizer.nextWhitespace()); + ns.offsetMap.put(0, 0); + String token; + while ((token = tokenizer.nextToken()) != null) { + // Handle possessive 's (like Parkinson's). It will be deleted. In + // case we have accidentally deleted some + // tokens, those are stored in the stack and their offsets are + // stored, too. 
In case it was an error, the + // tokens are later added again in the "else" path. + if (token.equals("'")) { + int newStartOffset = sb.length() + sumOfStack(tokenS); + int newEndOffset = sb.length() + sumOfStack(tokenS) + token.length(); + deleteCandidateOffsetMap.put(newStartOffset, tokenizer.lastTokenStartPosition()); + deleteCandidateOffsetMap.put(newEndOffset, tokenizer.lastTokenEndPosition()); + tokenS.push(token + tokenizer.nextWhitespace()); + } else if (token.equals("s") && tokenS.size() == 1) { + int newStartOffset = sb.length() + sumOfStack(tokenS); + int newEndOffset = sb.length() + sumOfStack(tokenS) + token.length(); + deleteCandidateOffsetMap.put(newStartOffset, tokenizer.lastTokenStartPosition()); + deleteCandidateOffsetMap.put(newEndOffset, tokenizer.lastTokenEndPosition()); + tokenS.push(token); + String ws = tokenizer.nextWhitespace(); + if (ws.length() > 0) { + sb.append(ws); + tokenS.clear(); + deleteCandidateOffsetMap.clear(); + } + } else { + if (!tokenS.isEmpty()) { + for (String s : tokenS) { + sb.append(s); + } + tokenS.clear(); + ns.offsetMap.putAll(deleteCandidateOffsetMap); + deleteCandidateOffsetMap.clear(); + } + if (transliterator != null) + token = transliterator.transform(token); + // plural s, only when no stemming is done + // if (!stemming && token.endsWith("s")) + // token = token.substring(0, token.length() - 1); + sb.append(token); + int newStartOffset = sb.length() - token.length(); + int newEndOffset = sb.length(); + ns.offsetMap.put(newStartOffset, tokenizer.lastTokenStartPosition()); + ns.offsetMap.put(newEndOffset, tokenizer.lastTokenEndPosition()); + sb.append(tokenizer.nextWhitespace()); + } + } + ns.string = sb.toString(); + return ns; + } + + private static int sumOfStack(Deque stack) { + int sum = 0; + for (String i : stack) + sum += i.length(); + return sum; + } + + public static NormalizedString normalizeString(String str, TokenizerFactory tokenizerFactory) { + return normalizeString(str, tokenizerFactory, null); 
+ } + + public enum Mode { + /** + * Punctuation characters are deleted completely, shrinking the string. + */ + DELETE, + /** + * Punctuation characters are replaced by white spaces. + */ + REPLACE + } + + public static class NormalizedString { + public String string; + private Map offsetMap = new HashMap<>(); + private TreeSet normalizedOffsetSet; + + public Map getOffsetMap() { + return offsetMap; + } + + public Integer getOriginalOffset(int normalizedOffset) { + Integer originalOffset = offsetMap.get(normalizedOffset); + if (originalOffset == null) { + originalOffset = deriveOriginalOffset(normalizedOffset); + offsetMap.put(normalizedOffset, originalOffset); + } + return originalOffset; + } + + private Integer deriveOriginalOffset(int normalizedOffset) { + if (normalizedOffsetSet == null) + normalizedOffsetSet = new TreeSet<>(offsetMap.keySet()); + Integer previousNormalizedOffset = normalizedOffsetSet.floor(normalizedOffset); + Integer originalPreviousOffset = offsetMap.get(previousNormalizedOffset); + int offsetShift = Math.abs(originalPreviousOffset - previousNormalizedOffset); + // Typically, the normalized string will be shorter than the + // original, thus the original offset would be larger. + if (originalPreviousOffset > previousNormalizedOffset) + return normalizedOffset + offsetShift; + // But if, for some reason, the normalized string is longer than the + // original, we would have to subtract the difference from the + // normalized offset. 
+ return normalizedOffset - offsetShift; + } + } } diff --git a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/StringNormalizerForChunkingTest.java b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/StringNormalizerForChunkingTest.java index fe1ac16a0..fef412a2e 100644 --- a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/StringNormalizerForChunkingTest.java +++ b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/StringNormalizerForChunkingTest.java @@ -49,22 +49,22 @@ public void testNormalizedOffsets() { text = "-aa :+bb"; // Outcome: "aabb"; ns = StringNormalizerForChunking.normalizeString(text); - assertEquals("The original offset is computed wrong", new Integer(0), ns.getOriginalOffset(0)); - assertEquals("The original offset is computed wrong", new Integer(2), ns.getOriginalOffset(1)); - assertEquals("The original offset is computed wrong", new Integer(3), ns.getOriginalOffset(2)); - assertEquals("The original offset is computed wrong", new Integer(6), ns.getOriginalOffset(3)); - assertEquals("The original offset is computed wrong", new Integer(7), ns.getOriginalOffset(4)); + assertEquals("The original offset is computed wrong", Integer.valueOf(0), ns.getOriginalOffset(0)); + assertEquals("The original offset is computed wrong", Integer.valueOf(2), ns.getOriginalOffset(1)); + assertEquals("The original offset is computed wrong", Integer.valueOf(3), ns.getOriginalOffset(2)); + assertEquals("The original offset is computed wrong", Integer.valueOf(6), ns.getOriginalOffset(3)); + assertEquals("The original offset is computed wrong", Integer.valueOf(7), ns.getOriginalOffset(4)); assertNull("There are more offset mappings than should be", ns.getOffsetMap().get(5)); text = "((2-n-butyl-6,7-dichloro-2-cyclopentyl-2,3-dihydro-1-oxo-1H-inden-5-yl)oxy)acetic acid"; // Outcome: // 
"2nbutyl67dichloro2cyclopentyl23dihydro1oxo1Hinden5yloxyacetic acid"; ns = StringNormalizerForChunking.normalizeString(text); - assertEquals("The original offset is computed wrong", new Integer(0), ns.getOriginalOffset(0)); - assertEquals("The original offset is computed wrong", new Integer(4), ns.getOriginalOffset(1)); - assertEquals("The original offset is computed wrong", new Integer(6), ns.getOriginalOffset(2)); - assertEquals("The original offset is computed wrong", new Integer(16), ns.getOriginalOffset(9)); - assertEquals("The original offset is computed wrong", new Integer(82), ns.getOriginalOffset(62)); + assertEquals("The original offset is computed wrong", Integer.valueOf(0), ns.getOriginalOffset(0)); + assertEquals("The original offset is computed wrong", Integer.valueOf(4), ns.getOriginalOffset(1)); + assertEquals("The original offset is computed wrong", Integer.valueOf(6), ns.getOriginalOffset(2)); + assertEquals("The original offset is computed wrong", Integer.valueOf(16), ns.getOriginalOffset(9)); + assertEquals("The original offset is computed wrong", Integer.valueOf(82), ns.getOriginalOffset(62)); assertNull("There are more offset mappings than should be", ns.getOffsetMap().get(66)); } @@ -84,8 +84,8 @@ public void testNormalizedOffsetsTransliterate() { assertEquals("Transliteration wasn't done correctly", "each node either a sensor or a beacon is noted as nodep, p ∈ 𝕊 ∪ 𝔹, and vector vp is used to represent the coordinate of nodep. beacons are placed onto the map with fixed coordinates vj, where j ∈ 𝔹. 
we assume that each beacon is aware of its own absolute location.", ns.string); - assertEquals(new Integer(83), ns.getOriginalOffset(82)); - assertEquals(new Integer(188), ns.getOriginalOffset(186)); + assertEquals(Integer.valueOf(83), ns.getOriginalOffset(82)); + assertEquals(Integer.valueOf(188), ns.getOriginalOffset(186)); } @Test @@ -114,29 +114,29 @@ public void testNormalizeWithTokenizer() { ns = StringNormalizerForChunking.normalizeString(str, tokenizerFactory); assertEquals("Normalization was wrong: ", "We saw Parkinson Diseas and S(H)P 1 in a sadli-form circumvent of applic.", ns.string); - assertEquals("Offset wrong: ", new Integer(0), ns.getOriginalOffset(new Integer(0))); - assertEquals("Offset wrong: ", new Integer(16), ns.getOriginalOffset(new Integer(16))); - assertEquals("Offset wrong: ", new Integer(19), ns.getOriginalOffset(new Integer(17))); - assertEquals("Offset wrong: ", new Integer(26), ns.getOriginalOffset(new Integer(23))); - assertEquals("Offset wrong: ", new Integer(49), ns.getOriginalOffset(new Integer(46))); - assertEquals("Offset wrong: ", new Integer(50), ns.getOriginalOffset(new Integer(47))); - assertEquals("Offset wrong: ", new Integer(56), ns.getOriginalOffset(new Integer(51))); + assertEquals("Offset wrong: ", Integer.valueOf(0), ns.getOriginalOffset(Integer.valueOf(0))); + assertEquals("Offset wrong: ", Integer.valueOf(16), ns.getOriginalOffset(Integer.valueOf(16))); + assertEquals("Offset wrong: ", Integer.valueOf(19), ns.getOriginalOffset(Integer.valueOf(17))); + assertEquals("Offset wrong: ", Integer.valueOf(26), ns.getOriginalOffset(Integer.valueOf(23))); + assertEquals("Offset wrong: ", Integer.valueOf(49), ns.getOriginalOffset(Integer.valueOf(46))); + assertEquals("Offset wrong: ", Integer.valueOf(50), ns.getOriginalOffset(Integer.valueOf(47))); + assertEquals("Offset wrong: ", Integer.valueOf(56), ns.getOriginalOffset(Integer.valueOf(51))); str = "We go to James' to have some coffee'ses."; ns = 
StringNormalizerForChunking.normalizeString(str, tokenizerFactory); assertEquals("Normalization was wrong: ", "We go to Jame' to have some coffe'se.", ns.string); - assertEquals("Offset wrong: ", new Integer(0), ns.getOriginalOffset(new Integer(0))); - assertEquals("Offset wrong: ", new Integer(9), ns.getOriginalOffset(new Integer(9))); - assertEquals("Offset wrong: ", new Integer(14), ns.getOriginalOffset(new Integer(13))); - assertEquals("Offset wrong: ", new Integer(35), ns.getOriginalOffset(new Integer(33))); + assertEquals("Offset wrong: ", Integer.valueOf(0), ns.getOriginalOffset(Integer.valueOf(0))); + assertEquals("Offset wrong: ", Integer.valueOf(9), ns.getOriginalOffset(Integer.valueOf(9))); + assertEquals("Offset wrong: ", Integer.valueOf(14), ns.getOriginalOffset(Integer.valueOf(13))); + assertEquals("Offset wrong: ", Integer.valueOf(35), ns.getOriginalOffset(Integer.valueOf(33))); str = "We have some 'serious things' to talk about."; ns = StringNormalizerForChunking.normalizeString(str, tokenizerFactory); assertEquals("Normalization was wrong: ", "We have some 'seriou thing' to talk about.", ns.string); - assertEquals("Offset wrong: ", new Integer(0), ns.getOriginalOffset(new Integer(0))); - assertEquals("Offset wrong: ", new Integer(12), ns.getOriginalOffset(new Integer(12))); - assertEquals("Offset wrong: ", new Integer(13), ns.getOriginalOffset(new Integer(13))); - assertEquals("Offset wrong: ", new Integer(28), ns.getOriginalOffset(new Integer(26))); - assertEquals("Offset wrong: ", new Integer(29), ns.getOriginalOffset(new Integer(27))); - assertEquals("Offset wrong: ", new Integer(30), ns.getOriginalOffset(new Integer(28))); + assertEquals("Offset wrong: ", Integer.valueOf(0), ns.getOriginalOffset(Integer.valueOf(0))); + assertEquals("Offset wrong: ", Integer.valueOf(12), ns.getOriginalOffset(Integer.valueOf(12))); + assertEquals("Offset wrong: ", Integer.valueOf(13), ns.getOriginalOffset(Integer.valueOf(13))); + assertEquals("Offset wrong: ", 
Integer.valueOf(28), ns.getOriginalOffset(Integer.valueOf(26))); + assertEquals("Offset wrong: ", Integer.valueOf(29), ns.getOriginalOffset(Integer.valueOf(27))); + assertEquals("Offset wrong: ", Integer.valueOf(30), ns.getOriginalOffset(Integer.valueOf(28))); str = "test dosing unit KLRg1 killer cell lectin like receptor G2 Parkinson's Disease"; ns = StringNormalizerForChunking.normalizeString(str, tokenizerFactory); @@ -144,6 +144,15 @@ public void testNormalizeWithTokenizer() { } + @Test + public void testNewlines() { + String str = "Clinical Features and Course of Patients with Peripheral Exudative Hemorrhagic Chorioretinopathy.\n" + + "To evaluate the clinical characteristics of patients who were followed in our clinic with the diagnosis of peripheral exudative hemorrhagic chorioretinopathy (PEHC).\n" + + "Medical records of 12 patients who were diagnosed with PEHC in İstanbul University İstanbul Faculty of Medicine, Department of Ophthalmology between July 2006 and June 2014 were reviewed retrospectively."; + NormalizedString normalizedString = StringNormalizerForChunking.normalizeString(str, new IndoEuropeanTokenizerFactory(), Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower")); + System.out.println(normalizedString.getOffsetMap()); + } + @Test @Ignore /** diff --git a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java index 2266b3e4b..612e8c094 100644 --- a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java +++ b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java @@ -18,6 +18,7 @@ import com.aliasi.chunk.Chunk; import com.aliasi.chunk.ChunkFactory; import de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProviderImplAlt; +import 
de.julielab.jcore.ae.lingpipegazetteer.chunking.ConfigurableChunkerProviderImplAlt; import de.julielab.jcore.ae.lingpipegazetteer.chunking.OverlappingChunk; import de.julielab.jcore.types.*; import junit.framework.TestCase; @@ -49,6 +50,8 @@ import java.util.List; import java.util.Set; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; public class GazetteerAnnotatorTest extends TestCase { private static final Logger LOGGER = LoggerFactory.getLogger(GazetteerAnnotatorTest.class); @@ -654,4 +657,37 @@ public void testReadCompressedDictionary() throws Exception { assertEquals(1, counter); } + @Test + public void testOffsetIssueWhenNoTransliteration() throws Exception { + ExternalResourceDescription extDesc = ExternalResourceFactory.createExternalResourceDescription( + ConfigurableChunkerProviderImplAlt.class, "file:src/test/resources/pehc.dict", ConfigurableChunkerProviderImplAlt.PARAM_CASE_SENSITIVE, false, ConfigurableChunkerProviderImplAlt.PARAM_NORMALIZE_TEXT, true, ConfigurableChunkerProviderImplAlt.PARAM_TRANSLITERATE_TEXT, false, ConfigurableChunkerProviderImplAlt.PARAM_STOPWORD_FILE, "de/julielab/jcore/ae/lingpipegazetteer/stopwords/general_english_words", ConfigurableChunkerProviderImplAlt.PARAM_USE_APPROXIMATE_MATCHING, true, ConfigurableChunkerProviderImplAlt.PARAM_MAKE_VARIANTS, false); + TypeSystemDescription tsDesc = TypeSystemDescriptionFactory + .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); + + AnalysisEngine annotator = AnalysisEngineFactory.createEngine(GazetteerAnnotator.class, tsDesc, + GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.EntityMention", + GazetteerAnnotator.CHUNKER_RESOURCE_NAME, extDesc); + + JCas jCas = annotator.newJCas(); + + jCas.setDocumentText("Clinical Features and Course of Patients with Peripheral Exudative Hemorrhagic Chorioretinopathy.\nTo evaluate the clinical characteristics of patients who were 
followed in our clinic with the diagnosis of peripheral exudative hemorrhagic chorioretinopathy (PEHC).\nMedical records of 12 patients who were diagnosed with PEHC in İstanbul University İstanbul Faculty of Medicine, Department of Ophthalmology between July 2006 and June 2014 were reviewed retrospectively.\nThis study included 21 eyes of 12 patients. Four (33.3%) of the patients were male and 8 (66.7%) were female and ages ranged between 73 and 89 years. Eight (66.7%) of the patients were referred to us with the diagnosis of choroidal mass. Unilateral involvement was found in 3 and bilateral involvement in 9 patients. Temporal quadrants were involved in all eyes. Fifteen eyes (71.4%) had subretinal hemorrhage and hemorrhagic/serous retinal pigment epithelial detachment, 11 (52.4%) had lipid exudation, 5 (23.8%) had chronic retinal pigment epithelium alterations, 2 (9.5%) had subretinal fibrosis and 1 (4.8%) had vitreous hemorrhage. PEHC lesions were accompanied by drusen in 11 eyes (52.4%), geographic atrophy in 2 eyes (9.5%), and choroidal neovascularization scar in 2 eyes (9.5%)."); + annotator.process(jCas); + + List entityStrings = new ArrayList<>(); + for (EntityMention g : jCas.getAnnotationIndex(EntityMention.type)) { + entityStrings.add(g.getCoveredText()); + } + assertThat(entityStrings).containsExactly("PEHC", "PEHC", "PEHC", "lesions"); + } + + @Test + public void testEncoding() { + String s1 = "İ"; + String s2 = "i̇"; + System.out.println(s1.getBytes(UTF_8).length); + System.out.println(s1.length()); + System.out.println(s2.getBytes(UTF_8).length); + System.out.println(s2.length()); + } + } diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/pehc.dict b/jcore-lingpipegazetteer-ae/src/test/resources/pehc.dict new file mode 100644 index 000000000..79830708e --- /dev/null +++ b/jcore-lingpipegazetteer-ae/src/test/resources/pehc.dict @@ -0,0 +1,2 @@ +PEHC Gene +lesions Gene \ No newline at end of file diff --git 
a/jcore-lingpipegazetteer-ae/src/test/resources/unused/bio_text.xmi b/jcore-lingpipegazetteer-ae/src/test/resources/unused/bio_text.xmi deleted file mode 100644 index 5e3993e5f..000000000 --- a/jcore-lingpipegazetteer-ae/src/test/resources/unused/bio_text.xmi +++ /dev/null @@ -1,3 +0,0 @@ - - diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/unused/tmp.txt b/jcore-lingpipegazetteer-ae/src/test/resources/unused/tmp.txt deleted file mode 100644 index 93e1214e3..000000000 --- a/jcore-lingpipegazetteer-ae/src/test/resources/unused/tmp.txt +++ /dev/null @@ -1,4878 +0,0 @@ -0 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... -375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 
-41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 -829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 
chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 
-14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - 
resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: -0 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... -375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 
-41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 -829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 
chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 
-14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - 
resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: 0 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... -375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 
annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 -41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an 
abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 -829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 40 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... -375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 
annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - 
specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 -41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] 
DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 -829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 
start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 
chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] 
DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES 
+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14992 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerA0 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... -375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 
annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 -41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an 
abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 -829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking 
abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention 
- sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2"0 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... -375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 
-41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 -829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 
chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 
-14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - 
resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: 0 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... -375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 
-41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 -829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 
chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 
-14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - 
resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView0 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... -375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... 
-2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 
741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of 
abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" 
- ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 -41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 
-829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator 
- 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform 
of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - 
textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: 
- mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: -0 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - tests for errors when loading and initializing dictionary... -375 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -419 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testReadDictionary() - building dictionary took: 0 secs -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -2155 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -2321 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... 
-2327 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2330 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2331 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2334 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -2344 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for EXACT matching (6 matches expected)... -14407 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14416 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14433 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14434 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14435 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14436 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14437 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14440 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14466 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14594 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14595 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14596 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@1.0 chunk is an abbreviation but respective full form is no EntityMention - -14597 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@1.0 chunk is not an abbreviation - -14605 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14608 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@1.0 chunk is an abbreviation and respective full form is EntityMention with same specificType - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 
741-787:ITIM@1.0 chunk is not an abbreviation - -14609 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@1.0 chunk is not an abbreviation - -14610 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14614 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no EntityMention - -14615 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14616 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - emAcroSpecType=KLRG2 == emFullformSpecType=KLRG2 -14617 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14626 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of 
abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14630 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14631 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14632 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14640 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" 
- ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14642 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14643 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "1.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - -14643 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - testProcess() - testing process for APPROX matching (13 matches expected)... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - initialize() - initializing GazetteerAnnotator... -14828 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -readDictionary() - adding entries from src/test/resources/general_english_words to dictionary... -14848 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -readDictionary() - adding entries from src/test/resources/dictionary.tst to dictionary... -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) 
-14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14849 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14850 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14851 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - readDictionary() - make term variants and add them to dictionary (NOTE: this may take a while if dictionary is big!) -14859 [main] INFO de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - process() - processing next document with GazetteerAnnotator... -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - all overlapping chunks: - -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 563-569 -563-568:KLRG1@10.0 start=563 end=568 score=10.0 -563-569:KLRG2@100.0 start=563 end=569 score=100.0 -563-568:KLRG2@0.0 start=563 end=568 score=0.0 -562-568:KLRG2@100.0 start=562 end=568 score=100.0 -14958 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 563 - 568: 0.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 563-568:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 679-685 -679-684:KLRG1@10.0 start=679 end=684 score=10.0 -679-684:KLRG2@0.0 start=679 end=684 score=0.0 -679-685:KLRG2@100.0 start=679 end=685 score=100.0 -14959 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 679 - 684: 0.0 -14960 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 679-684:KLRG2@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 40-46 -41-46:KLRG1@0.0 start=41 end=46 score=0.0 -40-46:KLRG1@100.0 start=40 end=46 score=100.0 -41-47:KLRG1@100.0 start=41 end=47 score=100.0 -41-46:KLRG2@10.0 start=41 end=46 score=10.0 -14960 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 41 - 46: 0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 41-46:KLRG1@0.0 chunk is an abbreviation but respective full form is no EntityMention - -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 526-561 -526-561:KLRG2@0.0 start=526 end=561 score=0.0 -14963 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 526 - 561: 0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 526-561:KLRG2@0.0 chunk is not an abbreviation - -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 741-787 -741-787:ITIM@0.0 start=741 end=787 score=0.0 -14964 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 741 - 787: 0.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 741-787:ITIM@0.0 chunk is not an abbreviation - -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 4-39 -4-39:KLRG2@10.0 start=4 end=39 score=10.0 -14968 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 4 - 39: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 4-39:KLRG2@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 829-834 -829-832:SHP-1@100.0 start=829 end=832 score=100.0 -829-834:SHP-1@10.0 start=829 end=834 score=10.0 
-829-833:SHP-1@50.0 start=829 end=833 score=50.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 829 - 834: 10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 829-834:SHP-1@10.0 chunk is not an abbreviation - -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 843-848 -843-847:SHP-1@60.0 start=843 end=847 score=60.0 -843-848:SHP-1@10.0 start=843 end=848 score=10.0 -14969 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 843 - 848: 10.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 843-848:SHP-1@10.0 chunk is not an abbreviation - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 289-294 -289-294:KLRG2@10.0 start=289 end=294 score=10.0 -289-294:KLRG1@0.0 start=289 end=294 score=0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 289 - 294: 0.0 -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 289-294:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14970 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 370-374 -370-374:KLRG2@100.0 start=370 end=374 score=100.0 -370-374:KLRG1@100.0 start=370 end=374 score=100.0 -14971 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 370 - 374: 100.0 -14973 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 370-374:KLRG2@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 818-824 -818-822:SHP-1@100.0 start=818 end=822 score=100.0 -818-824:SHP-1@100.0 start=818 end=824 score=100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 818 - 824: 100.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator 
- 818-824:SHP-1@100.0 chunk is not an abbreviation - -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - max span: 428-433 -428-433:KLRG1@0.0 start=428 end=433 score=0.0 -428-433:KLRG2@10.0 start=428 end=433 score=10.0 -14975 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - best chunk: 428 - 433: 0.0 -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - 428-433:KLRG1@0.0 chunk is an abbreviation but respective full form is EntityMention without same specificType - -14983 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14984 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=41; end=46]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform 
of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - 
textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: 
- mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 
- confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - - - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - - - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - - - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - 
resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - - nnotator" - id: 
- specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - - - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG1 [begin=289; end=294]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14985 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG1 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of 
abbreviation (KLRG1 [begin=428; end=433]) has EntityMention: EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14986 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=563; end=568]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: KLRG2 -14987 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (KLRG2 [begin=679; end=684]) has EntityMention: EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14988 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - checking abbreviation: ITIM -14989 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotator - annotateAcronymsWithFullFormEntity() - fullform of abbreviation (ITIM [begin=789; end=793]) has EntityMention: EntityMention - sofa: _InitialView - begin: 
741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - - -+++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ OUTPUTTING ENTITIES +++ - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G1 -EntityMention - sofa: _InitialView - begin: 4 - end: 39 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G1" - head: - mentionLevel: - -14990 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 41 - end: 46 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 289 - end: 294 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14991 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG -EntityMention - sofa: _InitialView - begin: 370 - end: 374 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG" - head: - mentionLevel: - -14991 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG1 -EntityMention - sofa: _InitialView - begin: 428 - end: 433 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG1" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14994 [main] DEBUG 
de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - - - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: killer cell lectin-like receptor G2 -EntityMention - sofa: _InitialView - begin: 526 - end: 561 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "killer cell lectin-like receptor G2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - 
begin: 563 - end: 568 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14992 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: KLRG2 -EntityMention - sofa: _InitialView - begin: 679 - end: 684 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "KLRG2" - ref: - resourceEntryList: - textualRepresentation: "KLRG2" - head: - mentionLevel: - -14993 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: immunoreceptor tyrosine-based inhibitory motif -EntityMention - sofa: _InitialView - begin: 741 - end: 787 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "immunoreceptor tyrosine-based inhibitory motif" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: ITIM -EntityMention - sofa: _InitialView - begin: 789 - end: 793 - confidence: "0.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "ITIM" - ref: - resourceEntryList: - textualRepresentation: "ITIM" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHIP-1 -EntityMention - sofa: _InitialView - begin: 818 - end: 824 - confidence: "100.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHIP-1" - head: - mentionLevel: - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView - begin: 829 - end: 834 - confidence: "10.0" - componentId: 
"de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - - -14994 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: SHP-2 -EntityMention - sofa: _InitialView - begin: 829 - end: 834 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "SHP-2" - head: - mentionLevel: - -14996 [main] DEBUG de.julielab.jules.lingpipegazetteer.GazetteerAnnotatorTest - entity: sHP-1 -EntityMention - sofa: _InitialView - begin: 843 - end: 848 - confidence: "10.0" - componentId: "de.julielab.jules.lingpipegazetteer.GazetteerAnnotator" - id: - specificType: "SHP-1" - ref: - resourceEntryList: - textualRepresentation: "sHP-1" - head: - mentionLevel: - From 281dc8ba60844d7a67884e98c9bee1590cc5ceb3 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 5 Aug 2020 11:42:15 +0200 Subject: [PATCH 018/269] Removing debug output. 
--- .../lingpipegazetteer/uima/GazetteerAnnotator.java | 2 -- .../uima/GazetteerAnnotatorTest.java | 14 +++++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java index dd0c68c20..1a9220007 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java @@ -359,8 +359,6 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { if (provider.getNormalize()) { normalizedDocText = StringNormalizerForChunking.normalizeString(docText, normalizationTokenFactory, transliterator); - System.out.println(normalizedDocText.getOffsetMap()); - System.out.println(normalizedDocText.string); } IndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator(); diff --git a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java index 612e8c094..7134ae3e7 100644 --- a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java +++ b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java @@ -241,7 +241,7 @@ public void testProcessWithNormalizationAndApproximateMatching() throws Exceptio TypeSystemDescription tsDesc = TypeSystemDescriptionFactory .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); - AnalysisEngine gazetteerAnnotator = AnalysisEngineFactory.createPrimitive(GazetteerAnnotator.class, tsDesc, + AnalysisEngine 
gazetteerAnnotator = AnalysisEngineFactory.createEngine(GazetteerAnnotator.class, tsDesc, GazetteerAnnotator.PARAM_CHECK_ACRONYMS, false, GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.OntClassMention", GazetteerAnnotator.CHUNKER_RESOURCE_NAME, extDesc); @@ -285,7 +285,7 @@ public void testAnnotatorWithTextNormalization() TypeSystemDescription tsDesc = TypeSystemDescriptionFactory .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); - AnalysisEngine annotator = AnalysisEngineFactory.createPrimitive(GazetteerAnnotator.class, tsDesc, + AnalysisEngine annotator = AnalysisEngineFactory.createEngine(GazetteerAnnotator.class, tsDesc, GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.EntityMention", GazetteerAnnotator.CHUNKER_RESOURCE_NAME, extDesc); JCas jCas = annotator.newJCas(); @@ -366,7 +366,7 @@ public void testAnnotateAcronymsWithFullFormEntity() throws Exception { TypeSystemDescription tsDesc = TypeSystemDescriptionFactory .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); - AnalysisEngine annotator = AnalysisEngineFactory.createPrimitive(GazetteerAnnotator.class, tsDesc, + AnalysisEngine annotator = AnalysisEngineFactory.createEngine(GazetteerAnnotator.class, tsDesc, GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.EntityMention", GazetteerAnnotator.CHUNKER_RESOURCE_NAME, extDesc); JCas jCas = annotator.newJCas(); @@ -440,7 +440,7 @@ public void testAnnotatorWithTextNormalizationMuh() TypeSystemDescription tsDesc = TypeSystemDescriptionFactory .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); - AnalysisEngine annotator = AnalysisEngineFactory.createPrimitive(GazetteerAnnotator.class, tsDesc, + AnalysisEngine annotator = AnalysisEngineFactory.createEngine(GazetteerAnnotator.class, tsDesc, GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.EntityMention", GazetteerAnnotator.CHUNKER_RESOURCE_NAME, 
extDesc); JCas jCas = annotator.newJCas(); @@ -462,7 +462,7 @@ public void testSontesthalt() throws Exception { TypeSystemDescription tsDesc = TypeSystemDescriptionFactory .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); - AnalysisEngine annotator = AnalysisEngineFactory.createPrimitive(GazetteerAnnotator.class, tsDesc, + AnalysisEngine annotator = AnalysisEngineFactory.createEngine(GazetteerAnnotator.class, tsDesc, GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.EntityMention", GazetteerAnnotator.CHUNKER_RESOURCE_NAME, extDesc); @@ -516,7 +516,7 @@ public void testApproximate() throws Exception { TypeSystemDescription tsDesc = TypeSystemDescriptionFactory .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); - AnalysisEngine annotator = AnalysisEngineFactory.createPrimitive(GazetteerAnnotator.class, tsDesc, + AnalysisEngine annotator = AnalysisEngineFactory.createEngine(GazetteerAnnotator.class, tsDesc, GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.EntityMention", GazetteerAnnotator.CHUNKER_RESOURCE_NAME, extDesc); @@ -638,7 +638,7 @@ public void testReadCompressedDictionary() throws Exception { TypeSystemDescription tsDesc = TypeSystemDescriptionFactory .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); - AnalysisEngine annotator = AnalysisEngineFactory.createPrimitive(GazetteerAnnotator.class, tsDesc, + AnalysisEngine annotator = AnalysisEngineFactory.createEngine(GazetteerAnnotator.class, tsDesc, GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.EntityMention", GazetteerAnnotator.CHUNKER_RESOURCE_NAME, extDesc); From c1182553fd403e03ffcf95694dc246393dae3395 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 14 Aug 2020 09:45:29 +0200 Subject: [PATCH 019/269] XMI Reader/Multiplier: Removed the jcore-all-types type system from the descriptors. 
The completely inclusion of all types caused problems when other type systems should be included as well. This change could introduce issues at other places where now types are missing. If such cases appear we will need to introduce a new way to add type systems, for example with a no-op component that only exist for type system imports. --- jcore-xmi-db-reader/README.md | 4 +++- .../jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml | 2 +- .../de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-reader.xml | 2 +- jedis-parent/pom.xml | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/jcore-xmi-db-reader/README.md b/jcore-xmi-db-reader/README.md index d587fa8b1..af691dee8 100644 --- a/jcore-xmi-db-reader/README.md +++ b/jcore-xmi-db-reader/README.md @@ -1,8 +1,10 @@ # JCoRe XMI Database Reader -**Descriptor Path**: +**Descriptor Paths**: ``` de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-reader +de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-multiplier-reader +de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-multiplier ``` ### Objective diff --git a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml index 081c3d6a8..992ed962a 100644 --- a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml +++ b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml @@ -29,7 +29,7 @@ - + diff --git a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-reader.xml b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-reader.xml index dd703d3d1..fb634e618 100644 --- a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-reader.xml +++ b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-reader.xml 
@@ -169,7 +169,7 @@ - + diff --git a/jedis-parent/pom.xml b/jedis-parent/pom.xml index f56a81be0..794ee5c9c 100644 --- a/jedis-parent/pom.xml +++ b/jedis-parent/pom.xml @@ -27,7 +27,7 @@ de.julielab jcore-xmi-splitter - 2.3.4 + 2.3.5-SNAPSHOT From d983891d47dd6b1623dc328221bb6859ddd2f10f Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 14 Aug 2020 09:45:56 +0200 Subject: [PATCH 020/269] DBCheckpointAE: Correcting descriptors paths in the README.md file. --- jcore-db-checkpoint-ae/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jcore-db-checkpoint-ae/README.md b/jcore-db-checkpoint-ae/README.md index 6a4ed4f4b..a74f91d53 100644 --- a/jcore-db-checkpoint-ae/README.md +++ b/jcore-db-checkpoint-ae/README.md @@ -2,7 +2,8 @@ **Descriptor Path**: ``` -de.julielab.desc.jcore-db-checkpoint-ae +de.julielab.jcore.ae.checkpoint.desc.jcore-db-checkpoint-ae +de.julielab.jcore.ae.checkpoint.desc.jcore-db-checkpoint-consumer ``` This is a JeDiS[1] component. It can be used to set the 'last component' column in a subset table. This help to keep track of the pipeline status. From 140d91d185387e96383be0fc143dc8c0df2c44d0 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 18 Aug 2020 08:22:20 +0200 Subject: [PATCH 021/269] Neo4jRelationsConsumer: Not sending empty relation documents. 
--- .../sharedresources/AbstractMapProvider.java | 2 +- .../Neo4jRelationsConsumer.java | 74 ++++++++++--------- 2 files changed, 39 insertions(+), 37 deletions(-) diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AbstractMapProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AbstractMapProvider.java index 6491627cf..fdc15aaa1 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AbstractMapProvider.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AbstractMapProvider.java @@ -49,7 +49,7 @@ public void load(DataResource aData) throws ResourceInitializationException { map.put(getKey(split[0]), getValue(split[1])); } log.info("Finished reading resource {}", aData.getUri()); - log.info("Copying {} values into a fresh HashMap of the exactly correct size", map.size()); + log.info("Copying {} values into a fresh HashMap of the exact correct size", map.size()); HashMap tmp = new HashMap<>(map.size(), 1f); tmp.putAll(map); map = tmp; diff --git a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java index 7ff69f9f8..0a1aaafff 100644 --- a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java +++ b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java @@ -157,45 +157,47 @@ public void collectionProcessComplete() throws AnalysisEngineProcessException { private void sendRelationsToNeo4j() throws AnalysisEngineProcessException { try { - URL url = URI.create(this.url).toURL(); - HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); - 
urlConnection.addRequestProperty("Content-Type", "application/json"); - String authorizationToken = neo4jUser != null && neo4jPassword != null - ? "Basic " + Base64.encodeBase64URLSafeString((neo4jUser + ":" + neo4jPassword).getBytes()) - : null; - if (authorizationToken != null) - urlConnection.setRequestProperty("Authorization", authorizationToken); - urlConnection.setRequestMethod(HttpMethod.POST); - urlConnection.setDoOutput(true); - try (OutputStream outputStream = urlConnection.getOutputStream()) { - JsonFactory jf = new JsonFactory(om); - JsonGenerator g = jf.createGenerator(outputStream); - g.writeStartObject(); - g.writeObjectField(ImportIERelations.NAME_ID_PROPERTY, idProperty); - g.writeObjectField(ImportIERelations.NAME_ID_SOURCE, globalSource); - - List documents = importIERelations.getDocuments(); - g.writeFieldName(ImportIERelations.NAME_DOCUMENTS); - g.writeStartArray(); - log.debug("Converting {} relation documents to JSON.", documents.size()); - for (ImportIERelationDocument document : (Iterable) documents::iterator) { - g.writeObject(document); + if (!importIERelations.getDocuments().isEmpty()) { + URL url = URI.create(this.url).toURL(); + HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); + urlConnection.addRequestProperty("Content-Type", "application/json"); + String authorizationToken = neo4jUser != null && neo4jPassword != null + ? 
"Basic " + Base64.encodeBase64URLSafeString((neo4jUser + ":" + neo4jPassword).getBytes()) + : null; + if (authorizationToken != null) + urlConnection.setRequestProperty("Authorization", authorizationToken); + urlConnection.setRequestMethod(HttpMethod.POST); + urlConnection.setDoOutput(true); + try (OutputStream outputStream = urlConnection.getOutputStream()) { + JsonFactory jf = new JsonFactory(om); + JsonGenerator g = jf.createGenerator(outputStream); + g.writeStartObject(); + g.writeObjectField(ImportIERelations.NAME_ID_PROPERTY, idProperty); + g.writeObjectField(ImportIERelations.NAME_ID_SOURCE, globalSource); + + List documents = importIERelations.getDocuments(); + g.writeFieldName(ImportIERelations.NAME_DOCUMENTS); + g.writeStartArray(); + log.debug("Converting {} relation documents to JSON.", documents.size()); + for (ImportIERelationDocument document : (Iterable) documents::iterator) { + g.writeObject(document); + } + g.writeEndArray(); + g.writeEndObject(); + g.close(); } - g.writeEndArray(); - g.writeEndObject(); - g.close(); - } - try (InputStream inputStream = urlConnection.getInputStream()) { - log.debug("Response from Neo4j: {}", IOUtils.toString(inputStream, UTF_8)); - } catch (IOException e) { - log.error("Exception occurred while sending relation data to Neo4j server."); - try (InputStream inputStream = urlConnection.getErrorStream()) { - if (inputStream != null) - log.error("Error from Neo4j: {}", IOUtils.toString(inputStream, UTF_8)); + try (InputStream inputStream = urlConnection.getInputStream()) { + log.debug("Response from Neo4j: {}", IOUtils.toString(inputStream, UTF_8)); + } catch (IOException e) { + log.error("Exception occurred while sending relation data to Neo4j server."); + try (InputStream inputStream = urlConnection.getErrorStream()) { + if (inputStream != null) + log.error("Error from Neo4j: {}", IOUtils.toString(inputStream, UTF_8)); + } + throw e; } - throw e; + importIERelations.clear(); } - importIERelations.clear(); 
log.debug("Releasing {} document IDs that have successfully been sent to Neo4j", documentIds.size()); DocumentReleaseCheckpoint.get().release(Neo4jRelationsConsumer.class.getCanonicalName(), documentIds.stream()); documentIds.clear(); From 23265383355a0fde824ba41ac733f7115d052863 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 8 Sep 2020 14:05:01 +0200 Subject: [PATCH 022/269] Adding a error log message to the FlairNerAnnotator. --- .../de/julielab/jcore/ae/flairner/FlairNerAnnotator.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java index 4aea01797..f09332fd0 100644 --- a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java +++ b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java @@ -140,7 +140,12 @@ public void process(final JCas aJCas) throws AnalysisEngineProcessException { for (TaggedEntity entity : taggedEntities) { final Sentence sentence = sentenceMap.get(entity.getDocumentId()); EntityMention em = (EntityMention) JCoReAnnotationTools.getAnnotationByClassName(aJCas, entityClass); - helper.setAnnotationOffsetsRelativeToSentence(sentence, em, entity, adderConfig); + try { + helper.setAnnotationOffsetsRelativeToSentence(sentence, em, entity, adderConfig); + } catch (AnnotationOffsetException e) { + log.error("Cannot add entity {} to sentence: {}", entity, sentence.getCoveredText()); + throw e; + } em.setSpecificType(entity.getTag()); em.setConfidence(String.valueOf(entity.getLabelConfidence())); em.setComponentId(componentId); From e89dae61eb1674bbfc8c8a85fd9ace67fce1c3db Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 8 Sep 2020 14:36:37 +0200 Subject: [PATCH 023/269] Flair NER AE: Token offset issue fix with flair 0.6. 
We now explicitly use the SpaceTokenizer when creating a flair Sentence to make sure that the given tokenization is employed. --- .../jcore/ae/annotationadder/AnnotationAdderHelper.java | 8 +++++++- .../de/julielab/jcore/ae/flairner/FlairNerAnnotator.java | 7 +------ .../de/julielab/jcore/ae/flairner/python/nerScript.py | 6 ++++-- .../julielab/jcore/ae/flairner/FlairNerAnnotatorTest.java | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelper.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelper.java index 831ecb280..97a2d8447 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelper.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelper.java @@ -8,6 +8,8 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.HashMap; @@ -15,11 +17,13 @@ import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Collectors; /** * Caches information for the current document. */ public class AnnotationAdderHelper { + private final static Logger log = LoggerFactory.getLogger(AnnotationAdderHelper.class); // Required for token-offsets private List tokenList; private Map> tokensBySentences; @@ -68,8 +72,10 @@ public void setAnnotationOffsetsRelativeToSentence(Sentence sentence, Annotation List tokenList = tokensBySentences.get(sentence); int startTokenNum = a.getStart(); int endTokenNum = a.getEnd(); - if (startTokenNum < 1 || startTokenNum > tokenList.size()) + if (startTokenNum < 1 || startTokenNum > tokenList.size()) { + log.error("Cannot create entity because of a token offset mismatch. 
The entity should tart at token {} and end at {}. But there are only {} tokens available: {}", startTokenNum, endTokenNum, tokenList.size(), tokenList.stream().map(Annotation::getCoveredText).collect(Collectors.joining(" "))); throw new AnnotationOffsetException("The current annotation to add to the CAS starts at token " + startTokenNum + " which does not fit to the range of tokens in the sentence with ID " + sentence.getId() + " which is 1 - " + tokenList.size()); + } if (endTokenNum < 1 || endTokenNum > tokenList.size()) throw new AnnotationOffsetException("The current annotation to add to the CAS ends at token " + endTokenNum + " which does not fit to the range of tokens in the sentence with ID " + sentence.getId() + " which is 1 - " + tokenList.size()); if (endTokenNum < startTokenNum) diff --git a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java index f09332fd0..4aea01797 100644 --- a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java +++ b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java @@ -140,12 +140,7 @@ public void process(final JCas aJCas) throws AnalysisEngineProcessException { for (TaggedEntity entity : taggedEntities) { final Sentence sentence = sentenceMap.get(entity.getDocumentId()); EntityMention em = (EntityMention) JCoReAnnotationTools.getAnnotationByClassName(aJCas, entityClass); - try { - helper.setAnnotationOffsetsRelativeToSentence(sentence, em, entity, adderConfig); - } catch (AnnotationOffsetException e) { - log.error("Cannot add entity {} to sentence: {}", entity, sentence.getCoveredText()); - throw e; - } + helper.setAnnotationOffsetsRelativeToSentence(sentence, em, entity, adderConfig); em.setSpecificType(entity.getTag()); em.setConfidence(String.valueOf(entity.getLabelConfidence())); em.setComponentId(componentId); diff --git 
a/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/python/nerScript.py b/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/python/nerScript.py index d55859594..e405ea93b 100644 --- a/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/python/nerScript.py +++ b/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/python/nerScript.py @@ -4,6 +4,7 @@ import torch from flair.data import Sentence from flair.models import SequenceTagger +from flair.tokenization import SpaceTokenizer from struct import * @@ -43,9 +44,10 @@ def decodeString(buffer): ba = bytearray() for sentenceToTag in sentenceTaggingRequests: sid = sentenceToTag['sid'] - sentence = Sentence(sentenceToTag['text']) + # Use the SpaceTokenizer to just use the tokenization given from UIMA + sentence = Sentence(sentenceToTag['text'], use_tokenizer=SpaceTokenizer()) # NER tagging - embeddingStorageMode = "none" if sendEmbeddings == "NONE" else "cpu"; + embeddingStorageMode = "none" if sendEmbeddings == "NONE" else "cpu" tagger.predict(sentence, embedding_storage_mode = embeddingStorageMode) for e in sentence.get_spans("ner"): diff --git a/jcore-flair-ner-ae/src/test/java/de/julielab/jcore/ae/flairner/FlairNerAnnotatorTest.java b/jcore-flair-ner-ae/src/test/java/de/julielab/jcore/ae/flairner/FlairNerAnnotatorTest.java index 2317e08e9..9c5171fd6 100644 --- a/jcore-flair-ner-ae/src/test/java/de/julielab/jcore/ae/flairner/FlairNerAnnotatorTest.java +++ b/jcore-flair-ner-ae/src/test/java/de/julielab/jcore/ae/flairner/FlairNerAnnotatorTest.java @@ -240,7 +240,7 @@ public void testAnnotator2() throws Exception { } @Test - public void testAnnotatorOnOffsetIsseDocument() throws Exception { + public void testAnnotatorOnOffsetIssueDocument() throws Exception { final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", 
"de.julielab.jcore.types.extensions.jcore-document-meta-extension-types"); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt"); From c82db9a47632fb5b926cb8865ef2a93ced39a491 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 8 Sep 2020 15:09:53 +0200 Subject: [PATCH 024/269] Adapting the flair NER script to still support flair 0.4x apart from newer versions. --- .../de/julielab/jcore/ae/flairner/python/nerScript.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/python/nerScript.py b/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/python/nerScript.py index e405ea93b..f37fdab4a 100644 --- a/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/python/nerScript.py +++ b/jcore-flair-ner-ae/src/main/resources/de/julielab/jcore/ae/flairner/python/nerScript.py @@ -4,7 +4,6 @@ import torch from flair.data import Sentence from flair.models import SequenceTagger -from flair.tokenization import SpaceTokenizer from struct import * @@ -44,8 +43,14 @@ def decodeString(buffer): ba = bytearray() for sentenceToTag in sentenceTaggingRequests: sid = sentenceToTag['sid'] - # Use the SpaceTokenizer to just use the tokenization given from UIMA - sentence = Sentence(sentenceToTag['text'], use_tokenizer=SpaceTokenizer()) + # In newer flair versions we need to specify the tokenizer in order to use + # the exact input tokenization and avoid token offset mismatches + if "0.4" in flair.__version__: + sentence = Sentence(sentenceToTag['text']) + else: + from flair.tokenization import SpaceTokenizer + # Use the SpaceTokenizer to just use the tokenization given from UIMA + sentence = Sentence(sentenceToTag['text'], use_tokenizer=SpaceTokenizer()) # NER tagging 
embeddingStorageMode = "none" if sendEmbeddings == "NONE" else "cpu" tagger.predict(sentence, embedding_storage_mode = embeddingStorageMode) From 3bdb8c7f7ad9ba62c16fc8060fc640e843cb3b80 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 8 Sep 2020 15:31:20 +0200 Subject: [PATCH 025/269] Updating BioC TextMining API to v1.0.3. --- jcore-ign-reader/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-ign-reader/pom.xml b/jcore-ign-reader/pom.xml index df7d561d4..423a3fbce 100644 --- a/jcore-ign-reader/pom.xml +++ b/jcore-ign-reader/pom.xml @@ -17,7 +17,7 @@ com.pengyifan.bioc pengyifan-bioc - 1.0.2 + 1.0.3 de.julielab From 84477bf167f74f80306fd0cfe64c8eb3fdbd3221 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 14 Oct 2020 13:22:06 +0200 Subject: [PATCH 026/269] PMC Reader: Had issues with file paths containing XML-style escaped characters. Now applying XML-unescaping on the input URLs. --- jcore-flair-ner-ae/pom.xml | 2 +- jcore-pmc-reader/pom.xml | 5 +++++ .../de/julielab/jcore/reader/pmc/CasPopulator.java | 5 ++--- .../julielab/jcore/reader/pmc/NXMLURIIterator.java | 3 ++- .../jcore/reader/pmc/NXMLURIIteratorTest.java | 14 ++++++++++++++ 5 files changed, 24 insertions(+), 5 deletions(-) diff --git a/jcore-flair-ner-ae/pom.xml b/jcore-flair-ner-ae/pom.xml index 9ad39de20..f608f17a3 100644 --- a/jcore-flair-ner-ae/pom.xml +++ b/jcore-flair-ner-ae/pom.xml @@ -21,7 +21,7 @@ de.julielab java-stdio-ipc - 1.0.1 + 1.0.2 de.julielab diff --git a/jcore-pmc-reader/pom.xml b/jcore-pmc-reader/pom.xml index 976a1b456..8325af177 100644 --- a/jcore-pmc-reader/pom.xml +++ b/jcore-pmc-reader/pom.xml @@ -14,6 +14,11 @@ + + org.apache.commons + commons-text + 1.9 + org.slf4j slf4j-api diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java index 481e4db4c..ff3a1e0f0 100644 --- 
a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java @@ -29,9 +29,8 @@ public void populateCas(URI nxmlUri, JCas cas) throws ElementParsingException { nxmlDocumentParser.reset(currentUri, cas); result = nxmlDocumentParser.parse(); } catch (DocumentParsingException e) { - log.warn("Error occurred: {}. Skipping document.", e.getMessage()); - if (nxmlIterator.hasNext()) - currentUri = nxmlIterator.next(); + log.warn("Error occurred when trying to read from URI {} (ASCII string: {}): {}. Skipping document.", currentUri, currentUri.toASCIIString(), e.getMessage()); + currentUri = nxmlIterator.next(); } } StringBuilder sb = populateCas(result, new StringBuilder()); diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java index 02b5d7feb..5ef2dbe94 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java @@ -1,5 +1,6 @@ package de.julielab.jcore.reader.pmc; +import org.apache.commons.text.StringEscapeUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -94,7 +95,7 @@ private void setFilesAndSubDirectories(File directory, boolean recursiveCall) { while (entries.hasMoreElements()) { final ZipEntry e = entries.nextElement(); if (!e.isDirectory() && e.getName().contains(".nxml") && isInWhitelist(new File(e.getName()))) { - final String urlStr = "jar:" + directory.toURI().toString() + "!/" + e.getName(); + final String urlStr = StringEscapeUtils.unescapeXml("jar:" + directory.toURI().toString() + "!/" + e.getName()); URL url = new URL(urlStr); try { final URI uri = url.toURI(); diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/NXMLURIIteratorTest.java 
b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/NXMLURIIteratorTest.java index 14faf27df..df967924b 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/NXMLURIIteratorTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/NXMLURIIteratorTest.java @@ -1,16 +1,20 @@ package de.julielab.jcore.reader.pmc; +import org.apache.commons.text.StringEscapeUtils; import org.junit.Test; import java.io.File; import java.io.FileNotFoundException; +import java.net.MalformedURLException; import java.net.URI; +import java.net.URL; import java.util.Arrays; import java.util.HashSet; import java.util.Iterator; import java.util.Set; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatCode; import static org.junit.Assert.assertTrue; public class NXMLURIIteratorTest { @@ -53,4 +57,14 @@ public void testGetPmcFiles() throws Exception { assertThat(expectedFiles).containsExactlyInAnyOrder("PMC2847692.nxml.gz", "PMC2758189.nxml.gz", "PMC2970367.nxml.gz", "PMC3201365.nxml.gz", "PMC4257438.nxml.gz"); } + + @Test + public void testXmlEntities() throws MalformedURLException { + String s = "jar:file:/data/data_corpora/PMC/non_comm_use.O-Z.xml.zip!/Pädiatrische_Gastroenterologie,_Hepatologie_und_Ernährung/PMC7498810.nxml"; + s = StringEscapeUtils.unescapeXml(s); + assertThat(s).doesNotContain("ä"); + URL url = new URL(s); + assertThat(url).isNotNull(); + assertThatCode(() -> url.toURI()).doesNotThrowAnyException(); + } } From 7685201d7ca91f5143b3fa91596f565bbd9dc7d0 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 14 Oct 2020 13:40:06 +0200 Subject: [PATCH 027/269] Now only falling back to XML unescaping if the first try throw an exception. This is a safety measure to avoid other issues by always doing the unescaping. 
--- .../jcore/reader/pmc/NXMLURIIterator.java | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java index 5ef2dbe94..652e9db17 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java @@ -95,7 +95,7 @@ private void setFilesAndSubDirectories(File directory, boolean recursiveCall) { while (entries.hasMoreElements()) { final ZipEntry e = entries.nextElement(); if (!e.isDirectory() && e.getName().contains(".nxml") && isInWhitelist(new File(e.getName()))) { - final String urlStr = StringEscapeUtils.unescapeXml("jar:" + directory.toURI().toString() + "!/" + e.getName()); + final String urlStr ="jar:" + directory.toURI().toString() + "!/" + e.getName(); URL url = new URL(urlStr); try { final URI uri = url.toURI(); @@ -106,7 +106,21 @@ private void setFilesAndSubDirectories(File directory, boolean recursiveCall) { logFileSearch.error("Putting URI for URL {} into the queue was interrupted", url); throw new UncheckedPmcReaderException(e1); } catch (URISyntaxException e1) { - logFileSearch.error("Could not convert URL {} to URI.", url, e); + // This exception can happen when the path contains XML escaped characters, e.g. + // non_comm_use.O-Z.xml.zip!/Pädiatrische_Gastroenterologie,_Hepatologie_und_Ernährung/PMC7498810.nxml + // Try to unescape it. 
+ try { + url = new URL(StringEscapeUtils.unescapeXml(urlStr)); + final URI uri = url.toURI(); + logFileSearch.trace("Waiting to put URI {} into queue", uri); + uris.put(uri); + logFileSearch.trace("Successfully put URI {} into queue", uri); + } catch (URISyntaxException e2) { + logFileSearch.error("Could not convert URL {} to URI.", url, e); + } catch (InterruptedException e2) { + logFileSearch.error("Putting URI for URL {} into the queue was interrupted", url); + throw new UncheckedPmcReaderException(e2); + } } } } From d401ae110d9a1a21bf047cb6ada9ea9af535db76 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 15 Oct 2020 11:51:18 +0200 Subject: [PATCH 028/269] Now handling the issue correctly: By URLEncoding the file path. --- jcore-pmc-reader/pom.xml | 5 --- .../jcore/reader/pmc/NXMLURIIterator.java | 39 +++++++++---------- .../reader/pmc/parser/NxmlDocumentParser.java | 5 +-- .../jcore/reader/pmc/NXMLURIIteratorTest.java | 22 ++++++----- 4 files changed, 33 insertions(+), 38 deletions(-) diff --git a/jcore-pmc-reader/pom.xml b/jcore-pmc-reader/pom.xml index 8325af177..976a1b456 100644 --- a/jcore-pmc-reader/pom.xml +++ b/jcore-pmc-reader/pom.xml @@ -14,11 +14,6 @@ - - org.apache.commons - commons-text - 1.9 - org.slf4j slf4j-api diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java index 652e9db17..7aa245057 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java @@ -1,6 +1,5 @@ package de.julielab.jcore.reader.pmc; -import org.apache.commons.text.StringEscapeUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -10,15 +9,19 @@ import java.net.URI; import java.net.URISyntaxException; import java.net.URL; +import java.net.URLEncoder; import java.nio.file.Path; import java.util.*; import 
java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.CompletableFuture; +import java.util.stream.Collectors; import java.util.stream.Stream; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; +import static java.nio.charset.StandardCharsets.UTF_8; + public class NXMLURIIterator implements Iterator { private final static Logger log = LoggerFactory.getLogger(NXMLURIIterator.class); private final static Logger logFileSearch = LoggerFactory.getLogger(NXMLURIIterator.class.getCanonicalName() + ".FileSearch"); @@ -84,49 +87,45 @@ private void setFilesAndSubDirectories(File directory, boolean recursiveCall) { throw new UncheckedPmcReaderException(e); } } + // Save the subdirectories and potentially ZIP files for a recursive reading call further below Stream.of(directory.listFiles(f -> f.isDirectory())).forEach(pendingSubdirs::push); if (searchZip) Stream.of(directory.listFiles(f -> f.isFile() && isZipFile(f))).forEach(pendingSubdirs::push); + logFileSearch.trace("Added subdirectories and/or ZIP files to the list of pending directories and archives. 
There are now {} pending.", pendingSubdirs.size()); } else if (searchZip && isZipFile(directory)) { logFileSearch.debug("Identified {} as a ZIP archive, retrieving its inventory", directory); logFileSearch.debug("Searching ZIP archive {} for eligible documents", directory); try (ZipFile zf = new ZipFile(directory)) { final Enumeration entries = zf.entries(); + int numEntries = 0; while (entries.hasMoreElements()) { final ZipEntry e = entries.nextElement(); if (!e.isDirectory() && e.getName().contains(".nxml") && isInWhitelist(new File(e.getName()))) { - final String urlStr ="jar:" + directory.toURI().toString() + "!/" + e.getName(); - URL url = new URL(urlStr); + final String urlStr = "jar:" + directory.toURI().toString() + "!/" + e.getName(); + int exclamationIndex = urlStr.indexOf('!'); + final String urlEncodedStr = urlStr.substring(0, exclamationIndex + 2) + Stream.of(urlStr.substring(exclamationIndex + 2).split("/")).map(x -> URLEncoder.encode(x, UTF_8)).collect(Collectors.joining("/")); + URL url = new URL(urlEncodedStr); try { final URI uri = url.toURI(); logFileSearch.trace("Waiting to put URI {} into queue", uri); uris.put(uri); - logFileSearch.trace("Successfully put URI {} into queue", uri); + ++numEntries; + logFileSearch.trace("Successfully put URI {} into queue. Queue size: {}", uri, uris.size()); } catch (InterruptedException e1) { logFileSearch.error("Putting URI for URL {} into the queue was interrupted", url); throw new UncheckedPmcReaderException(e1); } catch (URISyntaxException e1) { - // This exception can happen when the path contains XML escaped characters, e.g. - // non_comm_use.O-Z.xml.zip!/Pädiatrische_Gastroenterologie,_Hepatologie_und_Ernährung/PMC7498810.nxml - // Try to unescape it. 
- try { - url = new URL(StringEscapeUtils.unescapeXml(urlStr)); - final URI uri = url.toURI(); - logFileSearch.trace("Waiting to put URI {} into queue", uri); - uris.put(uri); - logFileSearch.trace("Successfully put URI {} into queue", uri); - } catch (URISyntaxException e2) { - logFileSearch.error("Could not convert URL {} to URI.", url, e); - } catch (InterruptedException e2) { - logFileSearch.error("Putting URI for URL {} into the queue was interrupted", url); - throw new UncheckedPmcReaderException(e2); - } + logFileSearch.error("Could not convert URL {} to URI.", url, e); + throw new UncheckedPmcReaderException(e1); } } } + logFileSearch.trace("Finished retrieving files from ZIP archive {}. {} eligible documents were read.", directory, numEntries); } catch (IOException e) { logFileSearch.error("Could not read from {}", directory); throw new UncheckedPmcReaderException(e); + } catch (Throwable t) { + logFileSearch.error("Unexpected error:", t); } } else { logFileSearch.debug("Recursive search is deactivated, skipping subdirectory {}", directory); @@ -179,7 +178,7 @@ private boolean isInWhitelist(File file) { private boolean isInWhitelist(String name) { boolean inWhitelist = whitelist.contains(name) || (whitelist.size() == 1 && whitelist.contains("all")); if (!inWhitelist) - log.trace("Skipping document with name/id {} because it is not contained in the white list.", name); + logFileSearch.trace("Skipping document with name/id {} because it is not contained in the white list.", name); return inWhitelist; } diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java index d85e133c2..069d038f1 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java @@ -40,16 +40,15 @@ public class 
NxmlDocumentParser extends NxmlParser { private DefaultElementParser defaultElementParser; private Map> tagProperties; private Tagset tagset; - private URI uri; public void reset(File nxmlFile, JCas cas) throws DocumentParsingException { reset(nxmlFile.toURI(), cas); } public void reset(URI uri, JCas cas) throws DocumentParsingException { - this.uri = uri; - boolean gzipped = uri.toString().endsWith(".gz") || this.uri.toString().endsWith(".gzip"); + boolean gzipped = uri.toString().endsWith(".gz") || uri.toString().endsWith(".gzip"); try { + log.debug("Reading from URL {}", uri.toURL()); InputStream is = uri.toURL().openStream(); if (gzipped) is = new GZIPInputStream(is); diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/NXMLURIIteratorTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/NXMLURIIteratorTest.java index df967924b..8c328c2ac 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/NXMLURIIteratorTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/NXMLURIIteratorTest.java @@ -1,18 +1,18 @@ package de.julielab.jcore.reader.pmc; -import org.apache.commons.text.StringEscapeUtils; import org.junit.Test; import java.io.File; import java.io.FileNotFoundException; -import java.net.MalformedURLException; -import java.net.URI; -import java.net.URL; +import java.net.*; import java.util.Arrays; import java.util.HashSet; import java.util.Iterator; import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatCode; import static org.junit.Assert.assertTrue; @@ -59,12 +59,14 @@ public void testGetPmcFiles() throws Exception { } @Test - public void testXmlEntities() throws MalformedURLException { - String s = 
"jar:file:/data/data_corpora/PMC/non_comm_use.O-Z.xml.zip!/Pädiatrische_Gastroenterologie,_Hepatologie_und_Ernährung/PMC7498810.nxml"; - s = StringEscapeUtils.unescapeXml(s); - assertThat(s).doesNotContain("ä"); - URL url = new URL(s); + public void testXmlEntities() throws MalformedURLException, URISyntaxException { + String inputPath = "jar:file:/data/data_corpora/PMC/non_comm_use.O-Z.xml.zip!/Pädiatrische_Gastroenterologie,_Hepatologie_und_Ernährung/PMC7498810.nxml"; + int exclamationIndex = inputPath.indexOf('!'); + String encoded = inputPath.substring(0, exclamationIndex + 2) + Stream.of(inputPath.substring(exclamationIndex+2).split("/")).map(x -> URLEncoder.encode(x, UTF_8)).collect(Collectors.joining("/")); + URL url = new URL(encoded); assertThat(url).isNotNull(); - assertThatCode(() -> url.toURI()).doesNotThrowAnyException(); + assertThatCode(() -> url.toURI().toASCIIString()).doesNotThrowAnyException(); + String outputPath = Stream.of(url.toURI().toASCIIString().split("/")).map(x -> URLDecoder.decode(x, UTF_8)).collect(Collectors.joining("/")); + assertThat(inputPath).isEqualTo(outputPath); } } From d4739984e1a7974291818a60b8acb028ea2181b1 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 20 Oct 2020 09:59:22 +0200 Subject: [PATCH 029/269] `JCoReOverlapAnnotationIndex`: When searching, returning a list instead of a stream from that list. Added getters for the internal index lists. Fixes #117. 
--- .../index/JCoReOverlapAnnotationIndex.java | 286 +++++++++--------- 1 file changed, 141 insertions(+), 145 deletions(-) diff --git a/jcore-utilities/src/main/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndex.java b/jcore-utilities/src/main/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndex.java index ea919ae06..7a44dedee 100644 --- a/jcore-utilities/src/main/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndex.java +++ b/jcore-utilities/src/main/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndex.java @@ -1,11 +1,10 @@ -/** - * +/** * Copyright (c) 2017, JULIE Lab. - * All rights reserved. This program and the accompanying materials + * All rights reserved. This program and the accompanying materials * are made available under the terms of the BSD-2-Clause License - * - * Author: - * + *

+ * Author: + *

* Description: **/ package de.julielab.jcore.utility.index; @@ -19,7 +18,6 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.stream.Stream; /** *

@@ -46,145 +44,143 @@ * elements. Those are - in the case a lies in the middle of the index elements * - n/2. *

- * - * @author faessler * - * @param - * The annotation type the index should be over. + * @param The annotation type the index should be over. + * @author faessler */ public class JCoReOverlapAnnotationIndex implements JCoReAnnotationIndex { - private List beginIndex; - private List endIndex; - private boolean frozen; - - public JCoReOverlapAnnotationIndex() { - beginIndex = new ArrayList<>(); - endIndex = new ArrayList<>(); - } - - public JCoReOverlapAnnotationIndex(JCas jcas, int type) { - this(jcas, jcas.getCasType(type)); - } - - public JCoReOverlapAnnotationIndex(JCas jcas, Type type) { - this(); - index(jcas, type); - freeze(); - } - - /** - * Indexes the whole contents of the CAS annotation index of type - * type. For each annotation, the {@link #indexTermGenerator} is - * used to create terms with which the annotation will be associated in the - * index and can be retrieved by a search method. - * - * @param jCas - * A CAS instance. - * @param type - * The annotation type to index. - */ - public void index(JCas jCas, int type) { - index(jCas, jCas.getCasType(type)); - } - - /** - * Indexes the whole contents of the CAS annotation index of type - * type. For each annotation, the {@link #indexTermGenerator} is - * used to create terms with which the annotation will be associated in the - * index and can be retrieved by a search method. - * - * @param jCas - * A CAS instance. - * @param type - * The annotation type to index. 
- */ - @SuppressWarnings("unchecked") - public void index(JCas jCas, Type type) { - FSIterator it = jCas.getAnnotationIndex(type).iterator(); - while (it.hasNext()) { - Annotation annotation = (Annotation) it.next(); - index((E) annotation); - } - } - - public void index(E annotation) { - if (frozen) - throw new IllegalStateException("This index is frozen and cannot except further items."); - beginIndex.add(annotation); - endIndex.add(annotation); - } - - public void freeze() { - frozen = true; - Collections.sort(beginIndex, Comparators.beginOffsetComparator()); - Collections.sort(endIndex, Comparators.endOffsetComparator()); - } - - /** - * Returns all annotation in the index overlapping in any way with a - * (embedded, covering, partial overlappings). The resulting list is either - * sorted by begin or end offset. It is not easily predictable which case it - * is (could be added as a return value if that would be useful in any way). - * - * @param a - * The annotation to retrieve overlapping annotations from the - * index for. - * @return All annotations in the index overlapping a. - */ - public Stream search(T a) { - if (!frozen) - throw new IllegalStateException( - "This index is not frozen and cannot be used yet. Freeze the index before searching."); - if (beginIndex.isEmpty()) - return Stream.empty(); - // The following is rather difficult to understand from the code. The - // idea is the following: - // We search annotations overlapping with a. Thus, we can rule out those - // annotations that end before a or start after a. - // In the next 4 lines, we determine how many annotations can be ruled - // out because they start after a and how many end before a. 
- int begin = a.getBegin(); - int end = a.getEnd(); - int indexBeginAfterEnd = insertionPoint(JCoReTools.binarySearch(beginIndex, an -> an.getBegin(), end)); - int indexEndBeforeBegin = insertionPoint(JCoReTools.binarySearch(endIndex, an -> an.getEnd(), begin)); - - // Depending on which case rules out more annotations - ending before a - // or starting after a - we look at the case that leaves us with the - // fewest annotations. If those were the annotations that started after - // a, then we keep those that start before a ends. Those are than - // filtered for annotations that end before a starts. - if (indexBeginAfterEnd < endIndex.size() - indexEndBeforeBegin) { - List beginBeforeEnd = new ArrayList<>(beginIndex.subList(0, indexBeginAfterEnd)); - ArrayList result = new ArrayList<>(); - for (E e : beginBeforeEnd) { - if (e.getEnd() > begin) - result.add(e); - } - return result.stream(); - } else { - List endAfterBegin = new ArrayList<>(endIndex.subList(indexEndBeforeBegin, endIndex.size())); - ArrayList result = new ArrayList<>(); - for (E e : endAfterBegin) { - if (e.getBegin() < end) - result.add(e); - } - return result.stream(); - } - } - - private int insertionPoint(int i) { - return i < 0 ? -(i + 1) : i; - } - - /** - * Un-freeze the index to allow new elements to be added. 
- */ - public void melt() { - frozen = false; - } - - @Override - public void add(E a) { - index(a); - } + private List beginIndex; + private List endIndex; + private boolean frozen; + + public JCoReOverlapAnnotationIndex() { + beginIndex = new ArrayList<>(); + endIndex = new ArrayList<>(); + } + + public JCoReOverlapAnnotationIndex(JCas jcas, int type) { + this(jcas, jcas.getCasType(type)); + } + + public JCoReOverlapAnnotationIndex(JCas jcas, Type type) { + this(); + index(jcas, type); + freeze(); + } + + public void index(JCas jCas, int type) { + index(jCas, jCas.getCasType(type)); + } + + public void index(JCas jCas, Type type) { + FSIterator it = jCas.getAnnotationIndex(type).iterator(); + while (it.hasNext()) { + Annotation annotation = it.next(); + index((E) annotation); + } + } + + public void index(E annotation) { + if (frozen) + throw new IllegalStateException("This index is frozen and cannot accept further items."); + beginIndex.add(annotation); + endIndex.add(annotation); + } + + public void freeze() { + frozen = true; + Collections.sort(beginIndex, Comparators.beginOffsetComparator()); + Collections.sort(endIndex, Comparators.endOffsetComparator()); + } + + /** + * Returns all annotation in the index overlapping in any way with a + * (embedded, covering, partial overlappings). The resulting list is either + * sorted by begin or end offset. It is not easily predictable which case it + * is (could be added as a return value if that would be useful in any way). + * + * @param a The annotation to retrieve overlapping annotations from the + * index for. + * @return All annotations in the index overlapping a. + */ + public List search(T a) { + if (!frozen) + throw new IllegalStateException( + "This index is not frozen and cannot be used yet. Freeze the index before searching."); + if (beginIndex.isEmpty()) + return Collections.emptyList(); + // The following is rather difficult to understand from the code. 
The + // idea is the following: + // We search annotations overlapping with a. Thus, we can rule out those + // annotations that end before a or start after a. + // In the next 4 lines, we determine how many annotations can be ruled + // out because they start after a and how many end before a. + int begin = a.getBegin(); + int end = a.getEnd(); + int indexBeginAfterEnd = insertionPoint(JCoReTools.binarySearch(beginIndex, an -> an.getBegin(), end)); + int indexEndBeforeBegin = insertionPoint(JCoReTools.binarySearch(endIndex, an -> an.getEnd(), begin)); + + // Depending on which case rules out more annotations - ending before a + // or starting after a - we look at the case that leaves us with the + // fewest annotations. If those were the annotations that started after + // a, then we keep those that start before a ends. Those are than + // filtered for annotations that end before a starts. + if (indexBeginAfterEnd < endIndex.size() - indexEndBeforeBegin) { + List beginBeforeEnd = new ArrayList<>(beginIndex.subList(0, indexBeginAfterEnd)); + List result = new ArrayList<>(); + for (E e : beginBeforeEnd) { + if (e.getEnd() > begin) + result.add(e); + } + return result; + } else { + List endAfterBegin = new ArrayList<>(endIndex.subList(indexEndBeforeBegin, endIndex.size())); + List result = new ArrayList<>(); + for (E e : endAfterBegin) { + if (e.getBegin() < end) + result.add(e); + } + return result; + } + } + + private int insertionPoint(int i) { + return i < 0 ? -(i + 1) : i; + } + + /** + * Un-freeze the index to allow new elements to be added. + */ + public void melt() { + frozen = false; + } + + @Override + public void add(E a) { + index(a); + } + + /** + *

Returns the internal list where the indexed annotations are sorted by begin offset. External changes to + * this list might break the index.

+ * + * @return The indexed annotations sorted bei their begin offset. + */ + public List getBeginIndex() { + return beginIndex; + } + + /** + *

Returns the internal list where the indexed annotations are sorted by end offset. External changes to + * this list might break the index.

+ * + * @return The indexed annotations sorted bei their end offset. + */ + public List getEndIndex() { + return endIndex; + } + + public boolean isFrozen() { + return frozen; + } } From dce28b13dc75b3077b96d26b0a83ea3c77017f71 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 20 Oct 2020 10:00:47 +0200 Subject: [PATCH 030/269] Adapted the index test to the fact that we now return the list. --- .../utility/index/JCoReOverlapAnnotationIndexTest.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndexTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndexTest.java index ef0a044c9..e2f7a39b2 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndexTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndexTest.java @@ -16,7 +16,6 @@ import org.junit.Test; import java.util.List; -import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -39,23 +38,23 @@ public void testOverlapAnnotationIndex() throws Exception { t6.addToIndexes(); JCoReOverlapAnnotationIndex index = new JCoReOverlapAnnotationIndex<>(jcas, Token.type); - List result = index.search(t2).collect(Collectors.toList()); + List result = index.search(t2); assertTrue(result.contains(t1)); assertTrue(result.contains(t2)); assertTrue(result.contains(t3)); assertEquals(3, result.size()); - result = index.search(t1).collect(Collectors.toList()); + result = index.search(t1); assertTrue(result.contains(t1)); assertTrue(result.contains(t2)); assertEquals(2, result.size()); - result = index.search(t4).collect(Collectors.toList()); + result = index.search(t4); assertTrue(result.contains(t4)); assertTrue(result.contains(t5)); assertEquals(2, result.size()); - result = 
index.search(t6).collect(Collectors.toList()); + result = index.search(t6); assertTrue(result.contains(t6)); assertEquals(1, result.size()); } From e66ebf2ef0f32ddb1ae21c336e7552032c77a99e Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 20 Oct 2020 10:07:38 +0200 Subject: [PATCH 031/269] Updating to jcore-parent 2.5.2-SNAPSHOT which updates the JCoRe version properties to 2.6.0-SNAPSHOT. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 7f4011b1e..f401caac7 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ jcore-parent - 2.5.1 + 2.5.2-SNAPSHOT From 36b225278f1c847a416889d3887dc011df17f265 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 20 Oct 2020 12:32:48 +0200 Subject: [PATCH 032/269] JeDIS: Bumping xmi splitter version to 2.3.5. --- jedis-parent/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jedis-parent/pom.xml b/jedis-parent/pom.xml index 794ee5c9c..71ffa5ceb 100644 --- a/jedis-parent/pom.xml +++ b/jedis-parent/pom.xml @@ -27,7 +27,7 @@ de.julielab jcore-xmi-splitter - 2.3.5-SNAPSHOT + 2.3.5
From f2f60d8d82f6cf55904d0c6e84c3a306a2d93d33 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 29 Oct 2020 10:15:05 +0100 Subject: [PATCH 033/269] PMC Reader: PMC IDs now have the "PMC" prefix. This is how PubMed and PMC handle it so one can recognize full text IDs immediately. --- .../java/de/julielab/jcore/reader/pmc/parser/FrontParser.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java index 4823fed54..6548e00ea 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java @@ -109,7 +109,7 @@ else if (xPathExists(String.format(pubDateFmt, "pmc-release"))) Header header = new Header(nxmlDocumentParser.cas); header.setComponentId(PMCReader.class.getName()); - pmcid.ifPresent(header::setDocId); + pmcid.ifPresent(id -> header.setDocId("PMC" + id)); pmid.ifPresent(p -> { OtherID otherID = new OtherID(nxmlDocumentParser.cas); otherID.setComponentId(PMCReader.class.getName()); From 59c020c31f644d14fb7e5f45e5ad8891a14c5803 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 29 Oct 2020 10:37:18 +0100 Subject: [PATCH 034/269] Fixing the PMC reader test with regards to the PMC prefix for the PMC IDs. 
--- .../julielab/jcore/reader/pmc/PMCMultiplierTest.java | 8 ++++---- .../de/julielab/jcore/reader/pmc/PMCReaderTest.java | 10 +++++----- .../jcore/reader/pmc/parser/FrontParserTest.java | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCMultiplierTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCMultiplierTest.java index b411afc46..8a8527930 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCMultiplierTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCMultiplierTest.java @@ -47,8 +47,8 @@ public void testMultiplier() throws UIMAException, IOException { } ++numBatches; } - assertThat(receivedDocIds).containsExactlyInAnyOrder("2847692", "2758189", - "2970367", "3201365", "4257438"); + assertThat(receivedDocIds).containsExactlyInAnyOrder("PMC2847692", "PMC2758189", + "PMC2970367", "PMC3201365", "PMC4257438"); assertThat(numBatches).isEqualTo(3); } @@ -78,8 +78,8 @@ public void testMultiplierFromDescriptors() throws UIMAException, IOException { } ++numBatches; } - assertThat(receivedDocIds).containsExactlyInAnyOrder("2847692", "2758189", - "2970367", "3201365", "4257438"); + assertThat(receivedDocIds).containsExactlyInAnyOrder("PMC2847692", "PMC2758189", + "PMC2970367", "PMC3201365", "PMC4257438"); assertThat(numBatches).isEqualTo(3); } } diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCReaderTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCReaderTest.java index 308f950d2..9d5d91007 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCReaderTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCReaderTest.java @@ -83,7 +83,7 @@ public void testPmcReader2() throws Exception { cas.reset(); } - assertThat(foundDocuments).containsExactlyInAnyOrder("2847692", "3201365", "4257438", "2758189", "2970367"); + 
assertThat(foundDocuments).containsExactlyInAnyOrder("PMC2847692", "PMC3201365", "PMC4257438", "PMC2758189", "PMC2970367"); } @Test @@ -122,7 +122,7 @@ public void testPmcReaderRecursiveZip() throws Exception { cas.reset(); } - assertThat(foundDocuments).containsExactlyInAnyOrder("2847692", "3201365", "4257438", "2758189", "2970367"); + assertThat(foundDocuments).containsExactlyInAnyOrder("PMC2847692", "PMC3201365", "PMC4257438", "PMC2758189", "PMC2970367"); } @Test @@ -146,7 +146,7 @@ public void testPmcReaderWhitelist() throws Exception { foundDocuments.add(header.getDocId()); cas.reset(); } - assertThat(foundDocuments).containsExactlyInAnyOrder("2847692", "2758189"); + assertThat(foundDocuments).containsExactlyInAnyOrder("PMC2847692", "PMC2758189"); } @Test @@ -176,7 +176,7 @@ public void testHeader() throws Exception { Header header = (Header) CasUtil.selectSingle(cas.getCas(), CasUtil.getAnnotationType(cas.getCas(), Header.class)); assertNotNull(header); - assertEquals("2847692", header.getDocId()); + assertEquals("PMC2847692", header.getDocId()); assertNotNull(header.getPubTypeList()); assertTrue(header.getPubTypeList().size() > 0); assertEquals("Ambio", ((Journal) header.getPubTypeList(0)).getTitle()); @@ -378,7 +378,7 @@ public void testPmcReaderDescriptor() throws Exception { cas.reset(); } - assertThat(foundDocuments).containsExactlyInAnyOrder("2847692", "3201365", "4257438", "2758189", "2970367"); + assertThat(foundDocuments).containsExactlyInAnyOrder("PMC2847692", "PMC3201365", "PMC4257438", "PMC2758189", "PMC2970367"); } @Test diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/FrontParserTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/FrontParserTest.java index c09fc6313..c5ac41078 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/FrontParserTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/FrontParserTest.java @@ -38,7 +38,7 @@ 
public void testParser() throws Exception { Annotation annotation = frontResult.getAnnotation(); assertTrue(annotation instanceof Header); Header header = (Header) annotation; - assertEquals("2847692", header.getDocId()); + assertEquals("PMC2847692", header.getDocId()); assertEquals("10.1007/s13280-009-0005-8", header.getDoi()); assertNotNull(header.getOtherIDs()); assertTrue(header.getOtherIDs().size() > 0); From 429ef777337a6dd5cf91fb2b3f0d7ce4af29d294 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 29 Oct 2020 15:10:31 +0100 Subject: [PATCH 035/269] Adding the source "PubMed Central" to the header created by the PMC reader. --- .../java/de/julielab/jcore/reader/pmc/parser/FrontParser.java | 1 + 1 file changed, 1 insertion(+) diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java index 6548e00ea..b21a66aec 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java @@ -107,6 +107,7 @@ else if (xPathExists(String.format(pubDateFmt, "pmc-release"))) assert volume.isPresent(); Header header = new Header(nxmlDocumentParser.cas); + header.setSource("PubMed Central"); header.setComponentId(PMCReader.class.getName()); pmcid.ifPresent(id -> header.setDocId("PMC" + id)); From 0f8211ba29a5e7c3ec95929c5073f793417a7d18 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 29 Oct 2020 15:57:58 +0100 Subject: [PATCH 036/269] Updating flair in travis to 0.6.1. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 57daeceac..172756b0e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -31,7 +31,7 @@ before_install: if ! 
find "$HOME/pip-cache" -mindepth 1 -print -quit 2>/dev/null | grep -q .; then $PYTHON -m pip download --destination-directory="$HOME/pip-cache" flair fi - sudo -H $PYTHON -m pip install --find-links="$HOME/pip-cache" flair==0.4.5 + sudo -H $PYTHON -m pip install --find-links="$HOME/pip-cache" flair==0.6.1 - #./travis-deployment/install-flair-nightly.sh - export BOTO_CONFIG=/dev/null install: mvn install -DskipTests=true -Dmaven.javadoc.skip=true -B -V From 02857425ded3855bdfe5fb2498d7e913eb8dc50a Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 19 Feb 2021 13:47:57 +0100 Subject: [PATCH 037/269] Fixed the parent of the acronym writer --- jcore-acronym-writer/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-acronym-writer/pom.xml b/jcore-acronym-writer/pom.xml index e01349996..69f995886 100644 --- a/jcore-acronym-writer/pom.xml +++ b/jcore-acronym-writer/pom.xml @@ -10,7 +10,7 @@ de.julielab jcore-base - 2.5.0-SNAPSHOT + 2.6.0-SNAPSHOT From 754e2983bfbe5355662719c74ca4e7f48e855df1 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 23 Feb 2021 15:19:43 +0100 Subject: [PATCH 038/269] Trying to add a coreference writer and made some small changes. Have issues with not found classes in IntelliJ which are obviously present, dont't know. 
--- jcore-acronym-writer/pom.xml | 3 +- .../acronyms/desc/jcore-acronym-writer.xml | 2 +- .../consumer/acronyms/AcronymWriterTest.java | 2 +- jcore-coreference-writer/LICENSE | 26 ++++++ jcore-coreference-writer/README.md | 26 ++++++ jcore-coreference-writer/component.meta | 20 +++++ jcore-coreference-writer/pom.xml | 61 +++++++++++++ .../coreference/CoreferenceWriter.java | 86 +++++++++++++++++++ .../coreference/desc/jcore-acronym-writer.xml | 33 +++++++ .../coreference/CoreferenceWriterTest.java | 10 +++ jcore-neo4j-relations-consumer/pom.xml | 12 +-- .../jcore/types/jcore-discourse-types.xml | 2 +- .../jcore/utility/index/JCoReCoverIndex.java | 6 +- pom.xml | 6 +- 14 files changed, 279 insertions(+), 16 deletions(-) create mode 100644 jcore-coreference-writer/LICENSE create mode 100644 jcore-coreference-writer/README.md create mode 100644 jcore-coreference-writer/component.meta create mode 100644 jcore-coreference-writer/pom.xml create mode 100644 jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java create mode 100644 jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/jcore-acronym-writer.xml create mode 100644 jcore-coreference-writer/src/test/java/de/julielab/jcore/consumer/coreference/CoreferenceWriterTest.java diff --git a/jcore-acronym-writer/pom.xml b/jcore-acronym-writer/pom.xml index 69f995886..035774709 100644 --- a/jcore-acronym-writer/pom.xml +++ b/jcore-acronym-writer/pom.xml @@ -5,7 +5,6 @@ 4.0.0 jcore-acronym-writer jar - de.julielab.jcore.consumer.acronyms de.julielab @@ -58,5 +57,5 @@
https://github.com/JULIELab/jcore-base/tree/master/jcore-acronym-writer - Writes acronyms annotations from the CAS to a text file format. + Writes acronym annotations from the CAS to a text file format. diff --git a/jcore-acronym-writer/src/main/resources/de/julielab/jcore/consumer/acronyms/desc/jcore-acronym-writer.xml b/jcore-acronym-writer/src/main/resources/de/julielab/jcore/consumer/acronyms/desc/jcore-acronym-writer.xml index 6659cbf31..26840e7c6 100644 --- a/jcore-acronym-writer/src/main/resources/de/julielab/jcore/consumer/acronyms/desc/jcore-acronym-writer.xml +++ b/jcore-acronym-writer/src/main/resources/de/julielab/jcore/consumer/acronyms/desc/jcore-acronym-writer.xml @@ -2,7 +2,7 @@ org.apache.uima.java true - de.julielab.jcore.consumer.acronyms.AcronymWriter + de.julielab.jcore.consumer.coreference.AcronymWriter JCoRe Acronym Writer Writes acronym annotation to a text file. diff --git a/jcore-acronym-writer/src/test/java/de/julielab/jcore/consumer/acronyms/AcronymWriterTest.java b/jcore-acronym-writer/src/test/java/de/julielab/jcore/consumer/acronyms/AcronymWriterTest.java index 243f4481a..c63bfd442 100644 --- a/jcore-acronym-writer/src/test/java/de/julielab/jcore/consumer/acronyms/AcronymWriterTest.java +++ b/jcore-acronym-writer/src/test/java/de/julielab/jcore/consumer/acronyms/AcronymWriterTest.java @@ -1,5 +1,5 @@ -package de.julielab.jcore.consumer.acronyms; +package de.julielab.jcore.consumer.coreference; /** * Unit tests for jcore-acronym-writer. diff --git a/jcore-coreference-writer/LICENSE b/jcore-coreference-writer/LICENSE new file mode 100644 index 000000000..7190118b3 --- /dev/null +++ b/jcore-coreference-writer/LICENSE @@ -0,0 +1,26 @@ +BSD 2-Clause License + +Copyright (c) 2021, JULIE Lab +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/jcore-coreference-writer/README.md b/jcore-coreference-writer/README.md new file mode 100644 index 000000000..da767a4d1 --- /dev/null +++ b/jcore-coreference-writer/README.md @@ -0,0 +1,26 @@ +# JCoRe Acronym Writer + +**Descriptor Path**: +``` +de.julielab.jcore.consumer.acronyms.desc.jcore-acronym-writer +``` + +Writes acronyms annotations from the CAS to a text file format. + + + +**1. Parameters** + +| Parameter Name | Parameter Type | Mandatory | Multivalued | Description | +|----------------|----------------|-----------|-------------|-------------| +| OutputFile | string | true | false | Path to the ourput file. | + + +**2. 
Capabilities** + +| Type | Input | Output | +|------|:-----:|:------:| +| de.julielab.jcore.types.Abbreviation | `+` | | + + + diff --git a/jcore-coreference-writer/component.meta b/jcore-coreference-writer/component.meta new file mode 100644 index 000000000..b0999bc38 --- /dev/null +++ b/jcore-coreference-writer/component.meta @@ -0,0 +1,20 @@ +{ + "categories": [ + "consumer" + ], + "description": "Writes acronyms annotations from the CAS to a text file format.", + "descriptors": [ + { + "category": "consumer", + "location": "de.julielab.jcore.consumer.acronyms.desc.jcore-acronym-writer" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-acronym-writer", + "groupId": "de.julielab.jcore.consumer.acronyms", + "version": "2.5.0-SNAPSHOT" + }, + "name": "JCoRe Acronym Writer" +} diff --git a/jcore-coreference-writer/pom.xml b/jcore-coreference-writer/pom.xml new file mode 100644 index 000000000..ee4c26044 --- /dev/null +++ b/jcore-coreference-writer/pom.xml @@ -0,0 +1,61 @@ + + + + 4.0.0 + jcore-coreference-writer + jar + + + de.julielab + jcore-base + 2.6.0-SNAPSHOT + + + + + de.julielab + jcore-descriptor-creator + + + de.julielab + jcore-utilities + ${jcore-utilities-version} + + + de.julielab + julielab-java-utilities + + + ch.qos.logback + logback-classic + test + + + org.slf4j + slf4j-api + + + de.julielab + jcore-types + ${jcore-version} + + + junit + junit + + + JCoRe Coreference Writer + + JULIE Lab Jena, Germany + http://www.julielab.de + + + + BSD-2-Clause + https://opensource.org/licenses/BSD-2-Clause + + + https://github.com/JULIELab/jcore-base/tree/master/jcore-coreference-writer + Writes coreference annotations from the CAS to a text file format. 
+ diff --git a/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java b/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java new file mode 100644 index 000000000..27eb28de2 --- /dev/null +++ b/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java @@ -0,0 +1,86 @@ + +package de.julielab.jcore.consumer.coreference; + +import de.julielab.java.utilities.FileUtilities; +import de.julielab.jcore.types.Abbreviation; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.commons.io.IOUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CASRuntimeException; +import org.apache.uima.cas.FSIterator; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.apache.uima.resource.ResourceInitializationException; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.util.HashMap; +import java.util.Map; + +@ResourceMetaData(name = "JCoRe Coreference Writer", description = "Writes co-reference annotation to a text file.") +public class CoreferenceWriter extends JCasAnnotator_ImplBase { + + public static final String PARAM_OUTPUTFILE = "OutputFile"; + + @ConfigurationParameter(name = PARAM_OUTPUTFILE) + private String outputFile; + private OutputStream os; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException { + super.initialize(aContext); + outputFile = (String) aContext.getConfigParameterValue(PARAM_OUTPUTFILE); + try { + os = FileUtilities.getOutputStreamToFile(new File(outputFile)); + } catch (IOException e) { + throw new 
ResourceInitializationException(e); + } + } + + @Override + public void process(JCas jcas) throws AnalysisEngineProcessException { + try { + String pubmedId = JCoReTools.getDocId(jcas); + FSIterator it = jcas.getAnnotationIndex(Abbreviation.type).iterator(); + + Map fullForms = new HashMap<>(); + int abbrCount = 0; + while (it.hasNext()) { + Abbreviation abbr = (Abbreviation) it.next(); + de.julielab.jcore.types.Annotation textReference = abbr.getTextReference(); + + String abbrId = "A" + abbrCount; + + String fullformId = fullForms.get(textReference); + if (fullformId == null) { + fullformId = "F" + abbrCount; + fullForms.put(textReference, fullformId); + IOUtils.write(String.join("\t", pubmedId, fullformId, String.valueOf(textReference.getBegin()), + String.valueOf(textReference.getEnd())) + "\n", os, "UTF-8"); + } + + IOUtils.write(String.join("\t", pubmedId, abbrId, String.valueOf(abbr.getBegin()), + String.valueOf(abbr.getEnd()), fullformId) + "\n", os, "UTF-8"); + + ++abbrCount; + } + } catch (CASRuntimeException | IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + + @Override + public void collectionProcessComplete() throws AnalysisEngineProcessException { + try { + os.close(); + } catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + +} diff --git a/jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/jcore-acronym-writer.xml b/jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/jcore-acronym-writer.xml new file mode 100644 index 000000000..71991a2ce --- /dev/null +++ b/jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/jcore-acronym-writer.xml @@ -0,0 +1,33 @@ + + + org.apache.uima.java + true + de.julielab.jcore.consumer.acronyms.CoreferenceWriter + + JCoRe Acronym Writer + Writes acronym annotation to a text file. 
+ 2.6.0-SNAPSHOT + + + OutputFile + + String + false + true + + + + + + + + + + + + true + true + false + + + \ No newline at end of file diff --git a/jcore-coreference-writer/src/test/java/de/julielab/jcore/consumer/coreference/CoreferenceWriterTest.java b/jcore-coreference-writer/src/test/java/de/julielab/jcore/consumer/coreference/CoreferenceWriterTest.java new file mode 100644 index 000000000..9e3e8e14a --- /dev/null +++ b/jcore-coreference-writer/src/test/java/de/julielab/jcore/consumer/coreference/CoreferenceWriterTest.java @@ -0,0 +1,10 @@ + +package de.julielab.jcore.consumer.acronyms; + +/** + * Unit tests for jcore-acronym-writer. + * + */ +public class CoreferenceWriterTest { +// TODO +} diff --git a/jcore-neo4j-relations-consumer/pom.xml b/jcore-neo4j-relations-consumer/pom.xml index 6b0d0060c..92fc5f29b 100644 --- a/jcore-neo4j-relations-consumer/pom.xml +++ b/jcore-neo4j-relations-consumer/pom.xml @@ -29,11 +29,6 @@ jcore-types ${jcore-types-version} - - de.julielab - julielab-neo4j-plugins-concepts-representation - 3.0.0-SNAPSHOT - de.julielab jcore-utilities @@ -50,10 +45,15 @@ 4.0.4 test + + de.julielab + julielab-neo4j-plugins-concepts-representation + 3.0.1-SNAPSHOT + de.julielab julielab-neo4j-plugins-concepts - 3.0.0-SNAPSHOT + 3.0.1-SNAPSHOT test diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-discourse-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-discourse-types.xml index ab4888c8c..01d7e272e 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-discourse-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-discourse-types.xml @@ -34,7 +34,7 @@ de.julielab.jcore.types.CorefExpression - A 'coreference expression' might by any span of text that is part of a set of text expressions refering to the same entity. 
Speaking in the anaphora framework, coreference expressions are either anaphors - mostly pronouns and definite noun phrases - or their antecedents - the original, first mention of an entity or already an anaphoric expression referring itself to a reference to the original entity mention. + A 'coreference expression' might be any span of text that is part of a set of text expressions referring to the same entity. Speaking in the anaphora framework, co-reference expressions are either anaphors - mostly pronouns and definite noun phrases - or their antecedents - the original, first mention of an entity or already an anaphoric expression referring itself to a reference to the original entity mention. de.julielab.jcore.types.Annotation diff --git a/jcore-utilities/src/main/java/de/julielab/jcore/utility/index/JCoReCoverIndex.java b/jcore-utilities/src/main/java/de/julielab/jcore/utility/index/JCoReCoverIndex.java index 02d192b73..ef6c6588b 100644 --- a/jcore-utilities/src/main/java/de/julielab/jcore/utility/index/JCoReCoverIndex.java +++ b/jcore-utilities/src/main/java/de/julielab/jcore/utility/index/JCoReCoverIndex.java @@ -106,10 +106,8 @@ public void index(JCas jCas, Type type) { * indexed annotations, first {@link #freeze()} the index and then * {@link #search(int, int)} it. * - * @param jCas - * A CAS instance. - * @param type - * The annotation type to index. 
+ * @param annotation + * A UIMA annotation */ public void index(E annotation) { if (frozen) diff --git a/pom.xml b/pom.xml index f401caac7..8768f3a25 100644 --- a/pom.xml +++ b/pom.xml @@ -74,6 +74,8 @@ jcore-ace-reader jcore-acronym-ae + + jcore-acronym-writer jcore-banner-ae @@ -88,7 +90,9 @@ jcore-conll-consumer jcore-coordination-baseline-ae - + + jcore-coreference-writer + jcore-ct-reader jcore-descriptor-creator From 9f80444f9df180c3ee5bb5b0a2c381c071d1c440 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 23 Feb 2021 15:37:46 +0100 Subject: [PATCH 039/269] First sketch of the coreference writer --- jcore-acronym-writer/component.meta | 6 ++-- jcore-coreference-writer/component.meta | 12 +++---- .../coreference/CoreferenceWriter.java | 36 +++++++++++-------- .../coreference/CoreferenceWriterTest.java | 4 +-- 4 files changed, 32 insertions(+), 26 deletions(-) diff --git a/jcore-acronym-writer/component.meta b/jcore-acronym-writer/component.meta index b0999bc38..6869b7664 100644 --- a/jcore-acronym-writer/component.meta +++ b/jcore-acronym-writer/component.meta @@ -2,7 +2,7 @@ "categories": [ "consumer" ], - "description": "Writes acronyms annotations from the CAS to a text file format.", + "description": "Writes acronym annotations from the CAS to a text file format.", "descriptors": [ { "category": "consumer", @@ -13,8 +13,8 @@ "group": "general", "maven-artifact": { "artifactId": "jcore-acronym-writer", - "groupId": "de.julielab.jcore.consumer.acronyms", - "version": "2.5.0-SNAPSHOT" + "groupId": "de.julielab", + "version": "2.6.0-SNAPSHOT" }, "name": "JCoRe Acronym Writer" } diff --git a/jcore-coreference-writer/component.meta b/jcore-coreference-writer/component.meta index b0999bc38..ec5fe6810 100644 --- a/jcore-coreference-writer/component.meta +++ b/jcore-coreference-writer/component.meta @@ -2,19 +2,19 @@ "categories": [ "consumer" ], - "description": "Writes acronyms annotations from the CAS to a text file format.", + "description": "Writes 
coreference annotations from the CAS to a text file format.", "descriptors": [ { "category": "consumer", - "location": "de.julielab.jcore.consumer.acronyms.desc.jcore-acronym-writer" + "location": "de.julielab.jcore.consumer.coreference.desc.jcore-acronym-writer" } ], "exposable": true, "group": "general", "maven-artifact": { - "artifactId": "jcore-acronym-writer", - "groupId": "de.julielab.jcore.consumer.acronyms", - "version": "2.5.0-SNAPSHOT" + "artifactId": "jcore-coreference-writer", + "groupId": "de.julielab", + "version": "2.6.0-SNAPSHOT" }, - "name": "JCoRe Acronym Writer" + "name": "JCoRe Coreference Writer" } diff --git a/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java b/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java index 27eb28de2..0884f6509 100644 --- a/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java +++ b/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java @@ -3,6 +3,8 @@ import de.julielab.java.utilities.FileUtilities; import de.julielab.jcore.types.Abbreviation; +import de.julielab.jcore.types.CorefExpression; +import de.julielab.jcore.types.CorefRelation; import de.julielab.jcore.utility.JCoReTools; import org.apache.commons.io.IOUtils; import org.apache.uima.UimaContext; @@ -10,9 +12,11 @@ import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CASRuntimeException; import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.FeatureStructure; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; @@ -20,6 +24,7 @@ import java.io.IOException; import 
java.io.OutputStream; import java.util.HashMap; +import java.util.Iterator; import java.util.Map; @ResourceMetaData(name = "JCoRe Coreference Writer", description = "Writes co-reference annotation to a text file.") @@ -46,28 +51,29 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept public void process(JCas jcas) throws AnalysisEngineProcessException { try { String pubmedId = JCoReTools.getDocId(jcas); - FSIterator it = jcas.getAnnotationIndex(Abbreviation.type).iterator(); + FSIterator it = jcas.getAnnotationIndex(CorefRelation.type).iterator(); - Map fullForms = new HashMap<>(); - int abbrCount = 0; + int relcount = 0; while (it.hasNext()) { - Abbreviation abbr = (Abbreviation) it.next(); - de.julielab.jcore.types.Annotation textReference = abbr.getTextReference(); + CorefRelation rel = it.next(); + de.julielab.jcore.types.Annotation anaphora = rel.getAnaphora(); - String abbrId = "A" + abbrCount; + String abbrId = "Ana" + relcount; - String fullformId = fullForms.get(textReference); - if (fullformId == null) { - fullformId = "F" + abbrCount; - fullForms.put(textReference, fullformId); - IOUtils.write(String.join("\t", pubmedId, fullformId, String.valueOf(textReference.getBegin()), - String.valueOf(textReference.getEnd())) + "\n", os, "UTF-8"); + IOUtils.write(String.join("\t", pubmedId, abbrId, String.valueOf(anaphora.getBegin()), + String.valueOf(anaphora.getEnd())) + "\n", os, "UTF-8"); + + Iterator antecedentsIt = rel.getAntecedents().iterator(); + while (antecedentsIt.hasNext()) { + CorefExpression antecedent = (CorefExpression) antecedentsIt.next(); + + String antecedentGroup = "Ant" + relcount; + IOUtils.write(String.join("\t", pubmedId, antecedentGroup, String.valueOf(antecedent.getBegin()), + String.valueOf(antecedent.getEnd())) + "\n", os, "UTF-8"); } - IOUtils.write(String.join("\t", pubmedId, abbrId, String.valueOf(abbr.getBegin()), - String.valueOf(abbr.getEnd()), fullformId) + "\n", os, "UTF-8"); - ++abbrCount; + 
++relcount; } } catch (CASRuntimeException | IOException e) { throw new AnalysisEngineProcessException(e); diff --git a/jcore-coreference-writer/src/test/java/de/julielab/jcore/consumer/coreference/CoreferenceWriterTest.java b/jcore-coreference-writer/src/test/java/de/julielab/jcore/consumer/coreference/CoreferenceWriterTest.java index 9e3e8e14a..7b7bf0429 100644 --- a/jcore-coreference-writer/src/test/java/de/julielab/jcore/consumer/coreference/CoreferenceWriterTest.java +++ b/jcore-coreference-writer/src/test/java/de/julielab/jcore/consumer/coreference/CoreferenceWriterTest.java @@ -1,8 +1,8 @@ -package de.julielab.jcore.consumer.acronyms; +package de.julielab.jcore.consumer.coreference; /** - * Unit tests for jcore-acronym-writer. + * Unit tests for jcore-coreference-writer. * */ public class CoreferenceWriterTest { From cf54d884c5677b8ef2f0e64bb02bdef2b474ab5a Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 23 Feb 2021 15:56:24 +0100 Subject: [PATCH 040/269] Bug fixing --- ...re-acronym-writer.xml => jcore-coreference-writer.xml} | 8 ++++---- .../ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) rename jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/{jcore-acronym-writer.xml => jcore-coreference-writer.xml} (77%) diff --git a/jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/jcore-acronym-writer.xml b/jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/jcore-coreference-writer.xml similarity index 77% rename from jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/jcore-acronym-writer.xml rename to jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/jcore-coreference-writer.xml index 71991a2ce..855be5b78 100644 --- 
a/jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/jcore-acronym-writer.xml +++ b/jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/jcore-coreference-writer.xml @@ -2,10 +2,10 @@ org.apache.uima.java true - de.julielab.jcore.consumer.acronyms.CoreferenceWriter + de.julielab.jcore.consumer.coreference.CoreferenceWriter - JCoRe Acronym Writer - Writes acronym annotation to a text file. + JCoRe Coreference Writer + Writes coreference annotation to a text file. 2.6.0-SNAPSHOT @@ -19,7 +19,7 @@ - + diff --git a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml index 28c03ebe8..835faf684 100644 --- a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml +++ b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml @@ -54,6 +54,7 @@ + From a45cb71e67bb1284b989bc4d0b92ed79258d408a Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 23 Feb 2021 16:02:10 +0100 Subject: [PATCH 041/269] Meta descriptor update --- jcore-coreference-writer/component.meta | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-coreference-writer/component.meta b/jcore-coreference-writer/component.meta index ec5fe6810..77f18d497 100644 --- a/jcore-coreference-writer/component.meta +++ b/jcore-coreference-writer/component.meta @@ -6,7 +6,7 @@ "descriptors": [ { "category": "consumer", - "location": "de.julielab.jcore.consumer.coreference.desc.jcore-acronym-writer" + "location": "de.julielab.jcore.consumer.coreference.desc.jcore-coreference-writer" } ], "exposable": true, From 9878a3979d4a62dd58816e695286dbd541dae8f0 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 1 Apr 2021 13:06:34 +0200 Subject: [PATCH 042/269] Adding the jcore-semantics-mention-types type 
system to the lingpipe gazetteer configurable descriptor. --- .../coreference/CoreferenceWriter.java | 119 +++++++++--------- .../jcore/misc/DescriptorCreator.java | 30 ++--- .../de.julielab.jcore.ae.testae.TestAE.xml | 62 ++++----- ...ore.consumer.testconsumer.Testconsumer.xml | 62 ++++----- ...ltiplier.testmultiplier.TestMultiplier.xml | 62 ++++----- ...lab.jcore.reader.testreader.TestReader.xml | 62 ++++----- ...-0.json-Eriks-MacBook-Air-2.local-2-2.json | 3 - .../EntityEvaluatorConsumer.java | 2 +- ...ipe-gazetteer-ae-configurable-resource.xml | 3 + 9 files changed, 204 insertions(+), 201 deletions(-) delete mode 100644 jcore-elasticsearch-consumer/src/test/resources/onefile-output/thefile-Eriks-MacBook-Air-2.local-1-0.json-Eriks-MacBook-Air-2.local-2-2.json diff --git a/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java b/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java index 0884f6509..32613e57d 100644 --- a/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java +++ b/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java @@ -1,4 +1,3 @@ - package de.julielab.jcore.consumer.coreference; import de.julielab.java.utilities.FileUtilities; @@ -26,67 +25,69 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import java.util.Spliterators; @ResourceMetaData(name = "JCoRe Coreference Writer", description = "Writes co-reference annotation to a text file.") public class CoreferenceWriter extends JCasAnnotator_ImplBase { - public static final String PARAM_OUTPUTFILE = "OutputFile"; - - @ConfigurationParameter(name = PARAM_OUTPUTFILE) - private String outputFile; - private OutputStream os; - - @Override - public void initialize(UimaContext aContext) throws ResourceInitializationException { - super.initialize(aContext); - outputFile = (String) 
aContext.getConfigParameterValue(PARAM_OUTPUTFILE); - try { - os = FileUtilities.getOutputStreamToFile(new File(outputFile)); - } catch (IOException e) { - throw new ResourceInitializationException(e); - } - } - - @Override - public void process(JCas jcas) throws AnalysisEngineProcessException { - try { - String pubmedId = JCoReTools.getDocId(jcas); - FSIterator it = jcas.getAnnotationIndex(CorefRelation.type).iterator(); - - int relcount = 0; - while (it.hasNext()) { - CorefRelation rel = it.next(); - de.julielab.jcore.types.Annotation anaphora = rel.getAnaphora(); - - String abbrId = "Ana" + relcount; - - IOUtils.write(String.join("\t", pubmedId, abbrId, String.valueOf(anaphora.getBegin()), - String.valueOf(anaphora.getEnd())) + "\n", os, "UTF-8"); - - Iterator antecedentsIt = rel.getAntecedents().iterator(); - while (antecedentsIt.hasNext()) { - CorefExpression antecedent = (CorefExpression) antecedentsIt.next(); - - String antecedentGroup = "Ant" + relcount; - IOUtils.write(String.join("\t", pubmedId, antecedentGroup, String.valueOf(antecedent.getBegin()), - String.valueOf(antecedent.getEnd())) + "\n", os, "UTF-8"); - } - - - ++relcount; - } - } catch (CASRuntimeException | IOException e) { - throw new AnalysisEngineProcessException(e); - } - } - - @Override - public void collectionProcessComplete() throws AnalysisEngineProcessException { - try { - os.close(); - } catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - } + public static final String PARAM_OUTPUTFILE = "OutputFile"; + + @ConfigurationParameter(name = PARAM_OUTPUTFILE) + private String outputFile; + private OutputStream os; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException { + super.initialize(aContext); + outputFile = (String) aContext.getConfigParameterValue(PARAM_OUTPUTFILE); + try { + os = FileUtilities.getOutputStreamToFile(new File(outputFile)); + } catch (IOException e) { + throw new ResourceInitializationException(e); 
+ } + } + + @Override + public void process(JCas jcas) throws AnalysisEngineProcessException { + try { + String pubmedId = JCoReTools.getDocId(jcas); + FSIterator it = jcas.getAnnotationIndex(CorefRelation.type).iterator(); + + int relcount = 0; + while (it.hasNext()) { + CorefRelation rel = it.next(); + de.julielab.jcore.types.Annotation anaphora = rel.getAnaphora(); + + String abbrId = "Ana" + relcount; + + IOUtils.write(String.join("\t", pubmedId, abbrId, String.valueOf(anaphora.getBegin()), + String.valueOf(anaphora.getEnd())) + "\n", os, "UTF-8"); + + Iterator antecedentsIt = rel.getAntecedents() != null ? rel.getAntecedents().iterator() : null; + while (antecedentsIt != null && antecedentsIt.hasNext()) { + CorefExpression antecedent = (CorefExpression) antecedentsIt.next(); + if (antecedent != null) { + String antecedentGroup = "Ant" + relcount; + IOUtils.write(String.join("\t", pubmedId, antecedentGroup, String.valueOf(antecedent.getBegin()), + String.valueOf(antecedent.getEnd())) + "\n", os, "UTF-8"); + } + } + + + ++relcount; + } + } catch (CASRuntimeException | IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + + @Override + public void collectionProcessComplete() throws AnalysisEngineProcessException { + try { + os.close(); + } catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } } diff --git a/jcore-descriptor-creator/src/main/java/de/julielab/jcore/misc/DescriptorCreator.java b/jcore-descriptor-creator/src/main/java/de/julielab/jcore/misc/DescriptorCreator.java index 3f5ca368a..92c3178a1 100644 --- a/jcore-descriptor-creator/src/main/java/de/julielab/jcore/misc/DescriptorCreator.java +++ b/jcore-descriptor-creator/src/main/java/de/julielab/jcore/misc/DescriptorCreator.java @@ -38,33 +38,35 @@ public class DescriptorCreator { private static final String DESC = "desc"; public static void main(String[] args) throws Exception { + String basePackage = "de.julielab.jcore"; + if (args.length > 0) + basePackage 
= args[0]; DescriptorCreator creator = new DescriptorCreator(); - creator.run(); + creator.run(basePackage); } public static String getComponentName() { return new File(".").getAbsoluteFile().getParentFile().getName(); } - public void run() throws Exception { - run(DEFAULT_OUTPUT_ROOT); + public void run(String basePackage) throws Exception { + run(basePackage, DEFAULT_OUTPUT_ROOT); } - public void run(String outputRoot) throws Exception { - List> readers; - List> aes; - readers = findSubclasses(CollectionReader.class.getCanonicalName()); - aes = findSubclasses(AnalysisComponent.class.getCanonicalName()); + public void run(String basePackage, String outputRoot) throws Exception { + List> readers = findSubclasses(CollectionReader.class.getCanonicalName()); + List> aes = findSubclasses(AnalysisComponent.class.getCanonicalName()); - readers = readers.stream().filter(c -> c.getPackage().getName().contains("de.julielab.jcore.reader")) + readers = readers.stream().filter(c -> c.getPackage().getName().startsWith(basePackage) && (c.getPackage().getName().endsWith("reader") || c.getName().toLowerCase().endsWith("reader"))) .collect(toList()); - // Since consumers and also multipliers can be or are AnalysisComponents, were may list all component categories here. + // Since consumers and also multipliers can be or are AnalysisComponents, we may list all component categories here. // Also, remove abstract classes aes = aes.stream().filter(c -> !Modifier.isAbstract(c.getModifiers())). 
- filter(c -> c.getPackage().getName().contains("de.julielab.jcore.ae") - || c.getPackage().getName().contains("de.julielab.jcore.consumer") - || c.getPackage().getName().contains("de.julielab.jcore.multiplier") - || c.getPackage().getName().contains("de.julielab.jcore.reader")).collect(toList()); + filter(c -> c.getPackage().getName().startsWith(basePackage) && + (c.getPackage().getName().endsWith("ae") || c.getName().toLowerCase().endsWith("ae") || c.getName().toLowerCase().endsWith("annotator") + || c.getPackage().getName().endsWith("consumer") || c.getName().toLowerCase().endsWith("consumer") || c.getName().toLowerCase().endsWith("writer") + || c.getPackage().getName().endsWith("multiplier") || c.getName().toLowerCase().endsWith("multiplier")) + ).collect(toList()); if (readers.isEmpty() && aes.isEmpty()) { log.warn("No JCoRe UIMA component classes were found."); diff --git a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/ae/testae/desc/de.julielab.jcore.ae.testae.TestAE.xml b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/ae/testae/desc/de.julielab.jcore.ae.testae.TestAE.xml index 558a62b57..3cf0a3a39 100644 --- a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/ae/testae/desc/de.julielab.jcore.ae.testae.TestAE.xml +++ b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/ae/testae/desc/de.julielab.jcore.ae.testae.TestAE.xml @@ -1,4 +1,4 @@ - + org.apache.uima.java true @@ -6,41 +6,41 @@ de.julielab.jcore.ae.testae.TestAE Descriptor automatically generated by uimaFIT - 2.6.0-SNAPSHOT + unknown de.julielab.jcore.ae.testae - - + + - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + - - + + true true diff --git a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/consumer/testconsumer/desc/de.julielab.jcore.consumer.testconsumer.Testconsumer.xml 
b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/consumer/testconsumer/desc/de.julielab.jcore.consumer.testconsumer.Testconsumer.xml index 3bf9a16c1..cf47fdd0f 100644 --- a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/consumer/testconsumer/desc/de.julielab.jcore.consumer.testconsumer.Testconsumer.xml +++ b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/consumer/testconsumer/desc/de.julielab.jcore.consumer.testconsumer.Testconsumer.xml @@ -1,4 +1,4 @@ - + org.apache.uima.java true @@ -6,41 +6,41 @@ de.julielab.jcore.consumer.testconsumer.Testconsumer Descriptor automatically generated by uimaFIT - 2.6.0-SNAPSHOT + unknown de.julielab.jcore.consumer.testconsumer - - + + - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + - - + + true true diff --git a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/multiplier/testmultiplier/desc/de.julielab.jcore.multiplier.testmultiplier.TestMultiplier.xml b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/multiplier/testmultiplier/desc/de.julielab.jcore.multiplier.testmultiplier.TestMultiplier.xml index 8ef78db33..703b7b436 100644 --- a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/multiplier/testmultiplier/desc/de.julielab.jcore.multiplier.testmultiplier.TestMultiplier.xml +++ b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/multiplier/testmultiplier/desc/de.julielab.jcore.multiplier.testmultiplier.TestMultiplier.xml @@ -1,4 +1,4 @@ - + org.apache.uima.java true @@ -6,41 +6,41 @@ de.julielab.jcore.multiplier.testmultiplier.TestMultiplier Descriptor automatically generated by uimaFIT - 2.6.0-SNAPSHOT + unknown de.julielab.jcore.multiplier.testmultiplier - - + + - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + - - + + true true diff --git 
a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/reader/testreader/desc/de.julielab.jcore.reader.testreader.TestReader.xml b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/reader/testreader/desc/de.julielab.jcore.reader.testreader.TestReader.xml index bd482d6ee..24cc9ac66 100644 --- a/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/reader/testreader/desc/de.julielab.jcore.reader.testreader.TestReader.xml +++ b/jcore-descriptor-creator/src/test/resources/de/julielab/jcore/reader/testreader/desc/de.julielab.jcore.reader.testreader.TestReader.xml @@ -1,45 +1,45 @@ - + org.apache.uima.java de.julielab.jcore.reader.testreader.TestReader de.julielab.jcore.reader.testreader.TestReader Descriptor automatically generated by uimaFIT - 2.6.0-SNAPSHOT + unknown de.julielab.jcore.reader.testreader - - + + - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + - - + + true false diff --git a/jcore-elasticsearch-consumer/src/test/resources/onefile-output/thefile-Eriks-MacBook-Air-2.local-1-0.json-Eriks-MacBook-Air-2.local-2-2.json b/jcore-elasticsearch-consumer/src/test/resources/onefile-output/thefile-Eriks-MacBook-Air-2.local-1-0.json-Eriks-MacBook-Air-2.local-2-2.json deleted file mode 100644 index 5a085e8d3..000000000 --- a/jcore-elasticsearch-consumer/src/test/resources/onefile-output/thefile-Eriks-MacBook-Air-2.local-1-0.json-Eriks-MacBook-Air-2.local-2-2.json +++ /dev/null @@ -1,3 +0,0 @@ -{"documentText":"This is one line that should not be interrupted."} -{"documentText":"This is one line that should not be interrupted."} -{"documentText":"This is one line that should not be interrupted."} diff --git a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java index b92b32ad1..bffd2311d 
100644 --- a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java +++ b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java @@ -249,7 +249,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept super.initialize(aContext); outputColumnNamesArray = (String[]) aContext.getConfigParameterValue(PARAM_OUTPUT_COLUMNS); - columnDefinitionDescriptions = (String[]) aContext.getConfigParameterValue(PARAM_COLUMN_DEFINITIONS); + columnDefinitionDescriptions = Optional.ofNullable((String[]) aContext.getConfigParameterValue(PARAM_COLUMN_DEFINITIONS)).orElse(new String[0]); typePrefix = (String) aContext.getConfigParameterValue(PARAM_TYPE_PREFIX); featureFilterDefinitions = (String[]) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_FEATURE_FILTERS)).orElse(new String[0]); diff --git a/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae-configurable-resource.xml b/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae-configurable-resource.xml index e8895177a..16a94eb70 100644 --- a/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae-configurable-resource.xml +++ b/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae-configurable-resource.xml @@ -50,6 +50,9 @@ + + + From af82f3dc5104362fa247fa27351fb12b68421a4d Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 9 Apr 2021 09:35:16 +0200 Subject: [PATCH 043/269] Adding optional plural normalization. Requires PennBioIEPOSTags to be set. 
--- .../chunking/ChunkerProvider.java | 2 + .../chunking/ChunkerProviderImpl.java | 5 + .../chunking/ChunkerProviderImplAlt.java | 17 +- .../ConfigurableChunkerProviderImplAlt.java | 587 +++++++++--------- .../uima/GazetteerAnnotator.java | 64 +- .../utils/StringNormalizerForChunking.java | 25 +- .../StringNormalizerForChunkingTest.java | 12 +- 7 files changed, 360 insertions(+), 352 deletions(-) diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProvider.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProvider.java index 0395da7c8..0e43d4cd4 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProvider.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProvider.java @@ -13,6 +13,8 @@ public interface ChunkerProvider { public boolean getUseApproximateMatching(); public boolean getNormalize(); + + public boolean getNormalizePlural(); public boolean getTransliterate(); diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImpl.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImpl.java index dc5613755..06171ed03 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImpl.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImpl.java @@ -428,6 +428,11 @@ public boolean getNormalize() { return false; } + @Override + public boolean getNormalizePlural() { + return false; + } + @Override public boolean getTransliterate() { return false; diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImplAlt.java 
b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImplAlt.java index 7e3daa924..23f4800d6 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImplAlt.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImplAlt.java @@ -42,6 +42,12 @@ public class ChunkerProviderImplAlt implements ChunkerProvider, SharedResourceOb * switched on in the descriptor for the annotator itself! */ public final static String PARAM_NORMALIZE_TEXT = "NormalizeText"; + /** + * Only in effect when {@link #PARAM_NORMALIZE_TEXT} is set to true. If so, will normalize plurals + * found in the text by removing the training 's'. Requires annotations of the type {@link de.julielab.jcore.types.PennBioIEPOSTag} + * to be present in the CAS. + */ + public static final String PARAM_NORMALIZE_PLURAL = "NormalizePlural"; /** * Parameter to indicate whether text - dictionary entries for this class - should be transliterated, i.e. whether * accents and other character variations should be stripped. 
If this is switched on here, it must also be switched @@ -54,6 +60,7 @@ public class ChunkerProviderImplAlt implements ChunkerProvider, SharedResourceOb private boolean useApproximateMatching; private boolean transliterate; private boolean normalize; + private boolean normalizePlural; private InputStream dictFile; private InputStream stopFile; @@ -71,6 +78,10 @@ public Chunker getChunker() { return dictChunker; } + public boolean getNormalizePlural() { + return normalizePlural; + } + public void load(DataResource resource) throws ResourceInitializationException { LOGGER.info("Loading configuration file from URI \"{}\" (URL: \"{}\").", resource.getUri(), resource.getUrl()); Properties properties = new Properties(); @@ -118,7 +129,11 @@ public void load(DataResource resource) throws ResourceInitializationException { normalize = false; if (normalizeString != null) normalize = new Boolean(normalizeString); - LOGGER.info("Normalize dictionary entries (i.e. completely strip dashes, parenthesis etc): {}", normalize); + LOGGER.info("Normalize dictionary entries and text (i.e. 
completely strip dashes, parenthesis etc): {}", normalize); + + normalizePlural = Boolean.parseBoolean(properties.getProperty(PARAM_NORMALIZE_PLURAL, "false")) && normalize; + if (normalize) + LOGGER.info("Also normalize plural forms to singular: {}", normalizePlural); String transliterateString = properties.getProperty(PARAM_TRANSLITERATE_TEXT); transliterate = false; diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.java index f0ae88711..aa1c07623 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.java @@ -1,4 +1,3 @@ - package de.julielab.jcore.ae.lingpipegazetteer.chunking; import com.aliasi.chunk.Chunker; @@ -21,6 +20,7 @@ import java.io.*; import java.net.URI; import java.util.HashSet; +import java.util.Optional; import java.util.Set; import java.util.zip.GZIPInputStream; @@ -29,317 +29,328 @@ * Also, this implementation expects a configurableDataResourceSpecifier for the external resource, * specifying the dictionary directly and providing the parameters via the normal UIMA resource meta data * mechanism. 
- * + * * @author faessler - * */ public class ConfigurableChunkerProviderImplAlt implements ChunkerProvider, SharedResourceObject { - private static final Logger LOGGER = LoggerFactory.getLogger(ConfigurableChunkerProviderImplAlt.class); - public final static String PARAM_USE_APPROXIMATE_MATCHING = "UseApproximateMatching"; - public final static String PARAM_CASE_SENSITIVE = "CaseSensitive"; - public final static String PARAM_MAKE_VARIANTS = "MakeVariants"; - public final static String PARAM_STOPWORD_FILE = "StopWordFile"; - /** - * Parameter to indicate whether text - dictionary entries for this class - should be normalized by completely - * removing dashes, parenthesis, genitive 's and perhaps more. This is meant to replace the generation of term - * variants and cannot be used together with variation generation. If this is switched on here, it must also be - * switched on in the descriptor for the annotator itself! - */ - public final static String PARAM_NORMALIZE_TEXT = "NormalizeText"; - /** - * Parameter to indicate whether text - dictionary entries for this class - should be transliterated, i.e. whether - * accents and other character variations should be stripped. If this is switched on here, it must also be switched - * on in the descriptor of the annotator itself! 
- */ - public final static String PARAM_TRANSLITERATE_TEXT = "TransliterateText"; - - private Boolean generateVariants; - private Boolean caseSensitive; - private Boolean useApproximateMatching; - private Boolean transliterate; - private Boolean normalize; - private InputStream dictFile; - private InputStream stopFile; - - private AbstractDictionary dict; - private Chunker dictChunker = null; - private final double CHUNK_SCORE = 1.0; - - private final int MIN_TERM_LENGTH = 3; - private final double APPROX_MATCH_THRESHOLD_SCORE = 100; - private Set stopWords = new HashSet(); - private String stopwordFilePath; + public final static String PARAM_USE_APPROXIMATE_MATCHING = "UseApproximateMatching"; + public final static String PARAM_CASE_SENSITIVE = "CaseSensitive"; + public final static String PARAM_MAKE_VARIANTS = "MakeVariants"; + public final static String PARAM_STOPWORD_FILE = "StopWordFile"; + /** + * Parameter to indicate whether text - dictionary entries for this class - should be normalized by completely + * removing dashes, parenthesis, genitive 's and perhaps more. This is meant to replace the generation of term + * variants and cannot be used together with variation generation. If this is switched on here, it must also be + * switched on in the descriptor for the annotator itself! + */ + public final static String PARAM_NORMALIZE_TEXT = "NormalizeText"; + /** + * Only in effect when {@link #PARAM_NORMALIZE_TEXT} is set to true. If so, will normalize plurals + * found in the text by removing the training 's'. Requires annotations of the type {@link de.julielab.jcore.types.PennBioIEPOSTag} + * to be present in the CAS. + */ + public static final String PARAM_NORMALIZE_PLURAL = "NormalizePlural"; + /** + * Parameter to indicate whether text - dictionary entries for this class - should be transliterated, i.e. whether + * accents and other character variations should be stripped. 
If this is switched on here, it must also be switched + * on in the descriptor of the annotator itself! + */ + public final static String PARAM_TRANSLITERATE_TEXT = "TransliterateText"; + private static final Logger LOGGER = LoggerFactory.getLogger(ConfigurableChunkerProviderImplAlt.class); + private final double CHUNK_SCORE = 1.0; + private final int MIN_TERM_LENGTH = 3; + private final double APPROX_MATCH_THRESHOLD_SCORE = 100; + private Boolean generateVariants; + private Boolean caseSensitive; + private Boolean useApproximateMatching; + private Boolean transliterate; + private Boolean normalize; + private Boolean normalizePlural; + private InputStream dictFile; + private InputStream stopFile; + private AbstractDictionary dict; + private Chunker dictChunker = null; + private Set stopWords = new HashSet(); + private String stopwordFilePath; private URI resourceUri; public Chunker getChunker() { - return dictChunker; - } + return dictChunker; + } - public void load(DataResource resource) throws ResourceInitializationException { + public void load(DataResource resource) throws ResourceInitializationException { resourceUri = resource.getUri(); LOGGER.info("Creating dictionary chunker with dictionary loaded from " + resourceUri); - ConfigurationParameterSettings settings = resource.getMetaData().getConfigurationParameterSettings(); - stopwordFilePath = (String) settings.getParameterValue(PARAM_STOPWORD_FILE); - if (stopwordFilePath == null) - throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT, - new Object[] { PARAM_STOPWORD_FILE }); + ConfigurationParameterSettings settings = resource.getMetaData().getConfigurationParameterSettings(); + stopwordFilePath = (String) settings.getParameterValue(PARAM_STOPWORD_FILE); + if (stopwordFilePath == null) + throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT, + new Object[]{PARAM_STOPWORD_FILE}); - generateVariants = (Boolean) 
settings.getParameterValue(PARAM_MAKE_VARIANTS); - LOGGER.info("Generate variants: {}", generateVariants); + generateVariants = (Boolean) settings.getParameterValue(PARAM_MAKE_VARIANTS); + LOGGER.info("Generate variants: {}", generateVariants); - normalize = (Boolean) settings.getParameterValue(PARAM_NORMALIZE_TEXT); - LOGGER.info("Normalize dictionary entries (i.e. completely strip dashes, parenthesis etc): {}", normalize); + normalize = (Boolean) settings.getParameterValue(PARAM_NORMALIZE_TEXT); + LOGGER.info("Normalize dictionary entries (i.e. completely strip dashes, parenthesis etc): {}", normalize); + normalizePlural = Optional.ofNullable((Boolean) settings.getParameterValue(PARAM_NORMALIZE_PLURAL)).orElse(false) && normalize; + if (normalize) + LOGGER.info("Also normalize plural forms to singular: {}", normalizePlural); - transliterate = (Boolean) settings.getParameterValue(PARAM_TRANSLITERATE_TEXT); - LOGGER.info("Transliterate dictionary entries (i.e. transform accented characters to their base forms): {}", - transliterate); + transliterate = (Boolean) settings.getParameterValue(PARAM_TRANSLITERATE_TEXT); + LOGGER.info("Transliterate dictionary entries (i.e. 
transform accented characters to their base forms): {}", + transliterate); - caseSensitive = (Boolean) settings.getParameterValue(PARAM_CASE_SENSITIVE); - LOGGER.info("Case sensitive: {}", caseSensitive); + caseSensitive = (Boolean) settings.getParameterValue(PARAM_CASE_SENSITIVE); + LOGGER.info("Case sensitive: {}", caseSensitive); - useApproximateMatching = (Boolean) settings.getParameterValue(PARAM_USE_APPROXIMATE_MATCHING); - LOGGER.info("Use approximate matching: {}", useApproximateMatching); + useApproximateMatching = (Boolean) settings.getParameterValue(PARAM_USE_APPROXIMATE_MATCHING); + LOGGER.info("Use approximate matching: {}", useApproximateMatching); - if (normalize && generateVariants) - throw new ResourceInitializationException( - new IllegalStateException( - "MakeVariants and NormalizeText are both activated which is invalid. The two options work towards the same goal in two different ways, i.e. to recognize dictionary entry variants not given explicitly. However, the approaches are not compatible and you have to choose a single one.")); + if (normalize && generateVariants) + throw new ResourceInitializationException( + new IllegalStateException( + "MakeVariants and NormalizeText are both activated which is invalid. The two options work towards the same goal in two different ways, i.e. to recognize dictionary entry variants not given explicitly. 
However, the approaches are not compatible and you have to choose a single one.")); - try { + try { try { dictFile = UriUtilities.getInputStreamFromUri(resource.getUri()); } catch (Exception e) { LOGGER.error("Could not load the dictionary from {}, see the following exception for details.", resource.getUri()); throw e; } - stopFile = readStreamFromFileSystemOrClassPath(stopwordFilePath); - initStopWords(stopFile); - readDictionary(dictFile); - - LOGGER.info("Now creating chunker."); - long time = System.currentTimeMillis(); - if (useApproximateMatching) { - final Set charsToDelete = new HashSet<>(); - charsToDelete.add('-'); - // charsToDelete.add('+'); - // charsToDelete.add(','); - // charsToDelete.add('.'); - // charsToDelete.add(':'); - // charsToDelete.add(';'); - // charsToDelete.add('?'); - // charsToDelete.add('!'); - // charsToDelete.add('*'); - // charsToDelete.add('§'); - // charsToDelete.add('$'); - // charsToDelete.add('%'); - // charsToDelete.add('&'); - // charsToDelete.add('/'); - // charsToDelete.add('\\'); - // charsToDelete.add('('); - // charsToDelete.add(')'); - // charsToDelete.add('<'); - // charsToDelete.add('>'); - // charsToDelete.add('['); - // charsToDelete.add(']'); - // charsToDelete.add('='); - // charsToDelete.add('\''); - // charsToDelete.add('`'); - // charsToDelete.add('´'); - // charsToDelete.add('"'); - // charsToDelete.add('#'); - - WeightedEditDistance editDistance = ApproxDictionaryChunker.TT_DISTANCE; - editDistance = new WeightedEditDistance() { - - @Override - public double deleteWeight(char cDeleted) { - double ret; - if (cDeleted == '-') - ret = -5.0; - else if (cDeleted == ' ' || charsToDelete.contains(cDeleted)) - ret = -10.0; - else - ret = -110.0; - return ret; - } - - @Override - public double insertWeight(char cInserted) { - return deleteWeight(cInserted); - } - - @Override - public double matchWeight(char cMatched) { - return 0.0; - } - - @Override - public double substituteWeight(char cDeleted, char cInserted) { 
- if (cDeleted == ' ' && cInserted == '-') - return -2.0; - if (cDeleted == '-' && cInserted == ' ') - return -2.0; - if (cDeleted == ' ' && charsToDelete.contains(cInserted)) - return -10.0; - if (charsToDelete.contains(cDeleted) && cInserted == ' ') - return -10.0; - return -110.0; - } - - @Override - public double transposeWeight(char c1, char c2) { - return Double.NEGATIVE_INFINITY; - } - }; - - dictChunker = - new ApproxDictionaryChunker((TrieDictionary) dict, - IndoEuropeanTokenizerFactory.INSTANCE, editDistance, APPROX_MATCH_THRESHOLD_SCORE); - } else { - dictChunker = - new ExactDictionaryChunker(dict, IndoEuropeanTokenizerFactory.INSTANCE, false, caseSensitive); - } - time = System.currentTimeMillis() - time; - LOGGER.info("Building the actual chunker from the dictionary took {}ms ({}s).", time, time / 1000); - - } catch (Exception e) { - LOGGER.error("Exception while creating chunker instance from dictionary file {} with stopwords from {}", resource.getUri(), stopwordFilePath, e); - } - } - - private void readDictionary(InputStream dictFileStream) throws IOException, AnalysisEngineProcessException { - long time = System.currentTimeMillis(); - if (useApproximateMatching) { - dict = new TrieDictionary(); - } else { - dict = new MapDictionary(); - } - // now read from file and add entries - LOGGER.info("readDictionary() - adding entries from " + resourceUri.toString() + " to dictionary..."); - BufferedReader bf = null; - try { - bf = new BufferedReader(new InputStreamReader(dictFileStream)); - String line = ""; - - Transliterator transliterator = null; - if (transliterate) - transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower"); - - TokenizerFactory tokenizerFactory = null; - if (normalize) - tokenizerFactory = new IndoEuropeanTokenizerFactory(); - while ((line = bf.readLine()) != null) { - String[] values = line.split("\t"); - if (values.length != 2) { - LOGGER.error("readDictionary() - wrong format of line: " + line); - 
throw new AnalysisEngineProcessException(AnalysisEngineProcessException.ANNOTATOR_EXCEPTION, null); - } - - String term = values[0].trim(); - - if (stopWords.contains(term.toLowerCase())) - continue; - - if (normalize) { - term = StringNormalizerForChunking.normalizeString(term, tokenizerFactory).string; - } - if (transliterate) - term = transliterator.transform(term); - if (useApproximateMatching && !caseSensitive && !transliterate) - term = term.toLowerCase(); - - String label = values[1].trim(); - if (term.length() < MIN_TERM_LENGTH) - continue; - - if (generateVariants) { - if (true) - throw new NotImplementedException( - "In this alternative ChunkerProvider, generating variants will currently fail to adequately filter out stop words due to the transliteration and/or normalization algorithms. If you don't need those algorithms, just stick to the original ChunkerProviderImpl. Otherwise, this issue must be fixed (shouldnt be too difficult). Variants are also currently not treated with normalization/transliteration (but this is deemed to be two alternative ways to achieve a similar thing anyway)"); - } else { - // This is a second stop-word-check but here the term has been transliterated and/or normalized. If - // somehow the result of this was a stop word, ignore it. 
- if (!stopWords.contains(term.toLowerCase())) - dict.addEntry(new DictionaryEntry(term, label, CHUNK_SCORE)); - } - } - - time = System.currentTimeMillis() - time; - LOGGER.info("Reading dictionary took {}ms ({}s)", time, time / 1000); - } finally { - if (null != bf) - bf.close(); - } - } - - private void initStopWords(InputStream stopFileStream) throws IOException { - stopWords = new HashSet(); - - LOGGER.info("readDictionary() - adding entries from " + stopwordFilePath + " to dictionary..."); - BufferedReader bf = new BufferedReader(new InputStreamReader(stopFileStream)); - String line = ""; - - try { - while ((line = bf.readLine()) != null) { - if (line.startsWith("#")) { - continue; - } - stopWords.add(line.trim().toLowerCase()); - } - bf.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - @Override - public Set getStopWords() { - return stopWords; - } - - @Override - public boolean getUseApproximateMatching() { - return useApproximateMatching; - } - - @Override - public boolean getNormalize() { - return normalize; - } - - @Override - public boolean getTransliterate() { - return transliterate; - } - - @Override - public boolean getCaseSensitive() { - return caseSensitive; - - } - - private InputStream readStreamFromFileSystemOrClassPath(String filePath) { - InputStream is = null; - File file = new File(filePath); - if (file.exists()) { - try { - is = new FileInputStream(file); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } - } else { - is = getClass().getResourceAsStream(filePath.startsWith("/") ? 
filePath : "/" + filePath); - } - if (filePath.endsWith(".gz") || filePath.endsWith(".gzip")) - try { - is = new GZIPInputStream(is); - } catch (IOException e) { - e.printStackTrace(); - } - return is; - } + stopFile = readStreamFromFileSystemOrClassPath(stopwordFilePath); + initStopWords(stopFile); + readDictionary(dictFile); + + LOGGER.info("Now creating chunker."); + long time = System.currentTimeMillis(); + if (useApproximateMatching) { + final Set charsToDelete = new HashSet<>(); + charsToDelete.add('-'); + // charsToDelete.add('+'); + // charsToDelete.add(','); + // charsToDelete.add('.'); + // charsToDelete.add(':'); + // charsToDelete.add(';'); + // charsToDelete.add('?'); + // charsToDelete.add('!'); + // charsToDelete.add('*'); + // charsToDelete.add('§'); + // charsToDelete.add('$'); + // charsToDelete.add('%'); + // charsToDelete.add('&'); + // charsToDelete.add('/'); + // charsToDelete.add('\\'); + // charsToDelete.add('('); + // charsToDelete.add(')'); + // charsToDelete.add('<'); + // charsToDelete.add('>'); + // charsToDelete.add('['); + // charsToDelete.add(']'); + // charsToDelete.add('='); + // charsToDelete.add('\''); + // charsToDelete.add('`'); + // charsToDelete.add('´'); + // charsToDelete.add('"'); + // charsToDelete.add('#'); + + WeightedEditDistance editDistance = ApproxDictionaryChunker.TT_DISTANCE; + editDistance = new WeightedEditDistance() { + + @Override + public double deleteWeight(char cDeleted) { + double ret; + if (cDeleted == '-') + ret = -5.0; + else if (cDeleted == ' ' || charsToDelete.contains(cDeleted)) + ret = -10.0; + else + ret = -110.0; + return ret; + } + + @Override + public double insertWeight(char cInserted) { + return deleteWeight(cInserted); + } + + @Override + public double matchWeight(char cMatched) { + return 0.0; + } + + @Override + public double substituteWeight(char cDeleted, char cInserted) { + if (cDeleted == ' ' && cInserted == '-') + return -2.0; + if (cDeleted == '-' && cInserted == ' ') + return -2.0; + 
if (cDeleted == ' ' && charsToDelete.contains(cInserted)) + return -10.0; + if (charsToDelete.contains(cDeleted) && cInserted == ' ') + return -10.0; + return -110.0; + } + + @Override + public double transposeWeight(char c1, char c2) { + return Double.NEGATIVE_INFINITY; + } + }; + + dictChunker = + new ApproxDictionaryChunker((TrieDictionary) dict, + IndoEuropeanTokenizerFactory.INSTANCE, editDistance, APPROX_MATCH_THRESHOLD_SCORE); + } else { + dictChunker = + new ExactDictionaryChunker(dict, IndoEuropeanTokenizerFactory.INSTANCE, false, caseSensitive); + } + time = System.currentTimeMillis() - time; + LOGGER.info("Building the actual chunker from the dictionary took {}ms ({}s).", time, time / 1000); + + } catch (Exception e) { + LOGGER.error("Exception while creating chunker instance from dictionary file {} with stopwords from {}", resource.getUri(), stopwordFilePath, e); + } + } + + private void readDictionary(InputStream dictFileStream) throws IOException, AnalysisEngineProcessException { + long time = System.currentTimeMillis(); + if (useApproximateMatching) { + dict = new TrieDictionary(); + } else { + dict = new MapDictionary(); + } + // now read from file and add entries + LOGGER.info("readDictionary() - adding entries from " + resourceUri.toString() + " to dictionary..."); + BufferedReader bf = null; + try { + bf = new BufferedReader(new InputStreamReader(dictFileStream)); + String line = ""; + + Transliterator transliterator = null; + if (transliterate) + transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower"); + + TokenizerFactory tokenizerFactory = null; + if (normalize) + tokenizerFactory = new IndoEuropeanTokenizerFactory(); + while ((line = bf.readLine()) != null) { + String[] values = line.split("\t"); + if (values.length != 2) { + LOGGER.error("readDictionary() - wrong format of line: " + line); + throw new AnalysisEngineProcessException(AnalysisEngineProcessException.ANNOTATOR_EXCEPTION, null); + } + + String 
term = values[0].trim(); + + if (stopWords.contains(term.toLowerCase())) + continue; + + if (normalize) { + term = StringNormalizerForChunking.normalizeString(term, tokenizerFactory).string; + } + if (transliterate) + term = transliterator.transform(term); + if (useApproximateMatching && !caseSensitive && !transliterate) + term = term.toLowerCase(); + + String label = values[1].trim(); + if (term.length() < MIN_TERM_LENGTH) + continue; + + if (generateVariants) { + if (true) + throw new NotImplementedException( + "In this alternative ChunkerProvider, generating variants will currently fail to adequately filter out stop words due to the transliteration and/or normalization algorithms. If you don't need those algorithms, just stick to the original ChunkerProviderImpl. Otherwise, this issue must be fixed (shouldnt be too difficult). Variants are also currently not treated with normalization/transliteration (but this is deemed to be two alternative ways to achieve a similar thing anyway)"); + } else { + // This is a second stop-word-check but here the term has been transliterated and/or normalized. If + // somehow the result of this was a stop word, ignore it. 
+ if (!stopWords.contains(term.toLowerCase())) + dict.addEntry(new DictionaryEntry(term, label, CHUNK_SCORE)); + } + } + + time = System.currentTimeMillis() - time; + LOGGER.info("Reading dictionary took {}ms ({}s)", time, time / 1000); + } finally { + if (null != bf) + bf.close(); + } + } + + private void initStopWords(InputStream stopFileStream) throws IOException { + stopWords = new HashSet(); + + LOGGER.info("readDictionary() - adding entries from " + stopwordFilePath + " to dictionary..."); + BufferedReader bf = new BufferedReader(new InputStreamReader(stopFileStream)); + String line = ""; + + try { + while ((line = bf.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + stopWords.add(line.trim().toLowerCase()); + } + bf.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + @Override + public Set getStopWords() { + return stopWords; + } + + @Override + public boolean getUseApproximateMatching() { + return useApproximateMatching; + } + + @Override + public boolean getNormalize() { + return normalize; + } + + @Override + public boolean getNormalizePlural() { + return normalizePlural; + } + + @Override + public boolean getTransliterate() { + return transliterate; + } + + @Override + public boolean getCaseSensitive() { + return caseSensitive; + + } + + private InputStream readStreamFromFileSystemOrClassPath(String filePath) { + InputStream is = null; + File file = new File(filePath); + if (file.exists()) { + try { + is = new FileInputStream(file); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + } else { + is = getClass().getResourceAsStream(filePath.startsWith("/") ? 
filePath : "/" + filePath); + } + if (filePath.endsWith(".gz") || filePath.endsWith(".gzip")) + try { + is = new GZIPInputStream(is); + } catch (IOException e) { + e.printStackTrace(); + } + return is; + } } diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java index 1a9220007..afec25926 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java @@ -25,6 +25,7 @@ import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory; import com.aliasi.tokenizer.TokenizerFactory; import com.ibm.icu.text.Transliterator; +import de.julielab.java.utilities.spanutils.OffsetSet; import de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider; import de.julielab.jcore.ae.lingpipegazetteer.chunking.OverlappingChunk; import de.julielab.jcore.ae.lingpipegazetteer.utils.StringNormalizerForChunking; @@ -32,12 +33,14 @@ import de.julielab.jcore.types.Abbreviation; import de.julielab.jcore.types.AbbreviationLongform; import de.julielab.jcore.types.ConceptMention; +import de.julielab.jcore.types.PennBioIEPOSTag; import de.julielab.jcore.types.mantra.Entity; import de.julielab.jcore.utility.JCoReAnnotationTools; import de.julielab.jcore.utility.index.IndexTermGenerator; import de.julielab.jcore.utility.index.JCoReHashMapAnnotationIndex; import de.julielab.jcore.utility.index.TermGenerators; import de.julielab.jcore.utility.index.TermGenerators.LongOffsetIndexTermGenerator; +import org.apache.commons.lang3.Range; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; @@ -53,6 +56,8 @@ import org.slf4j.LoggerFactory; import 
java.util.*; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; public class GazetteerAnnotator extends JCasAnnotator_ImplBase { @@ -213,52 +218,6 @@ public int compare(Chunk o1, Chunk o2) { return overlappingChunks; } - // enum ParenthesesType { - // ROUND_CLOSED { - // @Override - // boolean isOpen() { - // return false; - // } - // - // }, - // BRACKET_CLOSED { - // @Override - // boolean isOpen() { - // return false; - // } - // }, - // CURLY_CLOSED { - // @Override - // boolean isOpen() { - // return false; - // } - // - // }, - // ROUND_OPENED { - // @Override - // boolean isOpen() { - // return true; - // } - // }, - // BRACKET_OPENED { - // @Override - // boolean isOpen() { - // return true; - // } - // }, - // CURLY_OPENED { - // @Override - // boolean isOpen() { - // return true; - // } - // }; - // abstract boolean isOpen(); - // - // boolean isClose() { - // return !isOpen(); - // }; - // } - public void initialize(UimaContext aContext) throws ResourceInitializationException { LOGGER.info("calls to initialize: " + initializeCount); @@ -314,8 +273,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept checkAcronyms); // filter stop words - Boolean normalizeBoolean = provider.getNormalize();// (Boolean) - // aContext.getConfigParameterValue(PARAM_NORMALIZE_TEXT); + Boolean normalizeBoolean = provider.getNormalize(); if (normalizeBoolean) { normalizationTokenFactory = new IndoEuropeanTokenizerFactory(); } @@ -357,8 +315,14 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { docText = transliterator.transform(docText); NormalizedString normalizedDocText = null; if (provider.getNormalize()) { - normalizedDocText = StringNormalizerForChunking.normalizeString(docText, normalizationTokenFactory, - transliterator); + boolean hasPosTags = aJCas.getAnnotationIndex(PennBioIEPOSTag.type).iterator().hasNext(); + if (provider.getNormalizePlural()) { + OffsetSet pluralOffsets = 
StreamSupport.stream(Spliterators.spliterator(aJCas.getAnnotationIndex(PennBioIEPOSTag.type).iterator(), 0, 0), false).filter(tag -> tag.getValue().equals("NNS")).map(tag -> Range.between(tag.getBegin(), tag.getEnd())).collect(Collectors.toCollection(OffsetSet::new)); + normalizedDocText = StringNormalizerForChunking.normalizeString(docText, normalizationTokenFactory, true, pluralOffsets, transliterator); + }else { + normalizedDocText = StringNormalizerForChunking.normalizeString(docText, normalizationTokenFactory, + transliterator); + } } IndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator(); diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java index e51c41eb9..a081858fd 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java @@ -1,8 +1,11 @@ package de.julielab.jcore.ae.lingpipegazetteer.utils; +import com.aliasi.tokenizer.PorterStemmerTokenizerFactory; import com.aliasi.tokenizer.Tokenizer; import com.aliasi.tokenizer.TokenizerFactory; import com.ibm.icu.text.Transliterator; +import de.julielab.java.utilities.spanutils.OffsetSet; +import org.apache.commons.lang3.Range; import java.util.*; @@ -88,10 +91,10 @@ public static NormalizedString normalizeString(String str) { * @param tokenizerFactory * @return */ - public static NormalizedString normalizeString(String str, TokenizerFactory tokenizerFactory, + public static NormalizedString normalizeString(String str, TokenizerFactory tokenizerFactory, boolean normalizePlural, OffsetSet pluralPositions, Transliterator transliterator) { - // boolean stemming = tokenizerFactory instanceof - // 
PorterStemmerTokenizerFactory; + boolean stemming = tokenizerFactory instanceof + PorterStemmerTokenizerFactory; NormalizedString ns = new NormalizedString(); @@ -141,8 +144,10 @@ public static NormalizedString normalizeString(String str, TokenizerFactory toke if (transliterator != null) token = transliterator.transform(token); // plural s, only when no stemming is done - // if (!stemming && token.endsWith("s")) - // token = token.substring(0, token.length() - 1); + // an even better normalization would be to use the lemma, of course + Range tokenOffsets = Range.between(tokenizer.lastTokenStartPosition(), tokenizer.lastTokenEndPosition()); + if (normalizePlural && !stemming && token.endsWith("s") && pluralPositions.locate(tokenOffsets).isOverlappedBy(tokenOffsets)) + token = token.substring(0, token.length() - 1); sb.append(token); int newStartOffset = sb.length() - token.length(); int newEndOffset = sb.length(); @@ -162,8 +167,16 @@ private static int sumOfStack(Deque stack) { return sum; } + public static NormalizedString normalizeString(String str, TokenizerFactory tokenizerFactory, Transliterator transliterator) { + return normalizeString(str, tokenizerFactory, false, null, transliterator); + } + public static NormalizedString normalizeString(String str, TokenizerFactory tokenizerFactory) { - return normalizeString(str, tokenizerFactory, null); + return normalizeString(str, tokenizerFactory, false, null, null); + } + + public static NormalizedString normalizeString(String str, boolean normalizePlural, OffsetSet pluralPositions, TokenizerFactory tokenizerFactory) { + return normalizeString(str, tokenizerFactory, normalizePlural, pluralPositions, null); } public enum Mode { diff --git a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/StringNormalizerForChunkingTest.java b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/StringNormalizerForChunkingTest.java index fef412a2e..a1bbadf8c 
100644 --- a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/StringNormalizerForChunkingTest.java +++ b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/StringNormalizerForChunkingTest.java @@ -5,11 +5,14 @@ import com.aliasi.tokenizer.PorterStemmerTokenizerFactory; import com.aliasi.tokenizer.TokenizerFactory; import com.ibm.icu.text.Transliterator; +import de.julielab.java.utilities.spanutils.OffsetSet; import de.julielab.jcore.ae.lingpipegazetteer.utils.StringNormalizerForChunking; import de.julielab.jcore.ae.lingpipegazetteer.utils.StringNormalizerForChunking.NormalizedString; -import org.junit.Ignore; +import org.apache.commons.lang3.Range; import org.junit.Test; +import java.util.List; + import static org.junit.Assert.*; public class StringNormalizerForChunkingTest { @@ -154,16 +157,11 @@ public void testNewlines() { } @Test - @Ignore - /** - * Ignored because the plural ignore introduced too much errors on test data - * so it was removed from the algorithm. - */ public void testNormalizePlural() { String str; str = "glutathione transferases are evil"; TokenizerFactory tokenizerFactory = new IndoEuropeanTokenizerFactory(); - NormalizedString ns = StringNormalizerForChunking.normalizeString(str, tokenizerFactory); + NormalizedString ns = StringNormalizerForChunking.normalizeString(str, tokenizerFactory, true, new OffsetSet(List.of(Range.between(12, 24))), null); assertEquals("glutathione transferase are evil", ns.string); } } From 1e3472eab5a0c02c73b70fe0f7cf3c5d10ab29e3 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 9 Apr 2021 09:43:37 +0200 Subject: [PATCH 044/269] The stop words given to the chunker provider are now also used by the annotator to filter chunks (this was previously a hard-coded list of words). 
--- .../uima/GazetteerAnnotator.java | 70 ++-- .../resources/normalizegazetteer.properties | 3 +- .../test/resources/reducedStopWordList.txt | 320 ++++++++++++++++++ 3 files changed, 357 insertions(+), 36 deletions(-) create mode 100644 jcore-lingpipegazetteer-ae/src/test/resources/reducedStopWordList.txt diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java index afec25926..539c0a918 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java @@ -227,41 +227,41 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept try { provider = (ChunkerProvider) getContext().getResourceObject(CHUNKER_RESOURCE_NAME); gazetteer = provider.getChunker(); -// stopWords = provider.getStopWords(); - String[] stopwordArray = {"a", "about", "above", "across", "after", "afterwards", "again", "against", - "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", - "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", - "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", - "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", - "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", - "cant", "co", "computer", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", - "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", - "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", - "few", "fifteen", "fify", "fill", 
"find", "fire", "first", "five", "for", "former", "formerly", - "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", - "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", - "herself", "high", "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in", - "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", - "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", - "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", - "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", - "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", - "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", - "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", - "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", - "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", - "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", - "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", - "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though", "three", - "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", - "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", - "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", - "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", - "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", 
- "you", "your", "yours", "yourself", "yourselves",}; - stopWords = new HashSet<>(); - for (String sw : stopwordArray) - stopWords.add(sw); + stopWords = provider.getStopWords(); +// String[] stopwordArray = {"a", "about", "above", "across", "after", "afterwards", "again", "against", +// "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", +// "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", +// "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", +// "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", +// "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", +// "cant", "co", "computer", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", +// "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", +// "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", +// "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", +// "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", +// "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", +// "herself", "high", "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in", +// "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", +// "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", +// "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", +// "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", +// "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", +// "onto", "or", 
"other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", +// "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", +// "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", +// "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", +// "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", +// "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", +// "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though", "three", +// "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", +// "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", +// "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", +// "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", +// "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", +// "you", "your", "yours", "yourself", "yourselves",}; +// stopWords = new HashSet<>(); +// for (String sw : stopwordArray) +// stopWords.add(sw); } catch (ResourceAccessException e) { LOGGER.error("Exception while initializing", e); } diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/normalizegazetteer.properties b/jcore-lingpipegazetteer-ae/src/test/resources/normalizegazetteer.properties index 88c7883d4..91ac661e7 100644 --- a/jcore-lingpipegazetteer-ae/src/test/resources/normalizegazetteer.properties +++ b/jcore-lingpipegazetteer-ae/src/test/resources/normalizegazetteer.properties @@ -1,5 +1,6 @@ DictionaryFile=src/test/resources/dictionary.tst -StopWordFile=src/test/resources/general_english_words +#StopWordFile=src/test/resources/general_english_words +StopWordFile=src/test/resources/reducedStopWordList.txt 
NormalizeText=true UseApproximateMatching=true MakeVariants=false diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/reducedStopWordList.txt b/jcore-lingpipegazetteer-ae/src/test/resources/reducedStopWordList.txt new file mode 100644 index 000000000..b0385b7e1 --- /dev/null +++ b/jcore-lingpipegazetteer-ae/src/test/resources/reducedStopWordList.txt @@ -0,0 +1,320 @@ +about +above +across +after +afterwards +again +against +almost +alone +along +already +also +although +always +am +among +amoungst +amount +an +and +another +any +anyhow +anyone +anything +anywhere +are +around +as +at +back +be +became +because +become +becoming +been +before +beforehand +behind +being +below +beside +between +beyond +bill +both +bottom +but +by +call +can +cannot +co +computer +con +could +couldnt +cry +de +describe +detail +do +down +due +during +each +eg +eight +either +eleven +else +elsewhere +enough +etc +even +ever +every +everyone +everything +everywhere +except +fifteen +fify +fill +find +fire +first +five +for +former +formerly +found +four +from +front +full +further +get +give +go +had +has +have +he +hence +her +here +hereafter +hereby +herein +hereupon +hers +high +him +himself +his +how +however +hundred +i +ie +if +in +indeed +interest +into +is +it +its +itself +keep +last +latter +least +less +ltd +made +many +may +me +meanwhile +might +mill +more +moreover +most +mostly +move +much +must +my +myself +name +neither +never +nevertheless +next +nine +no +nobody +none +noone +not +nothing +now +nowhere +of +off +often +on +once +one +only +or +other +others +otherwise +our +ours +ourselves +out +over +own +per +perhaps +please +put +rather +re +same +see +seem +seemed +seems +serious +several +she +should +show +side +since +sincere +sixty +so +some +somehow +someone +something +sometime +sometimes +still +such +system +take +ten +than +that +the +their +them +then +thence +there +thereafter +thereby +therefore +therein +these +they +thick +thin +third +this 
+those +though +three +throughout +thru +thus +to +together +too +top +toward +towards +twenty +two +un +under +until +up +upon +us +very +via +was +we +were +what +whatever +when +whence +whenever +where +whereafter +whereas +wherein +whereupon +wherever +whether +which +while +whither +who +whole +whom +whose +why +will +with +within +without +would +yet +your +yours +yourself +yourselves +a +all +amongst +anyway +becomes +besides +cant +done +empty +few +forty +hasnt +herself +inc +latterly +mine +namely +noer +onto +part +seeming +six +somewhere +themselves +thereupon +through +twelve +well +whereby +whoever +you From 5b9aef6cf2b10dca5dcd17dec285686dd25a263f Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 9 Apr 2021 10:17:44 +0200 Subject: [PATCH 045/269] Travis build hangs after the tests. Removing multithreading to check if that's the cause. --- .travis.yml | 2 +- .../uima/GazetteerAnnotatorTest.java | 28 +++++++++++++++++-- .../src/test/resources/normalizePlural.dict | 1 + .../normalizepluralgazetteer.properties | 9 ++++++ 4 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 jcore-lingpipegazetteer-ae/src/test/resources/normalizePlural.dict create mode 100644 jcore-lingpipegazetteer-ae/src/test/resources/normalizepluralgazetteer.properties diff --git a/.travis.yml b/.travis.yml index 172756b0e..3b3b4c4e0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,7 +35,7 @@ before_install: - #./travis-deployment/install-flair-nightly.sh - export BOTO_CONFIG=/dev/null install: mvn install -DskipTests=true -Dmaven.javadoc.skip=true -B -V -script: mvn -T 1C test -B +script: mvn test -B cache: directories: diff --git a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java index 7134ae3e7..556b4f0ee 100644 --- 
a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java +++ b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java @@ -30,6 +30,7 @@ import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.ExternalResourceFactory; import org.apache.uima.fit.factory.TypeSystemDescriptionFactory; +import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.JFSIndexRepository; import org.apache.uima.resource.ExternalResourceDescription; @@ -45,10 +46,8 @@ import org.xml.sax.SAXException; import java.io.*; -import java.util.ArrayList; -import java.util.HashSet; import java.util.List; -import java.util.Set; +import java.util.*; import static java.nio.charset.StandardCharsets.UTF_8; import static org.assertj.core.api.Assertions.assertThat; @@ -359,6 +358,29 @@ public void testAnnotatorWithTextNormalization() } + @Test + public void testAnnotatorWithPluralNormalization() + throws ResourceInitializationException, AnalysisEngineProcessException { + ExternalResourceDescription extDesc = ExternalResourceFactory.createExternalResourceDescription( + ChunkerProviderImplAlt.class, new File("src/test/resources/normalizepluralgazetteer.properties")); + TypeSystemDescription tsDesc = TypeSystemDescriptionFactory + .createTypeSystemDescription("de.julielab.jcore.types.jcore-semantics-mention-types"); + + AnalysisEngine annotator = AnalysisEngineFactory.createEngine(GazetteerAnnotator.class, tsDesc, + GazetteerAnnotator.PARAM_OUTPUT_TYPE, "de.julielab.jcore.types.EntityMention", + GazetteerAnnotator.CHUNKER_RESOURCE_NAME, extDesc); + JCas jCas = annotator.newJCas(); + + jCas.setDocumentText("High-density lipoprotein (HDL) is one of the five major groups of lipoproteins."); + PennBioIEPOSTag tag = new PennBioIEPOSTag(jCas, 74, 86); + tag.setValue("NNS"); + tag.addToIndexes(); + annotator.process(jCas); + + 
Collection entityMentions = JCasUtil.select(jCas, EntityMention.class); + assertEquals("Expected a single entity", 1, entityMentions.size()); + } + @Test public void testAnnotateAcronymsWithFullFormEntity() throws Exception { ExternalResourceDescription extDesc = ExternalResourceFactory.createExternalResourceDescription( diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/normalizePlural.dict b/jcore-lingpipegazetteer-ae/src/test/resources/normalizePlural.dict new file mode 100644 index 000000000..713dbb370 --- /dev/null +++ b/jcore-lingpipegazetteer-ae/src/test/resources/normalizePlural.dict @@ -0,0 +1 @@ +lipoproteins Group diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/normalizepluralgazetteer.properties b/jcore-lingpipegazetteer-ae/src/test/resources/normalizepluralgazetteer.properties new file mode 100644 index 000000000..2100ebeaf --- /dev/null +++ b/jcore-lingpipegazetteer-ae/src/test/resources/normalizepluralgazetteer.properties @@ -0,0 +1,9 @@ +DictionaryFile=src/test/resources/dictionary.tst +#StopWordFile=src/test/resources/general_english_words +StopWordFile=src/test/resources/reducedStopWordList.txt +NormalizeText=true +NormalizePlural=true +UseApproximateMatching=true +MakeVariants=false +CaseSensitive=false + From c32844dbc5d0e2f927cea05479afee81f3faf8df Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 9 Apr 2021 10:25:29 +0200 Subject: [PATCH 046/269] Added a test for the plural normalization. 
--- .../lingpipegazetteer/utils/StringNormalizerForChunking.java | 2 +- .../ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java | 5 ++++- .../src/test/resources/normalizePlural.dict | 2 +- .../src/test/resources/normalizepluralgazetteer.properties | 3 +-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java index a081858fd..e1c997196 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java @@ -146,7 +146,7 @@ public static NormalizedString normalizeString(String str, TokenizerFactory toke // plural s, only when no stemming is done // an even better normalization would be to use the lemma, of course Range tokenOffsets = Range.between(tokenizer.lastTokenStartPosition(), tokenizer.lastTokenEndPosition()); - if (normalizePlural && !stemming && token.endsWith("s") && pluralPositions.locate(tokenOffsets).isOverlappedBy(tokenOffsets)) + if (normalizePlural && !stemming && token.endsWith("s") && !pluralPositions.isEmpty() && pluralPositions.locate(tokenOffsets).isOverlappedBy(tokenOffsets)) token = token.substring(0, token.length() - 1); sb.append(token); int newStartOffset = sb.length() - token.length(); diff --git a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java index 556b4f0ee..f7b1a6e8f 100644 --- a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java +++ 
b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotatorTest.java @@ -378,7 +378,10 @@ public void testAnnotatorWithPluralNormalization() annotator.process(jCas); Collection entityMentions = JCasUtil.select(jCas, EntityMention.class); - assertEquals("Expected a single entity", 1, entityMentions.size()); + assertEquals("Expected a single entity", 2, entityMentions.size()); + Iterator iterator = entityMentions.iterator(); + assertEquals("Unexpected covered entity text", "lipoprotein", iterator.next().getCoveredText()); + assertEquals("Unexpected covered entity text", "lipoproteins", iterator.next().getCoveredText()); } @Test diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/normalizePlural.dict b/jcore-lingpipegazetteer-ae/src/test/resources/normalizePlural.dict index 713dbb370..a59e0435f 100644 --- a/jcore-lingpipegazetteer-ae/src/test/resources/normalizePlural.dict +++ b/jcore-lingpipegazetteer-ae/src/test/resources/normalizePlural.dict @@ -1 +1 @@ -lipoproteins Group +lipoprotein Group diff --git a/jcore-lingpipegazetteer-ae/src/test/resources/normalizepluralgazetteer.properties b/jcore-lingpipegazetteer-ae/src/test/resources/normalizepluralgazetteer.properties index 2100ebeaf..025fd2fa7 100644 --- a/jcore-lingpipegazetteer-ae/src/test/resources/normalizepluralgazetteer.properties +++ b/jcore-lingpipegazetteer-ae/src/test/resources/normalizepluralgazetteer.properties @@ -1,5 +1,4 @@ -DictionaryFile=src/test/resources/dictionary.tst -#StopWordFile=src/test/resources/general_english_words +DictionaryFile=src/test/resources/normalizePlural.dict StopWordFile=src/test/resources/reducedStopWordList.txt NormalizeText=true NormalizePlural=true From bbbae04639257793bea92172f6b7c46b7c0333b9 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 9 Apr 2021 10:51:09 +0200 Subject: [PATCH 047/269] The hanging build was due to the flair NER component having an error. 
This was probably because torch 1.7 has issues with python 3.6. So here we bumped python to 3.7. --- .travis.yml | 6 +++--- ...e-lingpipe-gazetteer-ae-configurable-resource.xml | 12 ++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3b3b4c4e0..51738bd09 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,8 +6,8 @@ addons: sources: - deadsnakes packages: - - python3.6 - - python3.6-dev + - python3.7 + - python3.7-dev env: global: @@ -35,7 +35,7 @@ before_install: - #./travis-deployment/install-flair-nightly.sh - export BOTO_CONFIG=/dev/null install: mvn install -DskipTests=true -Dmaven.javadoc.skip=true -B -V -script: mvn test -B +script: mvn -T 2C test -B cache: directories: diff --git a/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae-configurable-resource.xml b/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae-configurable-resource.xml index 16a94eb70..c070abd9e 100644 --- a/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae-configurable-resource.xml +++ b/jcore-lingpipegazetteer-ae/src/main/resources/de/julielab/jcore/ae/lingpipegazetteer/desc/jcore-lingpipe-gazetteer-ae-configurable-resource.xml @@ -108,6 +108,12 @@ false true + + NormalizePlural + Boolean + false + true + TransliterateText Boolean @@ -152,6 +158,12 @@ true + + NormalizePlural + + false + + TransliterateText From 49472f4874c6fb7893014a151248e12c846d98d2 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 9 Apr 2021 10:54:28 +0200 Subject: [PATCH 048/269] Fixing the python version in the PYTHON variable in .travis.yml. 
--- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 51738bd09..f376d31aa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,7 +19,7 @@ env: - # GPG_KEY_NAME - secure: pxYxmA/9xS/9DO6rUAhlbAtYQMmG633jSwG8OIVCnnoQSXS4UILJgNl7Q6dQsAuT27tk+/fin0kXTnxWqCe0URb3c3XgNQwfGAuz1JIYVPHvezoDQLLRQA6LRgqd7GuvBDsyXJvBANozGKJYJVfoeT9gqFosFuMdRZ88eQm+ltX7zVKyMiz2rqKYPoSFInNxDGMOaIQ+RZdf8ai8rLY3E11PxsMC0LgypEDbuC7d9Q+Tu89YfUeuRly0hAuxmW++RrMgeeAs/7BndmZqcHVpkrcX6Drq8nZ2cj0ev4IDJelV/Nd17Vjfg7HgfJ4/d9S+PCg4KhvOY/y9Xad8geIIzXLFD9ZgcaK7MT9+BFGYXj7ExizFSc+Ico5Q822RJA1XZWfc/EgnY+7jEZCCMz/ceHx8oSh0ce1VbPl7c+O+jMXUMQC69Gpys57XC48rdPn0bbjc4/jpSOq46Xv7YdcGuA2BcWEEeQ0WAbi9IDcevpCXiZ7kng5hHTCpfaYVhn63KAIAMKf7tu6C78wFZR63F8Gf4x/jKE37QqvHV3uOzD7ar6nTAuy/ukZK0p4zyeIYe25PnS9K4kpolT1I12i7/l/7MO9NPFdB0aOCBHUNPBEkifwceltX6RP4PDIKdtCEQ4vcqrRNvhtAhO9Vo1udkyaeFx5swbY3j11CjzcfrBE= - # GPG_PASSPHRASE - - PYTHON=/usr/bin/python3.6 + - PYTHON=/usr/bin/python3.7 before_install: - | From 4669d5e2d267307b346fc668d02cf3cebc56e87a Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 9 Apr 2021 11:23:32 +0200 Subject: [PATCH 049/269] The original error with flair seems fixed, still the build hangs again. Removing multithreading once more to find the cause. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f376d31aa..d15ad0a30 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,7 +35,7 @@ before_install: - #./travis-deployment/install-flair-nightly.sh - export BOTO_CONFIG=/dev/null install: mvn install -DskipTests=true -Dmaven.javadoc.skip=true -B -V -script: mvn -T 2C test -B +script: mvn test -B cache: directories: From 943c6cfe941c25f18ef33955bd1b8b348c2dc184 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 9 Apr 2021 11:52:46 +0200 Subject: [PATCH 050/269] Turns out, still had a flair issue because now torch 1.8 was used which changed something with LSTMs. 
Now explicitly installing flair 0.6.1 and torch 1.7.1. --- .travis.yml | 2 +- jcore-flair-ner-ae/README.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d15ad0a30..2a924b36b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -31,7 +31,7 @@ before_install: if ! find "$HOME/pip-cache" -mindepth 1 -print -quit 2>/dev/null | grep -q .; then $PYTHON -m pip download --destination-directory="$HOME/pip-cache" flair fi - sudo -H $PYTHON -m pip install --find-links="$HOME/pip-cache" flair==0.6.1 + sudo -H $PYTHON -m pip install --find-links="$HOME/pip-cache" flair==0.6.1 torch==1.7.1 - #./travis-deployment/install-flair-nightly.sh - export BOTO_CONFIG=/dev/null install: mvn install -DskipTests=true -Dmaven.javadoc.skip=true -B -V diff --git a/jcore-flair-ner-ae/README.md b/jcore-flair-ner-ae/README.md index a06e8a4d7..69d4b0ee0 100644 --- a/jcore-flair-ner-ae/README.md +++ b/jcore-flair-ner-ae/README.md @@ -12,6 +12,8 @@ The python executable lookup works as follows: 2. Otherwise, if the environment variable `PYTHON` is set, this value is used. 3. Otherwise, the `python` command is used. +Tested with flair 0.6.1 and PyTorch 1.7.1. + **1. Parameters** | Parameter Name | Parameter Type | Mandatory | Multivalued | Description | From dcea03bc951f992bfe2c3862ada5732e6a8eb4d9 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 9 Apr 2021 12:21:37 +0200 Subject: [PATCH 051/269] Travis build passed, enabling multithreading again. 
--- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 2a924b36b..bce762cc1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,7 +35,7 @@ before_install: - #./travis-deployment/install-flair-nightly.sh - export BOTO_CONFIG=/dev/null install: mvn install -DskipTests=true -Dmaven.javadoc.skip=true -B -V -script: mvn test -B +script: mvn -T 2C test -B cache: directories: From 9bbba6a38ac40576624c7f19264574c375a66465 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 9 Apr 2021 14:36:37 +0200 Subject: [PATCH 052/269] NPE check --- .../jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java | 1 - .../ae/lingpipegazetteer/utils/StringNormalizerForChunking.java | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java index 539c0a918..e663228b3 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java @@ -315,7 +315,6 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { docText = transliterator.transform(docText); NormalizedString normalizedDocText = null; if (provider.getNormalize()) { - boolean hasPosTags = aJCas.getAnnotationIndex(PennBioIEPOSTag.type).iterator().hasNext(); if (provider.getNormalizePlural()) { OffsetSet pluralOffsets = StreamSupport.stream(Spliterators.spliterator(aJCas.getAnnotationIndex(PennBioIEPOSTag.type).iterator(), 0, 0), false).filter(tag -> tag.getValue().equals("NNS")).map(tag -> Range.between(tag.getBegin(), tag.getEnd())).collect(Collectors.toCollection(OffsetSet::new)); normalizedDocText = StringNormalizerForChunking.normalizeString(docText, 
normalizationTokenFactory, true, pluralOffsets, transliterator); diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java index e1c997196..9e50f845a 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java @@ -146,7 +146,7 @@ public static NormalizedString normalizeString(String str, TokenizerFactory toke // plural s, only when no stemming is done // an even better normalization would be to use the lemma, of course Range tokenOffsets = Range.between(tokenizer.lastTokenStartPosition(), tokenizer.lastTokenEndPosition()); - if (normalizePlural && !stemming && token.endsWith("s") && !pluralPositions.isEmpty() && pluralPositions.locate(tokenOffsets).isOverlappedBy(tokenOffsets)) + if (normalizePlural && !stemming && token.endsWith("s") && pluralPositions != null && !pluralPositions.isEmpty() && pluralPositions.locate(tokenOffsets).isOverlappedBy(tokenOffsets)) token = token.substring(0, token.length() - 1); sb.append(token); int newStartOffset = sb.length() - token.length(); From a41c662a1ea88b6492aebdb94bc52cfa9c73174d Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 4 May 2021 14:31:50 +0200 Subject: [PATCH 053/269] Emergency fix of invalid offsets. Not fixing the actual issue (multi byte encoding) but just avoiding offsets outside of the range of the document text. 
--- .../lingpipegazetteer/uima/GazetteerAnnotator.java | 7 +++++-- .../utils/StringNormalizerForChunking.java | 14 ++++++++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java index e663228b3..35e02f576 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java @@ -532,8 +532,11 @@ private void add2Cas(JCas aJCas, Chunk chunk, NormalizedString normalizedDocText return; } - int start = provider.getNormalize() ? normalizedDocText.getOriginalOffset(chunk.start()) : chunk.start(); - int end = provider.getNormalize() ? normalizedDocText.getOriginalOffset(chunk.end()) : chunk.end(); + // The Math.min(, Math.max(0, )) application is a security measure. I rare cases they are issues with multi + // byte character encodings. This security measure won't correct the underlying error but avoid errors + // due to invalid offsets. + int start = Math.min(aJCas.getDocumentText().length(), Math.max(0, provider.getNormalize() ? normalizedDocText.getOriginalOffset(chunk.start()) : chunk.start())); + int end = Math.min(aJCas.getDocumentText().length(), Math.max(0,provider.getNormalize() ? 
normalizedDocText.getOriginalOffset(chunk.end()) : chunk.end())); try { if (mantraMode) { diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java index 9e50f845a..b12b5de39 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java @@ -146,8 +146,18 @@ public static NormalizedString normalizeString(String str, TokenizerFactory toke // plural s, only when no stemming is done // an even better normalization would be to use the lemma, of course Range tokenOffsets = Range.between(tokenizer.lastTokenStartPosition(), tokenizer.lastTokenEndPosition()); - if (normalizePlural && !stemming && token.endsWith("s") && pluralPositions != null && !pluralPositions.isEmpty() && pluralPositions.locate(tokenOffsets).isOverlappedBy(tokenOffsets)) - token = token.substring(0, token.length() - 1); + try { + if (normalizePlural && !stemming && token.endsWith("s") && pluralPositions != null && !pluralPositions.isEmpty() && Optional.ofNullable(pluralPositions.locate(tokenOffsets)).orElse(Range.between(0, 0)).isOverlappedBy(tokenOffsets)) + token = token.substring(0, token.length() - 1); + } catch (Exception e) { + System.out.println("normalizePlural: " + normalizePlural); + System.out.println("stemming: " + stemming); + System.out.println("Token: " + token); + System.out.println("PluralPositions: " + pluralPositions); + System.out.println("TokenOffsets: " + tokenOffsets); + System.out.println("pluralPositions.locate(tokenOffsets): " + pluralPositions.locate(tokenOffsets)); + e.printStackTrace(); + } sb.append(token); int newStartOffset = sb.length() - token.length(); int newEndOffset = sb.length(); 
From 5f6a218044dd222cc9ce67595792bca9d1db643a Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 3 Jun 2021 15:37:57 +0200 Subject: [PATCH 054/269] Created code to fetch the hashes of existing XMI documents in the XMI table. However, the code is currently in the wrong place. It must go into the XML reader. We need to compute the hash directly after parsing the document text from XML so we can then compare it to the hashes in the database. If we used the document text in the XMI reader, this would always be the same as the database hash because we have read the document from the very same database table. So, next up is code movement to the XML reader. --- jcore-types/pom.xml | 3 + .../jcore/types/jcore-casflow-types.xml | 28 +++++ .../flowcontroller/AnnotationDefinedFlow.java | 39 ++++++ .../AnnotationDefinedFlowController.java | 23 ++++ .../xmi/flowcontroller/FixedInnerFlow.java | 40 ++++++ .../HashComparisonFlowController.java | 117 ++++++++++++++++++ .../HashComparisonOuterFlow.java | 72 +++++++++++ .../flowcontroller/FlowControllerTest.java | 101 +++++++++++++++ .../src/test/resources/logback-test.xml | 1 + .../jcore/consumer/xmi/XMIDBWriter.java | 2 + .../jcore/consumer/xmi/XmiDataInserter.java | 17 +-- 11 files changed, 436 insertions(+), 7 deletions(-) create mode 100644 jcore-types/src/main/resources/de/julielab/jcore/types/jcore-casflow-types.xml create mode 100644 jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/AnnotationDefinedFlow.java create mode 100644 jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/AnnotationDefinedFlowController.java create mode 100644 jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/FixedInnerFlow.java create mode 100644 jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/HashComparisonFlowController.java create mode 100644 
jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/HashComparisonOuterFlow.java create mode 100644 jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/flowcontroller/FlowControllerTest.java diff --git a/jcore-types/pom.xml b/jcore-types/pom.xml index e9571839f..99b9f0134 100644 --- a/jcore-types/pom.xml +++ b/jcore-types/pom.xml @@ -36,6 +36,9 @@ src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-dbtable-multiplier-types.xml + + src/main/resources/de/julielab/jcore/types/jcore-casflow-types.xml + diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-casflow-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-casflow-types.xml new file mode 100644 index 000000000..6d3e20b4c --- /dev/null +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-casflow-types.xml @@ -0,0 +1,28 @@ + + + JCoRe CAS Flow Types + This is a type system to facilitate the routing of CASes through AggregateAnalysisEngines via + FlowControllers. The types herein serve to indicate which components should be visited for the CAS + carrying annotations of this type. + + 2.6.0-SNAPSHOT + JULIE Lab Jena, Germany + + + de.julielab.jcore.types.casflow.ToVisit + Contains a list of delegate analysis engine names that the CAS, having this annotation, should + visit. Other components will be skipped. The names must the delegate keys specified in the aggregate + descriptor. + + uima.tcas.Annotation + + + delegateKeys + The keys of the delegates to visit. The keys are the names given to the delegate analysis engines in the aggregate. 
+ uima.cas.StringArray + uima.cas.String + + + + + \ No newline at end of file diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/AnnotationDefinedFlow.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/AnnotationDefinedFlow.java new file mode 100644 index 000000000..c48c75193 --- /dev/null +++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/AnnotationDefinedFlow.java @@ -0,0 +1,39 @@ +package de.julielab.jcore.reader.xmi.flowcontroller; + +import de.julielab.jcore.types.casflow.ToVisit; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.analysis_engine.metadata.FixedFlow; +import org.apache.uima.analysis_engine.metadata.FlowConstraints; +import org.apache.uima.flow.JCasFlow_ImplBase; +import org.apache.uima.flow.SimpleStep; +import org.apache.uima.flow.Step; + +/** + *

Returns steps according to an existing {@link ToVisit} annotation of the CAS or, if not present, the default aggregate flow.

+ */ +public class AnnotationDefinedFlow extends JCasFlow_ImplBase { + private String[] toVisitKeys; + private String[] fixedFlow; + private int currentPos; + + public AnnotationDefinedFlow(ToVisit toVisit, FlowConstraints flowConstraints) throws AnalysisEngineProcessException { + if (!(flowConstraints instanceof FixedFlow)) + throw new AnalysisEngineProcessException(new IllegalArgumentException("This flow requires the FixedFlow to determine the default processing order. However, the flow constraints are of type " + flowConstraints.getClass().getCanonicalName())); + this.fixedFlow = toVisit != null ? ((FixedFlow) flowConstraints).getFixedFlow() : null; + this.toVisitKeys = toVisit.getDelegateKeys().toArray(); + this.currentPos = 0; + } + + /** + *

Routes the CAS to the next component defined by the CAS's {@link ToVisit} annotation or, + * if ToVisit was not found, to the next component as defined by the default fixed flow.

+ * + * @return The next component to visit or the next default flow component. + */ + @Override + public Step next() { + String nextAEKey = toVisitKeys != null ? toVisitKeys[currentPos] : fixedFlow[currentPos]; + ++currentPos; + return new SimpleStep(nextAEKey); + } +} diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/AnnotationDefinedFlowController.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/AnnotationDefinedFlowController.java new file mode 100644 index 000000000..359d8eb7d --- /dev/null +++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/AnnotationDefinedFlowController.java @@ -0,0 +1,23 @@ +package de.julielab.jcore.reader.xmi.flowcontroller; + +import de.julielab.jcore.types.casflow.ToVisit; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.flow.Flow; +import org.apache.uima.flow.JCasFlowController_ImplBase; +import org.apache.uima.jcas.JCas; + +/** + *

Routes CASes through an aggregate analysis engine according to the {@link ToVisit} annotation present in the CAS.

+ *

If there is no ToVisit annotation, the default (fixed) flow will be used. Thus, the fixed flow constraint + * must be set on the aggregate engine.

+ */ +public class AnnotationDefinedFlowController extends JCasFlowController_ImplBase { + @Override + public Flow computeFlow(JCas jCas) throws AnalysisEngineProcessException { + boolean exists = JCasUtil.exists(jCas, ToVisit.class); + ToVisit toVisit = exists ? JCasUtil.selectSingle(jCas, ToVisit.class) : null; + // When toVisit is null, the default, fixed flow is used. + return new AnnotationDefinedFlow(toVisit, getContext().getAggregateMetadata().getFlowConstraints()); + } +} diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/FixedInnerFlow.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/FixedInnerFlow.java new file mode 100644 index 000000000..21d84a60d --- /dev/null +++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/FixedInnerFlow.java @@ -0,0 +1,40 @@ +package de.julielab.jcore.reader.xmi.flowcontroller; + +import org.apache.uima.flow.FinalStep; +import org.apache.uima.flow.JCasFlow_ImplBase; +import org.apache.uima.flow.SimpleStep; +import org.apache.uima.flow.Step; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + *

This flow is supposed to route the output CASes of the {@link de.julielab.jcore.reader.xmi.XmiDBMultiplier} in + * a fixed, sequential manner through the aggregate engine. It just skips the first delegate - the multiplier itself - then continues with the rest.

+ */ +public class FixedInnerFlow extends JCasFlow_ImplBase { + private final static Logger log = LoggerFactory.getLogger(FixedInnerFlow.class); + private int currentPosition; + private String[] fixedFlow; + + public FixedInnerFlow(String[] fixedFlow) { + this.fixedFlow = fixedFlow; + this.currentPosition = 0; + } + + public Step next() { + Step step = null; + for (; currentPosition < fixedFlow.length && step == null; currentPosition++) { + String aeKey = fixedFlow[currentPosition]; + // The first analysis engine is the multiplier + if (currentPosition > 0) { + log.trace("Inner next AE is: " + aeKey); + step = new SimpleStep(aeKey); + } + } + if (step == null) { + // no appropriate AEs to call - end of flow + log.trace("Inner flow Complete."); + } + return step == null ? new FinalStep() : step; + } +} diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/HashComparisonFlowController.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/HashComparisonFlowController.java new file mode 100644 index 000000000..717566675 --- /dev/null +++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/HashComparisonFlowController.java @@ -0,0 +1,117 @@ +package de.julielab.jcore.reader.xmi.flowcontroller; + +import de.julielab.costosys.configuration.FieldConfig; +import de.julielab.costosys.dbconnection.CoStoSysConnection; +import de.julielab.costosys.dbconnection.DataBaseConnector; +import de.julielab.jcore.reader.db.DBReader; +import de.julielab.jcore.types.casmultiplier.RowBatch; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.flow.Flow; +import org.apache.uima.flow.FlowControllerContext; +import 
org.apache.uima.flow.JCasFlowController_ImplBase; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.StringArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.FileNotFoundException; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.*; +import java.util.stream.Collectors; + +/** + *

Prereque

+ *

Expects a jCas as output by the {@link de.julielab.jcore.reader.xmi.XmiDBMultiplierReader}, i.e. the CAS + * should contain a {@link de.julielab.jcore.types.casmultiplier.RowBatch} annotation. Then retrieves the sha256 hashes for + * the passed documents from the database.

+ */ +@ResourceMetaData(name = "JCoRe Hash Comparison Flow Controller", description = "This flow controller aims to skip processing for CASes that already exist in the database and haven't changed with regards to a newly read version. For this purpose, the sha256 hash of the CAS document text is compared to the the existing hash in the database for the same document ID. If the hashes match, the text is the same and, thus, the annotations will be the same.") +public class HashComparisonFlowController extends JCasFlowController_ImplBase { + public static final String PARAM_ADD_SHA_HASH = "AddShaHash"; + public static final String PARAM_TABLE_DOCUMENT = "DocumentTable"; + private final static Logger log = LoggerFactory.getLogger(HashComparisonFlowController.class); + @ConfigurationParameter(name = DBReader.PARAM_COSTOSYS_CONFIG_NAME, description = "Path to the CoStoSys configuration XML file that specifies the database this pipeline writes to, i.e. the same file that the DB XMI Writer is using. If there is no DB Writer in use, this flow controller is not applicable.") + private String costosysConfig; + @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, description = "Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS skips all component except the DBCheckpointAE to mark the document as processed.") + private String documentItemToHash; + @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT, description = "String parameter indicating the name of the " + + "table where the XMI data will be stored. 
The name must be schema qualified.") + private String docTableParamValue; + + private DataBaseConnector dbc; + + @Override + public void initialize(FlowControllerContext aContext) throws ResourceInitializationException { + this.costosysConfig = (String) aContext.getConfigParameterValue(DBReader.PARAM_COSTOSYS_CONFIG_NAME); + this.documentItemToHash = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_ADD_SHA_HASH)).orElse("document_text"); + try { + dbc = new DataBaseConnector(this.costosysConfig); + } catch (FileNotFoundException e) { + log.error("Could not create the CoStoSys DatabaseConnector:", e); + throw new ResourceInitializationException(e); + } + } + + @Override + public Flow computeFlow(JCas jCas) throws AnalysisEngineProcessException { + RowBatch rowBatch; + try { + rowBatch = JCasUtil.selectSingle(jCas, RowBatch.class); + } catch (IllegalArgumentException e) { + log.error("Could not select the RowBatch annotation from the JCas:", e); + throw new AnalysisEngineProcessException(e); + } + Map id2hash = fetchCurrentHashesFromDatabase(rowBatch); + return new HashComparisonOuterFlow(id2hash, documentItemToHash, getContext().getAggregateMetadata().getFlowConstraints()); + } + + /** + *

Fetches the hashes of the currently stored documents in the database.

+ * @param rowBatch The annotation specifying which documents should be fetched by the multiplier and then be processed by the aggregate. + * @return A map from a string representation of the RowBatches document IDs to the hashes for the respective IDs. + * @throws AnalysisEngineProcessException If the SQL request fails. + */ + private Map fetchCurrentHashesFromDatabase(RowBatch rowBatch) throws AnalysisEngineProcessException { + String dataTable = dbc.getNextDataTable(rowBatch.getTableName()); + String hashColumn = documentItemToHash + "_sha256"; + // Extract the document IDs in this RowBatch. The IDs could be composite keys. + List documentIds = new ArrayList<>(rowBatch.getIdentifiers().size()); + Iterator documentIDsIt = rowBatch.getIdentifiers().iterator(); + while (documentIDsIt.hasNext()) { + StringArray pkArray = (StringArray) documentIDsIt.next(); + documentIds.add(pkArray.toStringArray()); + } + Map id2hash = new HashMap<>(documentIds.size()); + // This is the map we want to fill that lets us look up the hash of the document text by document ID. + String sql = null; + // Query the database for the document IDs in the current RowBatch and retrieve hashes. 
+ try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { + FieldConfig activeTableFieldConfiguration = dbc.getActiveTableFieldConfiguration(); + String idQuery = documentIds.stream() + .map(key -> Arrays.stream(key).map(part -> "%s='" + part + '"').toArray(String[]::new)) + .map(activeTableFieldConfiguration::expandPKNames).map(expandedKeys -> String.join(" AND ", expandedKeys)) + .collect(Collectors.joining(" OR ")); + sql = String.format("SELECT %s,%s FROM %s WHERE %s", activeTableFieldConfiguration.getPrimaryKeyString(), hashColumn, dataTable, idQuery); + ResultSet rs = conn.createStatement().executeQuery(sql); + while (rs.next()) { + StringBuilder pkSb = new StringBuilder(); + for (int i = 0; i < activeTableFieldConfiguration.getPrimaryKey().length; i++) + pkSb.append(rs.getString(i)).append(','); + // Remove training comma + pkSb.deleteCharAt(pkSb.length()); + String hash = rs.getString(activeTableFieldConfiguration.getPrimaryKey().length); + id2hash.put(pkSb.toString(), hash); + } + } catch (SQLException e) { + log.error("Could not retrieve hashes from the database. 
SQL query was {}:", sql, e); + throw new AnalysisEngineProcessException(e); + } + return id2hash; + } +} diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/HashComparisonOuterFlow.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/HashComparisonOuterFlow.java new file mode 100644 index 000000000..09178fa29 --- /dev/null +++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/HashComparisonOuterFlow.java @@ -0,0 +1,72 @@ +package de.julielab.jcore.reader.xmi.flowcontroller; + +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.analysis_engine.metadata.FixedFlow; +import org.apache.uima.analysis_engine.metadata.FlowConstraints; +import org.apache.uima.flow.*; +import org.apache.uima.jcas.JCas; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Map; + +/** + *

Note: This flow can only be used in an aggregate analysis engine where the {@link de.julielab.jcore.reader.xmi.XmiDBMultiplier} is the first component.

+ *

This flow is created by the {@link HashComparisonFlowController} and routes the CAS that was filled by the {@link de.julielab.jcore.reader.xmi.XmiDBMultiplierReader}. + * This CAS contains an instance of {@link de.julielab.jcore.types.casmultiplier.RowBatch} which specifies which documents should be read + * from which database table.

+ *

Within this flow, the reader CAS is passed to the multiplier, the first component. For CASes created by the multiplier, + * the method {@link #newCasProduced(JCas, String)} is called, which determines a new flow for the processing order of the + * multiplier-created CASes within the aggregate.

+ */ +public class HashComparisonOuterFlow extends JCasFlow_ImplBase { + private final static Logger log = LoggerFactory.getLogger(HashComparisonOuterFlow.class); + private String[] fixedFlow; + private int currentPosition; + private Map id2hash; + private String documentItemToHash; + + public HashComparisonOuterFlow(Map id2hash, String documentItemToHash, FlowConstraints flowConstraints) throws AnalysisEngineProcessException { + this.id2hash = id2hash; + this.documentItemToHash = documentItemToHash; + if (!(flowConstraints instanceof FixedFlow)) { + throw new AnalysisEngineProcessException(new IllegalArgumentException("This flow requires the original FixedFlow to know the order of the delegate engines but the given flow is of type " + flowConstraints.getClass())); + } + FixedFlow fixedFlow = (FixedFlow) flowConstraints; + this.fixedFlow = fixedFlow.getFixedFlow(); + this.currentPosition = 0; + } + + @Override + protected Flow newCasProduced(JCas newCas, String producedBy) throws AnalysisEngineProcessException { + String newHash = getHash(newCas); + return new FixedInnerFlow(fixedFlow); + } + + private String getHash(JCas newCas) { + final String documentText = newCas.getDocumentText(); + final byte[] sha = DigestUtils.sha256(documentText.getBytes()); + return Base64.encodeBase64String(sha); + } + + public Step next() { + Step step = null; + for (; currentPosition < fixedFlow.length && step == null; currentPosition++) { + String aeKey = fixedFlow[currentPosition]; + + // The outer flow only passes the CAS to the CAS multiplier. The multiplier creates more CASes which + // are then passed to newCasProduced() and are then routed by the InnerFlow. + if (currentPosition == 0) { + log.trace("Outer next AE is: " + aeKey); + step = new SimpleStep(aeKey); + } + } + if (step == null) { + // no appropriate AEs to call - end of flow + log.trace("Outer flow Complete."); + } + return step == null ? 
new FinalStep() : step; + } +} diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/flowcontroller/FlowControllerTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/flowcontroller/FlowControllerTest.java new file mode 100644 index 000000000..5c3d69e64 --- /dev/null +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/flowcontroller/FlowControllerTest.java @@ -0,0 +1,101 @@ +package de.julielab.jcore.reader.xmi.flowcontroller; + +import de.julielab.jcore.types.Header; +import de.julielab.jcore.types.casmultiplier.RowBatch; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_component.JCasMultiplier_ImplBase; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.AbstractCas; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.FlowControllerFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.flow.FlowControllerDescription; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.StringArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +public class FlowControllerTest { + @Test + public void testFlowController() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types"); + RowBatch rowBatch = new RowBatch(jCas); + for (int i = 0; i < 10; i++) { + 
StringArray id = new StringArray(jCas, 1); + id.set(0, String.valueOf(i)); + rowBatch.setIdentifiers(JCoReTools.addToFSArray(rowBatch.getIdentifiers(), id)); + } + rowBatch.addToIndexes(); + + FlowControllerDescription flowControllerDescription = FlowControllerFactory.createFlowControllerDescription(HashComparisonFlowController.class); + AnalysisEngineDescription multiplierDesc = AnalysisEngineFactory.createEngineDescription(TestMultiplier.class); + AnalysisEngineDescription testAeDesc1 = AnalysisEngineFactory.createEngineDescription(TestAE.class, "name", "TestAE 1"); + AnalysisEngineDescription testAeDesc2 = AnalysisEngineFactory.createEngineDescription(TestAE.class, "name", "TestAE 2"); + AnalysisEngineDescription aaeWithFlowController = AnalysisEngineFactory.createEngineDescription(flowControllerDescription, multiplierDesc, testAeDesc1, testAeDesc2); + AnalysisEngine aae = AnalysisEngineFactory.createEngine(aaeWithFlowController); + + aae.process(jCas); + } + + public static class TestAE extends JCasAnnotator_ImplBase { + private final static Logger log = LoggerFactory.getLogger(TestAE.class); + + @ConfigurationParameter(name = "name") + private String name; + + @Override + public void initialize(UimaContext context) throws ResourceInitializationException { + name = (String) context.getConfigParameterValue("name"); + } + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + log.debug("Running AE: {}", name); + log.debug("JCas text: " + jCas.getDocumentText()); + } + } + + public static class TestMultiplier extends JCasMultiplier_ImplBase { + private List idsToRead = new ArrayList<>(); + private int currentIndex; + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + RowBatch rowbatch = JCasUtil.selectSingle(jCas, RowBatch.class); + idsToRead.clear(); + currentIndex = 0; + for (int i = 0; i < rowbatch.getIdentifiers().size() && rowbatch.getIdentifiers(i) != null; i++) { + // In this test, the 
document IDs consist only of a single element + idsToRead.add(rowbatch.getIdentifiers(i).get(0)); + } + } + + @Override + public boolean hasNext() throws AnalysisEngineProcessException { + return currentIndex < idsToRead.size(); + } + + @Override + public AbstractCas next() throws AnalysisEngineProcessException { + JCas emptyJCas = getEmptyJCas(); + Header header = new Header(emptyJCas); + String docId = idsToRead.get(currentIndex); + header.setDocId(docId); + header.addToIndexes(); + emptyJCas.setDocumentText("ID: " + docId); + ++currentIndex; + return emptyJCas; + } + } + +} diff --git a/jcore-xmi-db-reader/src/test/resources/logback-test.xml b/jcore-xmi-db-reader/src/test/resources/logback-test.xml index 37c8a721c..b8337ca9b 100644 --- a/jcore-xmi-db-reader/src/test/resources/logback-test.xml +++ b/jcore-xmi-db-reader/src/test/resources/logback-test.xml @@ -10,6 +10,7 @@ + diff --git a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java index 380c0b232..004c085d9 100644 --- a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java +++ b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java @@ -115,6 +115,7 @@ public class XMIDBWriter extends JCasAnnotator_ImplBase { public static final String PARAM_FEATURES_TO_MAP_DRYRUN = "BinaryFeaturesToMapDryRun"; public static final String PARAM_BINARY_FEATURES_BLACKLIST = "BinaryFeaturesBlacklist"; public static final String PARAM_ADD_SHA_HASH = "AddShaHash"; + public static final String PARAM_SKIP_MATCHING_HASH = "SkipMatchingHash"; private static final Logger log = LoggerFactory.getLogger(XMIDBWriter.class); // The mappings are keyed by the costosys.xml path and the table schema, see 'mappingCacheKey'. // The idea is to save costly database connections by sharing updating mapping across threads. 
@@ -249,6 +250,7 @@ public class XMIDBWriter extends JCasAnnotator_ImplBase { private String[] binaryFeaturesBlacklistParameter; @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, mandatory = false, description = "Possible values: document_text. If this parameter is set to a valid value, the SHA256 hash for the given value will be calculated, base64 encoded and added to each document as a new column in the document table. The column will be named after the parameter value, suffixed by '_sha256'.") private String documentItemToHash; + @ConfigurationParameter(name =PARAM_SKIP_MATCHING_HASH, mandatory = false, description = "Only in effect, if: " + PARAM_ADD_SHA_HASH + " is active; if the target XMI table has also been read from by the XMI DB reader and the reader has been configured to read the document's current hash value. Then, compares the hash value retrieved and relied by the XMI DB reader to the ") private Map shaMap; private String mappingCacheKey; private DocumentReleaseCheckpoint docReleaseCheckpoint; diff --git a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java index 31fb146ef..080ffd613 100644 --- a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java +++ b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java @@ -57,19 +57,20 @@ public XmiDataInserter(Set annotationModuleColumnNames, * update. It will just be inserted otherwise (throwing an error if there * will be a primary key constraint violation, i.e. duplicates). 
* - * @param serializedCASes + * @param annotationModules * @param storeBaseDocument * @param deleteObsolete * @param shaMap * @throws XmiDataInsertionException * @throws AnalysisEngineProcessException */ - public void sendXmiDataToDatabase(String xmiTableName, List serializedCASes, String subsetTableName, Boolean storeBaseDocument, Boolean deleteObsolete, Map shaMap) throws XmiDataInsertionException { + public void sendXmiDataToDatabase(String xmiTableName, List annotationModules, String subsetTableName, Boolean storeBaseDocument, Boolean deleteObsolete, Map shaMap) throws XmiDataInsertionException { if (log.isTraceEnabled()) { - log.trace("Sending XMI data for {} tables to the database", serializedCASes.size()); - log.trace("Sending {} XMI data items", serializedCASes.size()); + log.trace("Sending XMI data for {} tables to the database", annotationModules.size()); + log.trace("Sending {} XMI data items", annotationModules.size()); } - final Map> dataByDoc = serializedCASes.stream().collect(Collectors.groupingBy(XmiData::getDocId)); + final Map> dataByDoc = annotationModules.stream().collect(Collectors.groupingBy(XmiData::getDocId)); + // Collect all document IDs we want to add something for into the database. This can be annotations or the hash. final Set documentIdsWithValues = shaMap != null ? Sets.union(dataByDoc.keySet(), shaMap.keySet()) : dataByDoc.keySet(); class RowIterator implements Iterator> { @@ -163,15 +164,17 @@ public void remove() { try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { conn.setAutoCommit(false); + // This is the private in-line defined class from above. All values are already contained in the class + // definition. 
RowIterator iterator = new RowIterator(); try { if (updateMode) { log.debug("Updating {} XMI CAS data in database table '{}'.", - serializedCASes.size(), xmiTableName); + annotationModules.size(), xmiTableName); dbc.updateFromRowIterator(iterator, xmiTableName, false, storeBaseDocument, schemaDocument); } else { log.debug("Inserting {} XMI CAS data into database table '{}'.", - serializedCASes.size(), xmiTableName); + annotationModules.size(), xmiTableName); dbc.importFromRowIterator(iterator, xmiTableName, false, schemaDocument); } } catch (Exception e) { From 8d0b7a4dd57971e521ce9375a215abb4bd4d33bd Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 4 Jun 2021 10:19:52 +0200 Subject: [PATCH 055/269] Adding DB hash equality check to the `XMLDBMultiplier`, the `AnnotationDefinedFlowController` and flow controller support for the `DescriptorCreator`. All those changes head towards the possibility to skip most of the pipeline in case the document is already present in the database without changes (as determined by the hash value). 
--- jcore-descriptor-creator/component.meta | 20 +++ .../jcore/misc/DescriptorCreator.java | 49 +++--- jcore-flow-controllers/pom.xml | 71 +++++++++ .../AnnotationDefinedFlow.java | 64 ++++++++ .../AnnotationDefinedFlowController.java | 4 +- .../annotationdefined}/FixedInnerFlow.java | 2 +- .../HashComparisonFlowController.java | 117 ++++++++++++++ .../HashComparisonOuterFlow.java | 72 +++++++++ ...core-annotation-defined-flowcontroller.xml | 19 +++ .../AnnotationDefinedFlowControllerTest.java | 143 ++++++++++++++++++ .../jcore/types/jcore-casflow-types.xml | 2 +- .../flowcontroller/AnnotationDefinedFlow.java | 39 ----- .../HashComparisonFlowController.java | 117 -------------- .../HashComparisonOuterFlow.java | 72 --------- .../flowcontroller/FlowControllerTest.java | 101 ------------- .../jcore/reader/xml/XMLDBMultiplier.java | 125 ++++++++++++++- pom.xml | 50 +++--- scripts/createMetaDescriptors.py | 13 +- 18 files changed, 695 insertions(+), 385 deletions(-) create mode 100644 jcore-descriptor-creator/component.meta create mode 100644 jcore-flow-controllers/pom.xml create mode 100644 jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlow.java rename {jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller => jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined}/AnnotationDefinedFlowController.java (62%) rename {jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller => jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined}/FixedInnerFlow.java (96%) create mode 100644 jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/HashComparisonFlowController.java create mode 100644 jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/HashComparisonOuterFlow.java create mode 100644 
jcore-flow-controllers/src/main/resources/de/julielab/jcore/flow/annotationdefined/desc/jcore-annotation-defined-flowcontroller.xml create mode 100644 jcore-flow-controllers/src/test/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowControllerTest.java delete mode 100644 jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/AnnotationDefinedFlow.java delete mode 100644 jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/HashComparisonFlowController.java delete mode 100644 jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/HashComparisonOuterFlow.java delete mode 100644 jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/flowcontroller/FlowControllerTest.java diff --git a/jcore-descriptor-creator/component.meta b/jcore-descriptor-creator/component.meta new file mode 100644 index 000000000..6eae55fd0 --- /dev/null +++ b/jcore-descriptor-creator/component.meta @@ -0,0 +1,20 @@ +{ + "categories": [ + "reader" + ], + "description": "A simple project for the automatic creation of descriptors for UIMAfit-enabled components.", + "descriptors": [ + { + "category": "reader", + "location": "de.julielab.jcore.reader.testreader.desc.de.julielab.jcore.reader.testreader.TestReader" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-descriptor-creator", + "groupId": "de.julielab", + "version": "2.6.0-SNAPSHOT" + }, + "name": "JCoRe Descriptor Creator" +} diff --git a/jcore-descriptor-creator/src/main/java/de/julielab/jcore/misc/DescriptorCreator.java b/jcore-descriptor-creator/src/main/java/de/julielab/jcore/misc/DescriptorCreator.java index 92c3178a1..69253935b 100644 --- a/jcore-descriptor-creator/src/main/java/de/julielab/jcore/misc/DescriptorCreator.java +++ b/jcore-descriptor-creator/src/main/java/de/julielab/jcore/misc/DescriptorCreator.java @@ -1,17 +1,6 @@ package de.julielab.jcore.misc; -import static 
java.util.stream.Collectors.joining; -import static java.util.stream.Collectors.toList; - -import java.io.*; -import java.lang.reflect.Modifier; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.function.Predicate; -import java.util.stream.Stream; - +import de.julielab.java.utilities.FileUtilities; import io.github.classgraph.ClassGraph; import io.github.classgraph.ScanResult; import org.apache.commons.lang.StringUtils; @@ -21,14 +10,26 @@ import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.CollectionReaderFactory; +import org.apache.uima.fit.factory.FlowControllerFactory; import org.apache.uima.fit.factory.TypeSystemDescriptionFactory; +import org.apache.uima.flow.FlowController; +import org.apache.uima.flow.FlowControllerDescription; import org.apache.uima.resource.ResourceCreationSpecifier; import org.apache.uima.resource.metadata.TypeSystemDescription; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; -import de.julielab.java.utilities.FileUtilities; +import java.io.*; +import java.lang.reflect.Modifier; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Stream; + +import static java.util.stream.Collectors.joining; +import static java.util.stream.Collectors.toList; public class DescriptorCreator { @@ -56,19 +57,27 @@ public void run(String basePackage) throws Exception { public void run(String basePackage, String outputRoot) throws Exception { List> readers = findSubclasses(CollectionReader.class.getCanonicalName()); List> aes = findSubclasses(AnalysisComponent.class.getCanonicalName()); + List> flowControllers = findSubclasses(FlowController.class.getCanonicalName()); - readers = readers.stream().filter(c -> 
c.getPackage().getName().startsWith(basePackage) && (c.getPackage().getName().endsWith("reader") || c.getName().toLowerCase().endsWith("reader"))) + // Now filter all found classes for being in the target package and adhering to the naming conventions. + readers = readers.stream().filter(c -> c.getPackage().getName().startsWith(basePackage) && (c.getPackage().getName().contains("reader") || c.getName().toLowerCase().contains("reader"))) .collect(toList()); // Since consumers and also multipliers can be or are AnalysisComponents, we may list all component categories here. // Also, remove abstract classes aes = aes.stream().filter(c -> !Modifier.isAbstract(c.getModifiers())). filter(c -> c.getPackage().getName().startsWith(basePackage) && - (c.getPackage().getName().endsWith("ae") || c.getName().toLowerCase().endsWith("ae") || c.getName().toLowerCase().endsWith("annotator") - || c.getPackage().getName().endsWith("consumer") || c.getName().toLowerCase().endsWith("consumer") || c.getName().toLowerCase().endsWith("writer") - || c.getPackage().getName().endsWith("multiplier") || c.getName().toLowerCase().endsWith("multiplier")) + (c.getPackage().getName().contains("ae") || c.getName().toLowerCase().contains("ae") || c.getName().toLowerCase().contains("annotator") + || c.getPackage().getName().contains("consumer") || c.getName().toLowerCase().contains("consumer") || c.getName().toLowerCase().contains("writer") + || c.getPackage().getName().contains("multiplier") || c.getName().toLowerCase().contains("multiplier")) ).collect(toList()); - if (readers.isEmpty() && aes.isEmpty()) { + flowControllers = flowControllers.stream().filter(c -> !Modifier.isAbstract((c.getModifiers()))). 
+ filter(c -> c.getPackage().getName().startsWith(basePackage) && + (c.getPackage().getName().contains("flow") || c.getPackage().getName().toLowerCase().contains("flow"))) + .collect(toList()); + + + if (readers.isEmpty() && aes.isEmpty() && flowControllers.isEmpty()) { log.warn("No JCoRe UIMA component classes were found."); } else { Stream typeDescNamesStream = Stream.of(TypeSystemDescriptionFactory.scanTypeDescriptors()). @@ -88,6 +97,10 @@ public void run(String basePackage, String outputRoot) throws Exception { AnalysisEngineDescription d = AnalysisEngineFactory.createEngineDescription(cls, tsd); writeComponentDescriptor(outputRoot, cls, d, "analysis engine / consumer"); } + for (Class cls : flowControllers) { + FlowControllerDescription d = FlowControllerFactory.createFlowControllerDescription(cls); + writeComponentDescriptor(outputRoot, cls, d, "flow controller"); + } } } diff --git a/jcore-flow-controllers/pom.xml b/jcore-flow-controllers/pom.xml new file mode 100644 index 000000000..a316b81ad --- /dev/null +++ b/jcore-flow-controllers/pom.xml @@ -0,0 +1,71 @@ + + + + jcore-base + de.julielab + 2.6.0-SNAPSHOT + + 4.0.0 + + jcore-flow-controllers + + + + org.slf4j + slf4j-api + + + de.julielab + jcore-types + ${jcore-types-version} + + + junit + junit + test + + + ch.qos.logback + logback-classic + provided + + + org.assertj + assertj-core + test + + + de.julielab + jcore-utilities + ${jcore-utilities-version} + + + org.jetbrains + annotations + RELEASE + compile + + + de.julielab + jcore-descriptor-creator + + + + JCoRe Flow Controllers + + JULIE Lab Jena, Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-flow-controllers + Flow controllers can be used to control the route a (J)CAS takes through an aggregate analysis engine. + This project contains Flow Controllers developed at the JULIE Lab. 
+ + + + BSD-2-Clause + https://opensource.org/licenses/BSD-2-Clause + + + \ No newline at end of file diff --git a/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlow.java b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlow.java new file mode 100644 index 000000000..0243a7f36 --- /dev/null +++ b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlow.java @@ -0,0 +1,64 @@ +package de.julielab.jcore.flow.annotationdefined; + +import de.julielab.jcore.types.casflow.ToVisit; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.analysis_engine.metadata.FixedFlow; +import org.apache.uima.analysis_engine.metadata.FlowConstraints; +import org.apache.uima.flow.FinalStep; +import org.apache.uima.flow.JCasFlow_ImplBase; +import org.apache.uima.flow.SimpleStep; +import org.apache.uima.flow.Step; +import org.jetbrains.annotations.Nullable; + +/** + *

Returns steps according to an existing {@link ToVisit} annotation of the CAS or, if not present, the default aggregate flow.

+ *

This is, for example, used by the XMLDBMultiplier to let CASes skip large parts of the pipeline when + * the currently read document already exists in the database.

+ */ +public class AnnotationDefinedFlow extends JCasFlow_ImplBase { + private String[] toVisitKeys; + private String[] fixedFlow; + private int currentPos; + + /** + *

Creates a flow that follows the entries in {@link ToVisit#getDelegateKeys()} of toVisit or, if + * toVisit is null, falls back to the default fixed flow.

+ *

If toVisit is not null but the delegateKeys are null or empty, no component in the aggregate using this flow will process the respective CAS.

+ * @param toVisit An annotation containing the keys of the delegate AEs to visit. May be null which case the default fixed flow will be used. + * @param flowConstraints The default fixed flow of the aggregate analysis engine. + * @throws AnalysisEngineProcessException If flowConstraints is not a fixed flow. + */ + public AnnotationDefinedFlow(@Nullable ToVisit toVisit, FlowConstraints flowConstraints) throws AnalysisEngineProcessException { + if (!(flowConstraints instanceof FixedFlow)) + throw new AnalysisEngineProcessException(new IllegalArgumentException("This flow requires the FixedFlow to determine the default processing order. However, the flow constraints are of type " + flowConstraints.getClass().getCanonicalName())); + this.fixedFlow = ((FixedFlow) flowConstraints).getFixedFlow(); + // We have the following cases: + // 1. There are given keys to visit, use them. + // 2. There are no keys given but the ToVisit annotation is not null, skip all components. + // 3. There is not ToVisit annotation at all, use the default fixed flow. + if(toVisit != null && toVisit.getDelegateKeys() != null) + toVisitKeys = toVisit.getDelegateKeys().toArray(); + else if (toVisit != null) + toVisitKeys = new String[0]; + else + toVisitKeys = null; + this.currentPos = 0; + } + + /** + *

Routes the CAS to the next component defined by the CAS's {@link ToVisit} annotation or, + * if ToVisit was not found, to the next component as defined by the default fixed flow.

+ * + * @return The next component to visit or the next default flow component. + */ + @Override + public Step next() { + // If toVisitKeys was not given, we just use the fixedFlow. + if ((toVisitKeys == null && currentPos < fixedFlow.length) || (toVisitKeys != null && currentPos < toVisitKeys.length)) { + String nextAEKey = toVisitKeys != null ? toVisitKeys[currentPos] : fixedFlow[currentPos]; + ++currentPos; + return new SimpleStep(nextAEKey); + } + return new FinalStep(); + } +} diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/AnnotationDefinedFlowController.java b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowController.java similarity index 62% rename from jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/AnnotationDefinedFlowController.java rename to jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowController.java index 359d8eb7d..77a803e23 100644 --- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/AnnotationDefinedFlowController.java +++ b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowController.java @@ -1,7 +1,8 @@ -package de.julielab.jcore.reader.xmi.flowcontroller; +package de.julielab.jcore.flow.annotationdefined; import de.julielab.jcore.types.casflow.ToVisit; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.flow.Flow; import org.apache.uima.flow.JCasFlowController_ImplBase; @@ -12,6 +13,7 @@ *

If there is not ToVisit annotation, the default (fixed) flow will be used. Thus, the fixed flow constraint * must be set on the aggregate engine.

*/ +@ResourceMetaData(name = "JCoRe Annotation Defined Flow Controller", description = "This flow controller relies on an annotation of type ToVisit to be present in the CAS. If there is no such annotation, the default fixed flow of the aggregate engine using this flow controller is used. Otherwise, die names of the components to pass the CAS to are taken from the annotation. If the annotation exists but defines to components to be visited by the CAS, no components are visited at all.", vendor = "JULIE Lab, Germany", version = "placeholder") public class AnnotationDefinedFlowController extends JCasFlowController_ImplBase { @Override public Flow computeFlow(JCas jCas) throws AnalysisEngineProcessException { diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/FixedInnerFlow.java b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/FixedInnerFlow.java similarity index 96% rename from jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/FixedInnerFlow.java rename to jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/FixedInnerFlow.java index 21d84a60d..eeae85f0a 100644 --- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/FixedInnerFlow.java +++ b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/FixedInnerFlow.java @@ -1,4 +1,4 @@ -package de.julielab.jcore.reader.xmi.flowcontroller; +package de.julielab.jcore.flow.annotationdefined; import org.apache.uima.flow.FinalStep; import org.apache.uima.flow.JCasFlow_ImplBase; diff --git a/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/HashComparisonFlowController.java b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/HashComparisonFlowController.java new file mode 100644 index 000000000..bdbf88c9c --- /dev/null +++ 
b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/HashComparisonFlowController.java @@ -0,0 +1,117 @@ +//package de.julielab.jcore.flow.annotationdefined; +// +//import de.julielab.costosys.configuration.FieldConfig; +//import de.julielab.costosys.dbconnection.CoStoSysConnection; +//import de.julielab.costosys.dbconnection.DataBaseConnector; +//import de.julielab.jcore.reader.db.DBReader; +//import de.julielab.jcore.types.casmultiplier.RowBatch; +//import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +//import org.apache.uima.cas.FeatureStructure; +//import org.apache.uima.fit.descriptor.ConfigurationParameter; +//import org.apache.uima.fit.descriptor.ResourceMetaData; +//import org.apache.uima.fit.util.JCasUtil; +//import org.apache.uima.flow.Flow; +//import org.apache.uima.flow.FlowControllerContext; +//import org.apache.uima.flow.JCasFlowController_ImplBase; +//import org.apache.uima.jcas.JCas; +//import org.apache.uima.jcas.cas.StringArray; +//import org.apache.uima.resource.ResourceInitializationException; +//import org.slf4j.Logger; +//import org.slf4j.LoggerFactory; +// +//import java.io.FileNotFoundException; +//import java.sql.ResultSet; +//import java.sql.SQLException; +//import java.util.*; +//import java.util.stream.Collectors; +// +///** +// *

Prereque

+// *

Expects a jCas as output by the {@link de.julielab.jcore.reader.xmi.XmiDBMultiplierReader}, i.e. the CAS +// * should contain a {@link de.julielab.jcore.types.casmultiplier.RowBatch} annotation. Then, retrieves the sha256 hashes for +// * the passed documents from the database.

+// */ +//@ResourceMetaData(name = "JCoRe Hash Comparison Flow Controller", description = "This flow controller aims to skip processing for CASes that already exist in the database and haven't changed with regards to a newly read version. For this purpose, the sha256 hash of the CAS document text is compared to the the existing hash in the database for the same document ID. If the hashes match, the text is the same and, thus, the annotations will be the same.") +//public class HashComparisonFlowController extends JCasFlowController_ImplBase { +// public static final String PARAM_ADD_SHA_HASH = "AddShaHash"; +// public static final String PARAM_TABLE_DOCUMENT = "DocumentTable"; +// private final static Logger log = LoggerFactory.getLogger(HashComparisonFlowController.class); +// @ConfigurationParameter(name = DBReader.PARAM_COSTOSYS_CONFIG_NAME, description = "Path to the CoStoSys configuration XML file that specifies the database this pipeline writes to, i.e. the same file that the DB XMI Writer is using. If there is no DB Writer in use, this flow controller is not applicable.") +// private String costosysConfig; +// @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, description = "Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS skips all component except the DBCheckpointAE to mark the document as processed.") +// private String documentItemToHash; +// @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT, description = "String parameter indicating the name of the " + +// "table where the XMI data will be stored. 
The name must be schema qualified.") +// private String docTableParamValue; +// +// private DataBaseConnector dbc; +// +// @Override +// public void initialize(FlowControllerContext aContext) throws ResourceInitializationException { +// this.costosysConfig = (String) aContext.getConfigParameterValue(DBReader.PARAM_COSTOSYS_CONFIG_NAME); +// this.documentItemToHash = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_ADD_SHA_HASH)).orElse("document_text"); +// try { +// dbc = new DataBaseConnector(this.costosysConfig); +// } catch (FileNotFoundException e) { +// log.error("Could not create the CoStoSys DatabaseConnector:", e); +// throw new ResourceInitializationException(e); +// } +// } +// +// @Override +// public Flow computeFlow(JCas jCas) throws AnalysisEngineProcessException { +// RowBatch rowBatch; +// try { +// rowBatch = JCasUtil.selectSingle(jCas, RowBatch.class); +// } catch (IllegalArgumentException e) { +// log.error("Could not select the RowBatch annotation from the JCas:", e); +// throw new AnalysisEngineProcessException(e); +// } +// Map id2hash = fetchCurrentHashesFromDatabase(rowBatch); +// return new HashComparisonOuterFlow(id2hash, documentItemToHash, getContext().getAggregateMetadata().getFlowConstraints()); +// } +// +// /** +// *

Fetches the hashes of the currently stored documents in the database.

+// * @param rowBatch The annotation specifying which documents should be fetched by the multiplier and then be processed by the aggregate. +// * @return A map from a string representation of the RowBatches document IDs to the hashes for the respective IDs. +// * @throws AnalysisEngineProcessException If the SQL request fails. +// */ +// private Map fetchCurrentHashesFromDatabase(RowBatch rowBatch) throws AnalysisEngineProcessException { +// String dataTable = dbc.getNextDataTable(rowBatch.getTableName()); +// String hashColumn = documentItemToHash + "_sha256"; +// // Extract the document IDs in this RowBatch. The IDs could be composite keys. +// List documentIds = new ArrayList<>(rowBatch.getIdentifiers().size()); +// Iterator documentIDsIt = rowBatch.getIdentifiers().iterator(); +// while (documentIDsIt.hasNext()) { +// StringArray pkArray = (StringArray) documentIDsIt.next(); +// documentIds.add(pkArray.toStringArray()); +// } +// Map id2hash = new HashMap<>(documentIds.size()); +// // This is the map we want to fill that lets us look up the hash of the document text by document ID. +// String sql = null; +// // Query the database for the document IDs in the current RowBatch and retrieve hashes. 
+// try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { +// FieldConfig activeTableFieldConfiguration = dbc.getActiveTableFieldConfiguration(); +// String idQuery = documentIds.stream() +// .map(key -> Arrays.stream(key).map(part -> "%s='" + part + '"').toArray(String[]::new)) +// .map(activeTableFieldConfiguration::expandPKNames).map(expandedKeys -> String.join(" AND ", expandedKeys)) +// .collect(Collectors.joining(" OR ")); +// sql = String.format("SELECT %s,%s FROM %s WHERE %s", activeTableFieldConfiguration.getPrimaryKeyString(), hashColumn, dataTable, idQuery); +// ResultSet rs = conn.createStatement().executeQuery(sql); +// while (rs.next()) { +// StringBuilder pkSb = new StringBuilder(); +// for (int i = 0; i < activeTableFieldConfiguration.getPrimaryKey().length; i++) +// pkSb.append(rs.getString(i)).append(','); +// // Remove training comma +// pkSb.deleteCharAt(pkSb.length()); +// String hash = rs.getString(activeTableFieldConfiguration.getPrimaryKey().length); +// id2hash.put(pkSb.toString(), hash); +// } +// } catch (SQLException e) { +// log.error("Could not retrieve hashes from the database. 
SQL query was {}:", sql, e); +// throw new AnalysisEngineProcessException(e); +// } +// return id2hash; +// } +//} diff --git a/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/HashComparisonOuterFlow.java b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/HashComparisonOuterFlow.java new file mode 100644 index 000000000..896b52892 --- /dev/null +++ b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/HashComparisonOuterFlow.java @@ -0,0 +1,72 @@ +//package de.julielab.jcore.flow.annotationdefined; +// +//import org.apache.commons.codec.binary.Base64; +//import org.apache.commons.codec.digest.DigestUtils; +//import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +//import org.apache.uima.analysis_engine.metadata.FixedFlow; +//import org.apache.uima.analysis_engine.metadata.FlowConstraints; +//import org.apache.uima.flow.*; +//import org.apache.uima.jcas.JCas; +//import org.slf4j.Logger; +//import org.slf4j.LoggerFactory; +// +//import java.util.Map; +// +///** +// *

Note: This flow can only be used in an aggregate analysis engine where the {@link de.julielab.jcore.reader.xmi.XmiDBMultiplier} is the first component.

+// *

This flow is created by the {@link HashComparisonFlowController} and routes the CAS that was filled by the {@link de.julielab.jcore.reader.xmi.XmiDBMultiplierReader}. +// * This CAS contains an instance of {@link de.julielab.jcore.types.casmultiplier.RowBatch} which contains the information which documents should be read +// * from which database table.

+// *

Within this flow, the reader CAS is passed to the multiplier, the first component. For CASes created by the multiplier, +// * the method {@link #newCasProduced(JCas, String)} is called, which determines a new flow for the processing order of the +// * multiplier-created CASes within the aggregate.

+// */ +//public class HashComparisonOuterFlow extends JCasFlow_ImplBase { +// private final static Logger log = LoggerFactory.getLogger(HashComparisonOuterFlow.class); +// private String[] fixedFlow; +// private int currentPosition; +// private Map id2hash; +// private String documentItemToHash; +// +// public HashComparisonOuterFlow(Map id2hash, String documentItemToHash, FlowConstraints flowConstraints) throws AnalysisEngineProcessException { +// this.id2hash = id2hash; +// this.documentItemToHash = documentItemToHash; +// if (!(flowConstraints instanceof FixedFlow)) { +// throw new AnalysisEngineProcessException(new IllegalArgumentException("This flow requires the original FixedFlow to know the order of the delegate engines but the given flow is of type " + flowConstraints.getClass())); +// } +// FixedFlow fixedFlow = (FixedFlow) flowConstraints; +// this.fixedFlow = fixedFlow.getFixedFlow(); +// this.currentPosition = 0; +// } +// +// @Override +// protected Flow newCasProduced(JCas newCas, String producedBy) throws AnalysisEngineProcessException { +// String newHash = getHash(newCas); +// return new FixedInnerFlow(fixedFlow); +// } +// +// private String getHash(JCas newCas) { +// final String documentText = newCas.getDocumentText(); +// final byte[] sha = DigestUtils.sha256(documentText.getBytes()); +// return Base64.encodeBase64String(sha); +// } +// +// public Step next() { +// Step step = null; +// for (; currentPosition < fixedFlow.length && step == null; currentPosition++) { +// String aeKey = fixedFlow[currentPosition]; +// +// // The outer flow only passes the CAS to the CAS multiplier. The multiplier creates more CASes which +// // are then passed to newCasProduced() and are then routed by the InnerFlow. 
+// if (currentPosition == 0) { +// log.trace("Outer next AE is: " + aeKey); +// step = new SimpleStep(aeKey); +// } +// } +// if (step == null) { +// // no appropriate AEs to call - end of flow +// log.trace("Outer flow Complete."); +// } +// return step == null ? new FinalStep() : step; +// } +//} diff --git a/jcore-flow-controllers/src/main/resources/de/julielab/jcore/flow/annotationdefined/desc/jcore-annotation-defined-flowcontroller.xml b/jcore-flow-controllers/src/main/resources/de/julielab/jcore/flow/annotationdefined/desc/jcore-annotation-defined-flowcontroller.xml new file mode 100644 index 000000000..2babe5cd5 --- /dev/null +++ b/jcore-flow-controllers/src/main/resources/de/julielab/jcore/flow/annotationdefined/desc/jcore-annotation-defined-flowcontroller.xml @@ -0,0 +1,19 @@ + + + org.apache.uima.java + de.julielab.jcore.flow.annotationdefined.AnnotationDefinedFlowController + + JCoRe Annotation Defined Flow Controller + This flow controller relies on an annotation of type ToVisit to be present in the CAS. If there is no such annotation, the default fixed flow of the aggregate engine using this flow controller is used. Otherwise, die names of the components to pass the CAS to are taken from the annotation. If the annotation exists but defines to components to be visited by the CAS, no components are visited at all. 
+ placeholder + JULIE Lab, Germany + + + + + false + true + false + + + \ No newline at end of file diff --git a/jcore-flow-controllers/src/test/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowControllerTest.java b/jcore-flow-controllers/src/test/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowControllerTest.java new file mode 100644 index 000000000..f5daabd0f --- /dev/null +++ b/jcore-flow-controllers/src/test/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowControllerTest.java @@ -0,0 +1,143 @@ +package de.julielab.jcore.flow.annotationdefined; + +import de.julielab.jcore.types.Token; +import de.julielab.jcore.types.casflow.ToVisit; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.cas.FSIterator; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.FlowControllerFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.flow.FlowControllerDescription; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.StringArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.junit.Test; + +import static java.util.Arrays.asList; +import static org.assertj.core.api.Assertions.assertThat; +public class AnnotationDefinedFlowControllerTest { + @Test + public void testFlowControllerSingleKey() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-casflow-types"); + ToVisit toVisit = new ToVisit(jCas); + StringArray toVisitKeys = new StringArray(jCas, 1); + toVisitKeys.set(0, "TestAE 2"); + toVisit.setDelegateKeys(toVisitKeys); + toVisit.addToIndexes(); + + AnalysisEngine aae = 
createTestAAE(); + + aae.process(jCas); + + FSIterator it = jCas.getAnnotationIndex(Token.type).iterator(); + assertThat(it).toIterable().extracting(Token::getComponentId).containsExactly("TestAE 2"); + } + + @Test + public void testFlowControllerNoKey() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-casflow-types"); + ToVisit toVisit = new ToVisit(jCas); + StringArray toVisitKeys = new StringArray(jCas, 0); + toVisit.setDelegateKeys(toVisitKeys); + toVisit.addToIndexes(); + + AnalysisEngine aae = createTestAAE(); + + aae.process(jCas); + + FSIterator it = jCas.getAnnotationIndex(Token.type).iterator(); + assertThat(it).isExhausted(); + } + + @Test + public void testFlowControllerNullKey() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-casflow-types"); + ToVisit toVisit = new ToVisit(jCas); + toVisit.addToIndexes(); + + AnalysisEngine aae = createTestAAE(); + + aae.process(jCas); + + FSIterator it = jCas.getAnnotationIndex(Token.type).iterator(); + assertThat(it).isExhausted(); + } + + @Test + public void testFlowControllerNoVisitAnnotation() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-casflow-types"); + + AnalysisEngine aae = createTestAAE(); + + aae.process(jCas); + + FSIterator it = jCas.getAnnotationIndex(Token.type).iterator(); + assertThat(it).toIterable().extracting(Token::getComponentId).containsExactly("TestAE 1", "TestAE 2", "TestAE 3"); + } + + private AnalysisEngine createTestAAE() throws ResourceInitializationException { + FlowControllerDescription flowControllerDescription = FlowControllerFactory.createFlowControllerDescription(AnnotationDefinedFlowController.class); + AnalysisEngineDescription testAeDesc1 = AnalysisEngineFactory.createEngineDescription(TestAE.class, "name", 
"TestAE 1"); + AnalysisEngineDescription testAeDesc2 = AnalysisEngineFactory.createEngineDescription(TestAE.class, "name", "TestAE 2"); + AnalysisEngineDescription testAeDesc3 = AnalysisEngineFactory.createEngineDescription(TestAE.class, "name", "TestAE 3"); + AnalysisEngineDescription aaeWithFlowController = AnalysisEngineFactory.createEngineDescription(asList(testAeDesc1, testAeDesc2, testAeDesc3), asList("TestAE 1", "TestAE 2", "TestAE 3"), null, null, + flowControllerDescription); + AnalysisEngine aae = AnalysisEngineFactory.createEngine(aaeWithFlowController); + return aae; + } + + public static class TestAE extends JCasAnnotator_ImplBase { + @ConfigurationParameter(name = "name") + private String name; + + @Override + public void initialize(UimaContext context) { + name = (String) context.getConfigParameterValue("name"); + } + + @Override + public void process(JCas jCas) { + // Indicate that this jCas was processed by this component. + Token token = new Token(jCas); + token.setComponentId(name); + token.addToIndexes(); + } + } + +// public static class TestMultiplier extends JCasMultiplier_ImplBase { +// private List idsToRead = new ArrayList<>(); +// private int currentIndex; +// @Override +// public void process(JCas jCas) throws AnalysisEngineProcessException { +// RowBatch rowbatch = JCasUtil.selectSingle(jCas, RowBatch.class); +// idsToRead.clear(); +// currentIndex = 0; +// for (int i = 0; i < rowbatch.getIdentifiers().size() && rowbatch.getIdentifiers(i) != null; i++) { +// // In this test, the document IDs consist only of a single element +// idsToRead.add(rowbatch.getIdentifiers(i).get(0)); +// } +// } +// +// @Override +// public boolean hasNext() throws AnalysisEngineProcessException { +// return currentIndex < idsToRead.size(); +// } +// +// @Override +// public AbstractCas next() throws AnalysisEngineProcessException { +// JCas emptyJCas = getEmptyJCas(); +// Header header = new Header(emptyJCas); +// String docId = idsToRead.get(currentIndex); 
+// header.setDocId(docId); +// header.addToIndexes(); +// emptyJCas.setDocumentText("ID: " + docId); +// ++currentIndex; +// return emptyJCas; +// } +// } + +} diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-casflow-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-casflow-types.xml index 6d3e20b4c..1f371bdf6 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-casflow-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/jcore-casflow-types.xml @@ -18,7 +18,7 @@ delegateKeys - The keys of the delegates to visit. The keys are the names given to the delegate analysis engines in the aggregate. + The keys of the delegates to visit. The keys are the names given to the delegate analysis engines in the aggregate. An empty or null array indicates that no component should be visited. uima.cas.StringArray uima.cas.String diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/AnnotationDefinedFlow.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/AnnotationDefinedFlow.java deleted file mode 100644 index c48c75193..000000000 --- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/AnnotationDefinedFlow.java +++ /dev/null @@ -1,39 +0,0 @@ -package de.julielab.jcore.reader.xmi.flowcontroller; - -import de.julielab.jcore.types.casflow.ToVisit; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.analysis_engine.metadata.FixedFlow; -import org.apache.uima.analysis_engine.metadata.FlowConstraints; -import org.apache.uima.flow.JCasFlow_ImplBase; -import org.apache.uima.flow.SimpleStep; -import org.apache.uima.flow.Step; - -/** - *

Returns steps according an existing {@link ToVisit} annotation of the CAS or, if not present, the default aggregate flow.

- */ -public class AnnotationDefinedFlow extends JCasFlow_ImplBase { - private String[] toVisitKeys; - private String[] fixedFlow; - private int currentPos; - - public AnnotationDefinedFlow(ToVisit toVisit, FlowConstraints flowConstraints) throws AnalysisEngineProcessException { - if (!(flowConstraints instanceof FixedFlow)) - throw new AnalysisEngineProcessException(new IllegalArgumentException("This flow requires the FixedFlow to determine the default processing order. However, the flow constraints are of type " + flowConstraints.getClass().getCanonicalName())); - this.fixedFlow = toVisit != null ? ((FixedFlow) flowConstraints).getFixedFlow() : null; - this.toVisitKeys = toVisit.getDelegateKeys().toArray(); - this.currentPos = 0; - } - - /** - *

Routes the CAS to the next component defined by the CAS'es {@link ToVisit} annotation or, - * if ToVisit was not found, to the next component as defined by the default fixed flow.

- * - * @return The next component to visit or the next default flow component. - */ - @Override - public Step next() { - String nextAEKey = toVisitKeys != null ? toVisitKeys[currentPos] : fixedFlow[currentPos]; - ++currentPos; - return new SimpleStep(nextAEKey); - } -} diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/HashComparisonFlowController.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/HashComparisonFlowController.java deleted file mode 100644 index 717566675..000000000 --- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/HashComparisonFlowController.java +++ /dev/null @@ -1,117 +0,0 @@ -package de.julielab.jcore.reader.xmi.flowcontroller; - -import de.julielab.costosys.configuration.FieldConfig; -import de.julielab.costosys.dbconnection.CoStoSysConnection; -import de.julielab.costosys.dbconnection.DataBaseConnector; -import de.julielab.jcore.reader.db.DBReader; -import de.julielab.jcore.types.casmultiplier.RowBatch; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.FeatureStructure; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.flow.Flow; -import org.apache.uima.flow.FlowControllerContext; -import org.apache.uima.flow.JCasFlowController_ImplBase; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.cas.StringArray; -import org.apache.uima.resource.ResourceInitializationException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.FileNotFoundException; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.*; -import java.util.stream.Collectors; - -/** - *

Prereque

- *

Expects a jCas as being output by the {@link de.julielab.jcore.reader.xmi.XmiDBMultiplierReader}, i.e. the CAS - * should contain a {@link de.julielab.jcore.types.casmultiplier.RowBatch} annotation. Then, Retrieves the sha256 hashes for - * the passed documents from the database.

- */ -@ResourceMetaData(name = "JCoRe Hash Comparison Flow Controller", description = "This flow controller aims to skip processing for CASes that already exist in the database and haven't changed with regards to a newly read version. For this purpose, the sha256 hash of the CAS document text is compared to the the existing hash in the database for the same document ID. If the hashes match, the text is the same and, thus, the annotations will be the same.") -public class HashComparisonFlowController extends JCasFlowController_ImplBase { - public static final String PARAM_ADD_SHA_HASH = "AddShaHash"; - public static final String PARAM_TABLE_DOCUMENT = "DocumentTable"; - private final static Logger log = LoggerFactory.getLogger(HashComparisonFlowController.class); - @ConfigurationParameter(name = DBReader.PARAM_COSTOSYS_CONFIG_NAME, description = "Path to the CoStoSys configuration XML file that specifies the database this pipeline writes to, i.e. the same file that the DB XMI Writer is using. If there is no DB Writer in use, this flow controller is not applicable.") - private String costosysConfig; - @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, description = "Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS skips all component except the DBCheckpointAE to mark the document as processed.") - private String documentItemToHash; - @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT, description = "String parameter indicating the name of the " + - "table where the XMI data will be stored. 
The name must be schema qualified.") - private String docTableParamValue; - - private DataBaseConnector dbc; - - @Override - public void initialize(FlowControllerContext aContext) throws ResourceInitializationException { - this.costosysConfig = (String) aContext.getConfigParameterValue(DBReader.PARAM_COSTOSYS_CONFIG_NAME); - this.documentItemToHash = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_ADD_SHA_HASH)).orElse("document_text"); - try { - dbc = new DataBaseConnector(this.costosysConfig); - } catch (FileNotFoundException e) { - log.error("Could not create the CoStoSys DatabaseConnector:", e); - throw new ResourceInitializationException(e); - } - } - - @Override - public Flow computeFlow(JCas jCas) throws AnalysisEngineProcessException { - RowBatch rowBatch; - try { - rowBatch = JCasUtil.selectSingle(jCas, RowBatch.class); - } catch (IllegalArgumentException e) { - log.error("Could not select the RowBatch annotation from the JCas:", e); - throw new AnalysisEngineProcessException(e); - } - Map id2hash = fetchCurrentHashesFromDatabase(rowBatch); - return new HashComparisonOuterFlow(id2hash, documentItemToHash, getContext().getAggregateMetadata().getFlowConstraints()); - } - - /** - *

Fetches the hashes of the currently stored documents in the database.

- * @param rowBatch The annotation specifying which documents should be fetched by the multiplier and then be processed by the aggregate. - * @return A map from a string representation of the RowBatches document IDs to the hashes for the respective IDs. - * @throws AnalysisEngineProcessException If the SQL request fails. - */ - private Map fetchCurrentHashesFromDatabase(RowBatch rowBatch) throws AnalysisEngineProcessException { - String dataTable = dbc.getNextDataTable(rowBatch.getTableName()); - String hashColumn = documentItemToHash + "_sha256"; - // Extract the document IDs in this RowBatch. The IDs could be composite keys. - List documentIds = new ArrayList<>(rowBatch.getIdentifiers().size()); - Iterator documentIDsIt = rowBatch.getIdentifiers().iterator(); - while (documentIDsIt.hasNext()) { - StringArray pkArray = (StringArray) documentIDsIt.next(); - documentIds.add(pkArray.toStringArray()); - } - Map id2hash = new HashMap<>(documentIds.size()); - // This is the map we want to fill that lets us look up the hash of the document text by document ID. - String sql = null; - // Query the database for the document IDs in the current RowBatch and retrieve hashes. 
- try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { - FieldConfig activeTableFieldConfiguration = dbc.getActiveTableFieldConfiguration(); - String idQuery = documentIds.stream() - .map(key -> Arrays.stream(key).map(part -> "%s='" + part + '"').toArray(String[]::new)) - .map(activeTableFieldConfiguration::expandPKNames).map(expandedKeys -> String.join(" AND ", expandedKeys)) - .collect(Collectors.joining(" OR ")); - sql = String.format("SELECT %s,%s FROM %s WHERE %s", activeTableFieldConfiguration.getPrimaryKeyString(), hashColumn, dataTable, idQuery); - ResultSet rs = conn.createStatement().executeQuery(sql); - while (rs.next()) { - StringBuilder pkSb = new StringBuilder(); - for (int i = 0; i < activeTableFieldConfiguration.getPrimaryKey().length; i++) - pkSb.append(rs.getString(i)).append(','); - // Remove training comma - pkSb.deleteCharAt(pkSb.length()); - String hash = rs.getString(activeTableFieldConfiguration.getPrimaryKey().length); - id2hash.put(pkSb.toString(), hash); - } - } catch (SQLException e) { - log.error("Could not retrieve hashes from the database. 
SQL query was {}:", sql, e); - throw new AnalysisEngineProcessException(e); - } - return id2hash; - } -} diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/HashComparisonOuterFlow.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/HashComparisonOuterFlow.java deleted file mode 100644 index 09178fa29..000000000 --- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/flowcontroller/HashComparisonOuterFlow.java +++ /dev/null @@ -1,72 +0,0 @@ -package de.julielab.jcore.reader.xmi.flowcontroller; - -import org.apache.commons.codec.binary.Base64; -import org.apache.commons.codec.digest.DigestUtils; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.analysis_engine.metadata.FixedFlow; -import org.apache.uima.analysis_engine.metadata.FlowConstraints; -import org.apache.uima.flow.*; -import org.apache.uima.jcas.JCas; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Map; - -/** - *

Note: This flow can only be used in an aggregate analysis engine where the {@link de.julielab.jcore.reader.xmi.XmiDBMultiplier} is the first component.

- *

This flow is created by the {@link HashComparisonFlowController} and routes the CAS that was filled by the {@link de.julielab.jcore.reader.xmi.XmiDBMultiplierReader}. - * This CAS contains an instance of {@link de.julielab.jcore.types.casmultiplier.RowBatch} which contains the information which documents should be read - * from which database table.

- *

Within this flow, the reader CAS is passed to the multiplier, the first component. For CASes created by the multiplier, - * the method {@link #newCasProduced(JCas, String)} is called for which a new flow concerning the processing order of the - * multiplier-created CASes within the aggregate is determined.

- */ -public class HashComparisonOuterFlow extends JCasFlow_ImplBase { - private final static Logger log = LoggerFactory.getLogger(HashComparisonOuterFlow.class); - private String[] fixedFlow; - private int currentPosition; - private Map id2hash; - private String documentItemToHash; - - public HashComparisonOuterFlow(Map id2hash, String documentItemToHash, FlowConstraints flowConstraints) throws AnalysisEngineProcessException { - this.id2hash = id2hash; - this.documentItemToHash = documentItemToHash; - if (!(flowConstraints instanceof FixedFlow)) { - throw new AnalysisEngineProcessException(new IllegalArgumentException("This flow requires the original FixedFlow to know the order of the delegate engines but the given flow is of type " + flowConstraints.getClass())); - } - FixedFlow fixedFlow = (FixedFlow) flowConstraints; - this.fixedFlow = fixedFlow.getFixedFlow(); - this.currentPosition = 0; - } - - @Override - protected Flow newCasProduced(JCas newCas, String producedBy) throws AnalysisEngineProcessException { - String newHash = getHash(newCas); - return new FixedInnerFlow(fixedFlow); - } - - private String getHash(JCas newCas) { - final String documentText = newCas.getDocumentText(); - final byte[] sha = DigestUtils.sha256(documentText.getBytes()); - return Base64.encodeBase64String(sha); - } - - public Step next() { - Step step = null; - for (; currentPosition < fixedFlow.length && step == null; currentPosition++) { - String aeKey = fixedFlow[currentPosition]; - - // The outer flow only passes the CAS to the CAS multiplier. The multiplier creates more CASes which - // are then passed to newCasProduced() and are then routed by the InnerFlow. - if (currentPosition == 0) { - log.trace("Outer next AE is: " + aeKey); - step = new SimpleStep(aeKey); - } - } - if (step == null) { - // no appropriate AEs to call - end of flow - log.trace("Outer flow Complete."); - } - return step == null ? 
new FinalStep() : step; - } -} diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/flowcontroller/FlowControllerTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/flowcontroller/FlowControllerTest.java deleted file mode 100644 index 5c3d69e64..000000000 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/flowcontroller/FlowControllerTest.java +++ /dev/null @@ -1,101 +0,0 @@ -package de.julielab.jcore.reader.xmi.flowcontroller; - -import de.julielab.jcore.types.Header; -import de.julielab.jcore.types.casmultiplier.RowBatch; -import de.julielab.jcore.utility.JCoReTools; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_component.JCasMultiplier_ImplBase; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.AbstractCas; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.factory.AnalysisEngineFactory; -import org.apache.uima.fit.factory.FlowControllerFactory; -import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.flow.FlowControllerDescription; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.cas.StringArray; -import org.apache.uima.resource.ResourceInitializationException; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.List; - -public class FlowControllerTest { - @Test - public void testFlowController() throws Exception { - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types"); - RowBatch rowBatch = new RowBatch(jCas); - for (int i = 0; i < 10; i++) { - 
StringArray id = new StringArray(jCas, 1); - id.set(0, String.valueOf(i)); - rowBatch.setIdentifiers(JCoReTools.addToFSArray(rowBatch.getIdentifiers(), id)); - } - rowBatch.addToIndexes(); - - FlowControllerDescription flowControllerDescription = FlowControllerFactory.createFlowControllerDescription(HashComparisonFlowController.class); - AnalysisEngineDescription multiplierDesc = AnalysisEngineFactory.createEngineDescription(TestMultiplier.class); - AnalysisEngineDescription testAeDesc1 = AnalysisEngineFactory.createEngineDescription(TestAE.class, "name", "TestAE 1"); - AnalysisEngineDescription testAeDesc2 = AnalysisEngineFactory.createEngineDescription(TestAE.class, "name", "TestAE 2"); - AnalysisEngineDescription aaeWithFlowController = AnalysisEngineFactory.createEngineDescription(flowControllerDescription, multiplierDesc, testAeDesc1, testAeDesc2); - AnalysisEngine aae = AnalysisEngineFactory.createEngine(aaeWithFlowController); - - aae.process(jCas); - } - - public static class TestAE extends JCasAnnotator_ImplBase { - private final static Logger log = LoggerFactory.getLogger(TestAE.class); - - @ConfigurationParameter(name = "name") - private String name; - - @Override - public void initialize(UimaContext context) throws ResourceInitializationException { - name = (String) context.getConfigParameterValue("name"); - } - - @Override - public void process(JCas jCas) throws AnalysisEngineProcessException { - log.debug("Running AE: {}", name); - log.debug("JCas text: " + jCas.getDocumentText()); - } - } - - public static class TestMultiplier extends JCasMultiplier_ImplBase { - private List idsToRead = new ArrayList<>(); - private int currentIndex; - @Override - public void process(JCas jCas) throws AnalysisEngineProcessException { - RowBatch rowbatch = JCasUtil.selectSingle(jCas, RowBatch.class); - idsToRead.clear(); - currentIndex = 0; - for (int i = 0; i < rowbatch.getIdentifiers().size() && rowbatch.getIdentifiers(i) != null; i++) { - // In this test, the 
document IDs consist only of a single element - idsToRead.add(rowbatch.getIdentifiers(i).get(0)); - } - } - - @Override - public boolean hasNext() throws AnalysisEngineProcessException { - return currentIndex < idsToRead.size(); - } - - @Override - public AbstractCas next() throws AnalysisEngineProcessException { - JCas emptyJCas = getEmptyJCas(); - Header header = new Header(emptyJCas); - String docId = idsToRead.get(currentIndex); - header.setDocId(docId); - header.addToIndexes(); - emptyJCas.setDocumentText("ID: " + docId); - ++currentIndex; - return emptyJCas; - } - } - -} diff --git a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java index b7e041f2d..d310158cf 100644 --- a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java +++ b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java @@ -1,23 +1,32 @@ package de.julielab.jcore.reader.xml; +import de.julielab.costosys.configuration.FieldConfig; +import de.julielab.costosys.dbconnection.CoStoSysConnection; import de.julielab.jcore.reader.db.DBMultiplier; import de.julielab.jcore.reader.db.DBReader; import de.julielab.jcore.reader.xmlmapper.mapper.XMLMapper; +import de.julielab.jcore.types.casflow.ToVisit; +import de.julielab.jcore.types.casmultiplier.RowBatch; +import de.julielab.jcore.types.ext.DBProcessingMetaData; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.tuple.Pair; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.AbstractCas; +import org.apache.uima.cas.FeatureStructure; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.util.JCasUtil; import 
org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.StringArray; import org.apache.uima.resource.ResourceInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.*; import java.util.stream.Collectors; @ResourceMetaData(name = "JCoRe XML Database Multiplier", description = "This CAS multiplier receives information about " + @@ -29,9 +38,13 @@ "JeDIS." , vendor = "JULIE Lab Jena, Germany", copyright = "JULIE Lab Jena, Germany") public class XMLDBMultiplier extends DBMultiplier { -private final static Logger log = LoggerFactory.getLogger(XMLDBMultiplier.class); public static final String PARAM_ROW_MAPPING = Initializer.PARAM_ROW_MAPPING; public static final String PARAM_MAPPING_FILE = Initializer.PARAM_MAPPING_FILE; + public static final String PARAM_ADD_SHA_HASH = "AddShaHash"; + public static final String PARAM_TABLE_DOCUMENT = "DocumentTable"; + public static final String PARAM_TO_VISIT_KEYS = "ToVisitKeys"; + + private final static Logger log = LoggerFactory.getLogger(XMLDBMultiplier.class); /** * Mapper which maps medline XML to a CAS with the specified UIMA type system * via an XML configuration file. @@ -41,8 +54,18 @@ public class XMLDBMultiplier extends DBMultiplier { protected String[] rowMappingArray; @ConfigurationParameter(name = PARAM_MAPPING_FILE, description = XMLDBReader.DESC_MAPPING_FILE) protected String mappingFileStr; + @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, mandatory = false, description = "For use with AnnotationDefinedFlowController. Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. 
Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS skips all component except the DBCheckpointAE to mark the document as processed.") + private String documentItemToHash; + @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT, mandatory = false, description = "For use with AnnotationDefinedFlowController. String parameter indicating the name of the " + + "table where the XMI data and, thus, the hash is stored. The name must be schema qualified. Note that in this component, only the ToVisit annotation is created that determines which components to apply to a CAS with matching (unchanged) hash. The logic to actually control the CAS flow is contained in the AnnotationDefinedFlowController.") + private String xmiStorageDataTable; + @ConfigurationParameter(name = PARAM_TO_VISIT_KEYS, mandatory = false, description = "For use with AnnotationDefinedFlowController. The delegate AE keys of the AEs this CAS should still applied on although the hash has not changed. Can be null or empty indicating that no component should be applied to the CAS. 
This is, however, the task of the AnnotationDefinedFlowController.") + private String[] toVisitKeys; + + private Row2CasMapper row2CasMapper; private CasPopulator casPopulator; + private Map docId2HashMap; private boolean initialized; @@ -51,7 +74,9 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept super.initialize(aContext); mappingFileStr = (String) aContext.getConfigParameterValue(PARAM_MAPPING_FILE); rowMappingArray = (String[]) aContext.getConfigParameterValue(PARAM_ROW_MAPPING); - + xmiStorageDataTable = (String) aContext.getConfigParameterValue(PARAM_TABLE_DOCUMENT); + documentItemToHash = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_ADD_SHA_HASH)).orElse("document_text"); + toVisitKeys = (String[]) aContext.getConfigParameterValue(PARAM_TO_VISIT_KEYS); // We don't know yet which tables to read. Thus, we leave the row mapping out. // We will now once the DBMultiplier#process(JCas) will have been run. Initializer initializer = new Initializer(mappingFileStr, null, null); @@ -59,6 +84,11 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept initialized = false; } + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException { + super.process(aJCas); + docId2HashMap = fetchCurrentHashesFromDatabase(JCasUtil.selectSingle(aJCas, RowBatch.class)); + } @Override public AbstractCas next() throws AnalysisEngineProcessException { @@ -78,6 +108,7 @@ public AbstractCas next() throws AnalysisEngineProcessException { } byte[][] documentData = documentDataIterator.next(); populateCas(jCas, documentData); + setToVisitAnnotation(jCas); } } catch (Exception e) { log.error("Exception occurred: ", e); @@ -86,6 +117,41 @@ public AbstractCas next() throws AnalysisEngineProcessException { return jCas; } + /** + *

Creates a {@link ToVisit} annotation based on document text hash comparison and the defined parameter values.

+ *

Computes the hash of the newly read CAS and compares it to the hash for the same document retrieved from the + * database, if present. If there was a hash in the database and the hash values are equal, creates the ToVisit + * annotation and adds the toVisitKeys passed in the configuration of this component.

+ * + * @param jCas The newly read JCas. + */ + private void setToVisitAnnotation(JCas jCas) { + if (xmiStorageDataTable != null) { + DBProcessingMetaData dbProcessingMetaData = JCasUtil.selectSingle(jCas, DBProcessingMetaData.class); + StringArray pkArray = dbProcessingMetaData.getPrimaryKey(); + String pkString = String.join(",", pkArray.toArray()); + String existingHash = docId2HashMap.get(pkString); + if (existingHash != null) { + String newHash = getHash(jCas); + if (existingHash.equals(newHash)) { + ToVisit toVisit = new ToVisit(jCas); + if (toVisitKeys != null && toVisitKeys.length != 0) { + StringArray keysArray = new StringArray(jCas, toVisitKeys.length); + keysArray.copyFromArray(toVisitKeys, 0, 0, toVisitKeys.length); + toVisit.setDelegateKeys(keysArray); + } + toVisit.addToIndexes(); + } + } + } + } + + private String getHash(JCas newCas) { + final String documentText = newCas.getDocumentText(); + final byte[] sha = DigestUtils.sha256(documentText.getBytes()); + return Base64.encodeBase64String(sha); + } + private void populateCas(JCas jCas, byte[][] documentData) throws AnalysisEngineProcessException { try { casPopulator.populateCas(jCas, documentData, @@ -100,4 +166,51 @@ protected List> getAllRetrievedColumns() { Pair>> numColumnsAndFields = dbc.getNumColumnsAndFields(tables.length > 1, schemaNames); return numColumnsAndFields.getRight().stream().map(HashMap::new).collect(Collectors.toList()); } + + /** + *

Fetches the hashes of the currently stored documents in the database.

+ * + * @param rowBatch The annotation specifying which documents should be fetched by the multiplier and then be processed by the aggregate. + * @return A map from a string representation of the RowBatches document IDs to the hashes for the respective IDs. + * @throws AnalysisEngineProcessException If the SQL request fails. + */ + private Map fetchCurrentHashesFromDatabase(RowBatch rowBatch) throws AnalysisEngineProcessException { + if (xmiStorageDataTable != null) { + String hashColumn = documentItemToHash + "_sha256"; + // Extract the document IDs in this RowBatch. The IDs could be composite keys. + List documentIds = new ArrayList<>(rowBatch.getIdentifiers().size()); + Iterator documentIDsIt = rowBatch.getIdentifiers().iterator(); + while (documentIDsIt.hasNext()) { + StringArray pkArray = (StringArray) documentIDsIt.next(); + documentIds.add(pkArray.toStringArray()); + } + Map id2hash = new HashMap<>(documentIds.size()); + // This is the map we want to fill that lets us look up the hash of the document text by document ID. + String sql = null; + // Query the database for the document IDs in the current RowBatch and retrieve hashes. 
+ try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { + FieldConfig activeTableFieldConfiguration = dbc.getActiveTableFieldConfiguration(); + String idQuery = documentIds.stream() + .map(key -> Arrays.stream(key).map(part -> "%s='" + part + '"').toArray(String[]::new)) + .map(activeTableFieldConfiguration::expandPKNames).map(expandedKeys -> String.join(" AND ", expandedKeys)) + .collect(Collectors.joining(" OR ")); + sql = String.format("SELECT %s,%s FROM %s WHERE %s", activeTableFieldConfiguration.getPrimaryKeyString(), hashColumn, xmiStorageDataTable, idQuery); + ResultSet rs = conn.createStatement().executeQuery(sql); + while (rs.next()) { + StringBuilder pkSb = new StringBuilder(); + for (int i = 0; i < activeTableFieldConfiguration.getPrimaryKey().length; i++) + pkSb.append(rs.getString(i)).append(','); + // Remove training comma + pkSb.deleteCharAt(pkSb.length()); + String hash = rs.getString(activeTableFieldConfiguration.getPrimaryKey().length); + id2hash.put(pkSb.toString(), hash); + } + } catch (SQLException e) { + log.error("Could not retrieve hashes from the database. SQL query was {}:", sql, e); + throw new AnalysisEngineProcessException(e); + } + return id2hash; + } + return null; + } } diff --git a/pom.xml b/pom.xml index 8768f3a25..87deb6229 100644 --- a/pom.xml +++ b/pom.xml @@ -70,7 +70,9 @@
- + + jcore-annotation-adder-ae + jcore-ace-reader jcore-acronym-ae @@ -78,7 +80,11 @@ jcore-acronym-writer jcore-banner-ae - + + jcore-bc2gm-reader + + jcore-bc2gmformat-writer + jcore-biolemmatizer-ae jcore-bionlpformat-consumer @@ -91,12 +97,16 @@ jcore-coordination-baseline-ae + jcore-cord19-reader + jcore-coreference-writer jcore-ct-reader - + + jcore-db-checkpoint-ae + jcore-descriptor-creator - + jcore-dta-reader jcore-ec-code-ae @@ -112,6 +122,10 @@ jcore-file-reader jcore-flair-ner-ae + + jcore-flair-token-embedding-ae + + jcore-flow-controllers jcore-iexml-consumer @@ -134,7 +148,9 @@ jcore-likelihood-assignment-ae jcore-likelihood-detection-ae - + + jcore-line-multiplier + jcore-lingpipegazetteer-ae jcore-lingpipe-porterstemmer-ae @@ -154,6 +170,8 @@ jcore-muc7-reader jcore-mutationfinder-ae + + jcore-neo4j-relations-consumer jcore-opennlp-chunk-ae @@ -164,7 +182,9 @@ jcore-opennlp-sentence-ae jcore-opennlp-token-ae - + + jcore-ppd-writer + jcore-pmc-reader jcore-pubtator-reader @@ -191,24 +211,6 @@ jedis-parent - jcore-db-checkpoint-ae - - jcore-ppd-writer - - jcore-bc2gmformat-writer - - jcore-bc2gm-reader - - jcore-annotation-adder-ae - - jcore-flair-token-embedding-ae - - jcore-line-multiplier - - jcore-cord19-reader - - jcore-neo4j-relations-consumer - diff --git a/scripts/createMetaDescriptors.py b/scripts/createMetaDescriptors.py index 2da6f7de0..6c940f20b 100755 --- a/scripts/createMetaDescriptors.py +++ b/scripts/createMetaDescriptors.py @@ -12,15 +12,14 @@ -v: The version of the repository -u: If the repository does not yet exist: If is updateable or not """ +import fnmatch +import json import os -import sys import re -from os.path import expanduser -import json -import fnmatch +import sys import xml.etree.ElementTree as ET +from os.path import expanduser from xml.etree.ElementTree import ParseError -from collections import Counter # For testing we define in and out names so we can create new versions and compare META_DESC_IN_NAME = 
"component.meta" @@ -66,6 +65,8 @@ def getArtifactInfo(pomFile): category = "consumer" if (artifactId.endswith("writer")): category = "consumer" + if (artifactId.endswith("flowcontroller")): + category = "flowcontroller" artifact = {} artifact["artifactId"] = artifactId @@ -111,6 +112,8 @@ def getDescriptors(projectpath): category = "consumer" if descriptorRoot.tag.endswith("casConsumerDescription"): category = "consumer" + if descriptorRoot.tag.endswith("flowControllerDescription"): + category = "flowcontroller" if category != None: # From the complete file name, exclude the system dependent part. That is, make the path relative to the # project directory's src/main/resources directory. From f1488a0cc9e764c3755f9c018efe1dec1d9eca80 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 4 Jun 2021 15:49:54 +0200 Subject: [PATCH 056/269] Currently adding a test for the hash value comparison. Not finished yet. --- jcore-flow-controllers/component.meta | 20 ++++ jcore-flow-controllers/pom.xml | 4 +- .../AnnotationDefinedFlowControllerTest.java | 35 +----- jcore-xml-db-reader/pom.xml | 13 ++- .../jcore/reader/xml/XMLDBMultiplierTest.java | 107 ++++++++++++++++++ jedis-parent/pom.xml | 4 +- 6 files changed, 143 insertions(+), 40 deletions(-) create mode 100644 jcore-flow-controllers/component.meta create mode 100644 jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java diff --git a/jcore-flow-controllers/component.meta b/jcore-flow-controllers/component.meta new file mode 100644 index 000000000..d8f783bd8 --- /dev/null +++ b/jcore-flow-controllers/component.meta @@ -0,0 +1,20 @@ +{ + "categories": [ + "flowcontroller" + ], + "description": "Flow controllers can be used to control the route a (J)CAS takes through an aggregate analysis engine. 
This project contains Flow Controllers developed at the JULIE Lab.", + "descriptors": [ + { + "category": "flowcontroller", + "location": "de.julielab.jcore.flow.annotationdefined.desc.jcore-annotation-defined-flowcontroller" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-flow-controllers", + "groupId": "de.julielab", + "version": "2.6.0-SNAPSHOT" + }, + "name": "JCoRe Flow Controllers" +} diff --git a/jcore-flow-controllers/pom.xml b/jcore-flow-controllers/pom.xml index a316b81ad..d31933489 100644 --- a/jcore-flow-controllers/pom.xml +++ b/jcore-flow-controllers/pom.xml @@ -22,8 +22,8 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine test diff --git a/jcore-flow-controllers/src/test/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowControllerTest.java b/jcore-flow-controllers/src/test/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowControllerTest.java index f5daabd0f..228e94a49 100644 --- a/jcore-flow-controllers/src/test/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowControllerTest.java +++ b/jcore-flow-controllers/src/test/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowControllerTest.java @@ -15,7 +15,7 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.StringArray; import org.apache.uima.resource.ResourceInitializationException; -import org.junit.Test; +import org.junit.jupiter.api.Test; import static java.util.Arrays.asList; import static org.assertj.core.api.Assertions.assertThat; @@ -107,37 +107,4 @@ public void process(JCas jCas) { token.addToIndexes(); } } - -// public static class TestMultiplier extends JCasMultiplier_ImplBase { -// private List idsToRead = new ArrayList<>(); -// private int currentIndex; -// @Override -// public void process(JCas jCas) throws AnalysisEngineProcessException { -// RowBatch rowbatch = JCasUtil.selectSingle(jCas, RowBatch.class); -// 
idsToRead.clear(); -// currentIndex = 0; -// for (int i = 0; i < rowbatch.getIdentifiers().size() && rowbatch.getIdentifiers(i) != null; i++) { -// // In this test, the document IDs consist only of a single element -// idsToRead.add(rowbatch.getIdentifiers(i).get(0)); -// } -// } -// -// @Override -// public boolean hasNext() throws AnalysisEngineProcessException { -// return currentIndex < idsToRead.size(); -// } -// -// @Override -// public AbstractCas next() throws AnalysisEngineProcessException { -// JCas emptyJCas = getEmptyJCas(); -// Header header = new Header(emptyJCas); -// String docId = idsToRead.get(currentIndex); -// header.setDocId(docId); -// header.addToIndexes(); -// emptyJCas.setDocumentText("ID: " + docId); -// ++currentIndex; -// return emptyJCas; -// } -// } - } diff --git a/jcore-xml-db-reader/pom.xml b/jcore-xml-db-reader/pom.xml index 145fcc69a..8447584a9 100644 --- a/jcore-xml-db-reader/pom.xml +++ b/jcore-xml-db-reader/pom.xml @@ -64,8 +64,17 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine + test + + + de.julielab + jcore-db-test-utilities + + + org.assertj + assertj-core
https://github.com/JULIELab/jcore-base/jcore-xml-db-reader diff --git a/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java b/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java new file mode 100644 index 000000000..910c682ba --- /dev/null +++ b/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java @@ -0,0 +1,107 @@ +package de.julielab.jcore.reader.xml; + + +import de.julielab.costosys.dbconnection.CoStoSysConnection; +import de.julielab.costosys.dbconnection.DataBaseConnector; +import de.julielab.jcore.db.test.DBTestUtils; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.configuration2.ex.ConfigurationException; +import org.apache.uima.UIMAException; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.PostgreSQLContainer; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; +public class XMLDBMultiplierTest { + + public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer("postgres:11.12"); + private static String costosysConfig; + private static final String SOURCE_XML_TABLE = "source_xml_table"; + private static final String TARGET_XMI_TABLE = "target_xmi_table"; + private static final String PMID_FIELD_NAME = "pmid"; + private static final String DOCID_FIELD_NAME = "docid"; + private static final String XML_FIELD_NAME = "xml"; + private static final String BASE_DOCUMENT_FIELD_NAME = "base_document"; + private static final String HASH_FIELD_NAME = "documentText_sha256"; + private static final String MAX_XMI_ID_FIELD_NAME = "max_xmi_id"; + private static final String 
SOFA_MAPPING_FIELD_NAME = "sofa_mapping"; + + @BeforeAll + public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { + postgres.start(); + DBTestUtils.createAndSetHiddenConfig(Path.of("src", "test", "resources", "hiddenConfig").toString(), postgres); + + DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setActiveTableSchema("medline_2016_nozip"); + costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2016_nozip", 1, postgres); + new File(costosysConfig).deleteOnExit(); + try(CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { + prepareSourceXMLTable(dbc, conn); + prepareTargetXMITable(dbc, conn); + } + assertThat(dbc.getNumRows(SOURCE_XML_TABLE)).isEqualTo(10); + assertThat(dbc.getNumRows(TARGET_XMI_TABLE)).isEqualTo(5); + dbc.close(); + } + + private static void prepareSourceXMLTable(DataBaseConnector dbc, CoStoSysConnection conn) throws SQLException { + String xmlFmt = "%d"; + dbc.createTable(SOURCE_XML_TABLE, "Test table for hash comparison test."); + String sql = String.format("INSERT INTO %s (%s,%s) VALUES (?,?)", SOURCE_XML_TABLE, PMID_FIELD_NAME, XML_FIELD_NAME); + PreparedStatement ps = conn.prepareStatement(sql); + for (int i = 0; i < 10; i++) { + String xml = String.format(xmlFmt, i); + ps.setString(1, String.valueOf(i)); + ps.setString(2, xml); + ps.addBatch(); + } + ps.executeBatch(); + } + + private static void prepareTargetXMITable(DataBaseConnector dbc, CoStoSysConnection conn) throws SQLException { + // Note that the root is "xmi" and not "xml" + String xmlFmt = "%d"; + dbc.createTable(TARGET_XMI_TABLE, "xmi_text", "Test table for hash comparison test."); + dbc.assureColumnsExist(TARGET_XMI_TABLE, List.of(HASH_FIELD_NAME), "text"); + String sql = String.format("INSERT INTO %s (%s,%s,%s,%s,%s) VALUES (?,XMLPARSE(CONTENT ?),?,?,?)", TARGET_XMI_TABLE, DOCID_FIELD_NAME, BASE_DOCUMENT_FIELD_NAME, HASH_FIELD_NAME, MAX_XMI_ID_FIELD_NAME, SOFA_MAPPING_FIELD_NAME); + 
PreparedStatement ps = conn.prepareStatement(sql); + // Note that we only add half of the documents compared to the source XML import. This way we test + // if the code behaves right when the target document does not yet exist at all. + for (int i = 0; i < 5; i++) { + String xml = String.format(xmlFmt, i); + ps.setString(1, String.valueOf(i)); + ps.setString(2, xml); + ps.setString(3, getHash(xml)); + ps.setInt(4, 0); + ps.setString(5, "dummy"); + ps.addBatch(); + } + ps.executeBatch(); + } + + @AfterAll + public static void tearDown() { + postgres.stop(); + } + + private static String getHash(String str) { + final byte[] sha = DigestUtils.sha256(str.getBytes()); + return Base64.encodeBase64String(sha); + } + + @Test + public void testHashComparison() { + + } + + +} diff --git a/jedis-parent/pom.xml b/jedis-parent/pom.xml index 71ffa5ceb..48d783b6d 100644 --- a/jedis-parent/pom.xml +++ b/jedis-parent/pom.xml @@ -17,12 +17,12 @@ de.julielab costosys - 1.5.1 + 1.5.2-SNAPSHOT de.julielab jcore-db-test-utilities - 2.5.0 + 2.5.1-SNAPSHOT de.julielab From 50c442f00b60eaf64f8ef6fa58fa9a9ffc41c4a7 Mon Sep 17 00:00:00 2001 From: khituras Date: Sun, 6 Jun 2021 11:52:48 +0200 Subject: [PATCH 057/269] XML Mapper: Adding documentation regarding the mapping file structure. --- jcore-xml-mapper/README.md | 99 ++++++++++++++++++- .../medlineMappingFileStructuredAbstract.xml | 3 +- 2 files changed, 100 insertions(+), 2 deletions(-) diff --git a/jcore-xml-mapper/README.md b/jcore-xml-mapper/README.md index 5bcf986c3..6eacab174 100644 --- a/jcore-xml-mapper/README.md +++ b/jcore-xml-mapper/README.md @@ -3,7 +3,7 @@ NOTE: This is not a UIMA component but rather a library used by some JCoRe compo This is a generic XML mapper to create CAS instances reflecting contents of XML documents. ### Objective -The JULIE Lab XMLMapper is a mapper which maps XML elements from an XML document onto (UIMA) Types or Type Features. For that task it uses a mapping file, which comes as an input. 
+The JULIE Lab XMLMapper is a mapper which maps XML elements from an XML document onto (UIMA) types or type features. For that task it uses a mapping file, which comes as an input. Examples for mapping files are found in some [jcore-projects](https://github.com/JULIELab/jcore-projects) components, for example the [jcore-pubmed-reader](https://github.com/JULIELab/jcore-projects/tree/master/jcore-pubmed-reader), its MEDLINE-pendant or the database versions of both. @@ -14,4 +14,101 @@ The input and output of an AE is done via annotation objects. The classes corres ### Using the AE - Descriptor Configuration In UIMA, each component is configured by a descriptor in XML. Such a preconfigured descriptor is available under `src/main/resources/de/julielab/jcore/ ` but it can be further edited if so desired; see [UIMA SDK User's Guide](https://uima.apache.org/downloads/releaseDocs/2.1.0-incubating/docs/html/tools/tools.html#ugr.tools.cde) for further information. +### Mapping File Syntax +Please note that this section is incomplete. The mapping file of the [jcore-pubmed-reader](https://github.com/JULIELab/jcore-projects/tree/master/jcore-pubmed-reader) includes examples for all supported features. + +The basic structure of the mapping file consists of the `` root element, a `` root child element and an arbitrary number of `` ('type system type', referring to the UIMA type system to be employed) root child elements: + +```xml + + + ... + + + ... + + + ... + + ... + +``` + +## Document Text +The CAS document text is populated with the `` mapping element. It defines an arbitrary number of `` elements of whose mapping values the document text will be comprised, in the order of the `` elements in the mapping file. Each document part is given a manually defined ID which can be referred to in order to create a UIMA annotation covering the respective document part text. The location of the actual character data in the mapped document XML files is specified via XPath. 
+ +```xml + + + /MedlineCitation/Article/ArticleTitle + + + /MedlineCitation/Article/Abstract + + +``` + +This example collects the article title, and the abstract of a MEDLINE XML document for the CAS document text. + +The `` may have an optional child element named ``. This is useful or even a necessity when the document structure for this element is not static, i.e. has a varying number of children. In such a case, a user-delivered class on the classpath can be specified. This class must implement the `de.julielab.jcore.reader.xmlmapper.mapper.DocumentTextPartParser` interface and received the document XML element that the XPath in the mapping file points to. It then returns a list of strings using to comprise the respective part of the document text: + +```xml + + /MedlineCitation/Article/Abstract + + de.julielab.jcore.reader.xmlmapper.mapper.StructuredAbstractParser + + +``` + +The `StructuredAbstractParser` is able to parse the child elements of `/MedlineCitation/Article/Abstract`, namely `AbstractText` elements which also have attributes, `Label` and `NlmCategory`. Those are details to the MEDLINE XML format and are just use here as an example use case for external parsers. + +## UIMA Type Annotations + +Annotations are added with the `` element. Its main children are `` and ``, defining the actual type to be instantiated and any feature values that should be added to the type. Since a UIMA type feature can itself be a type, `` elements can be nested. Then, the `` child of a `` element is resolved *relative* the `` of the parent `` element. Thus, when the parent `` element does not specify an `` element, which is perfectly legal, the given xpath is resolved from the XML document root: + +```xml + + fully qualified UIMA type name + + feature name of the type + true if the feature value is a UIMA feature structure (annotation) itself + + The value data type of the feature as it is passed to the setter for this feature in Java code. 
+ This can also be an array type, e.g. org.apache.uima.jcas.cas.FSArray. + + + optional if the parent tsFullClassName is an array type + true + + absolute xpath since the parent does not specify an xpath + + + fully qualified UIMA type name of this nested type + + + + name of this feature relative to the parent fsFullClassName type + relative xpath to the parent xpath + a primitive data type (or a string) since this is not a UIMA type itself (missing isType element). + + + + +``` +The above example showcases the structure of a nested annotation, i.e. a feature path. The outer type will have another type as feature value which in turn has a primitive value as the final feature value. + +**Important** The `` values are evaluated for *all occurrences* of the respective XPath in the XML document. Thus, the above annotations will be created for all XPath matches. This holds true for every level of `` specifications. This allows collecting child XML document elements into arrays. An outer xpath points to the collection document elements, and an inner xpath points the children. + +The `` element again accepts the child element ``. In this case, the external parser needs to implement the `de.julielab.jcore.reader.xmlmapper.typeParser.TypeParser` interface. It might be helpful to extend the class `de.julielab.jcore.reader.xmlmapper.typeParser.StandardTypeParser` and use its `parseSingleType` method. 
+ +Finally, the `` element accepts the `` child element which can point to a part of document text, thus create an annotation for the respective document text part as identified by its ID: + +```xml + + + 0 + + +``` diff --git a/jcore-xml-mapper/src/test/resources/medlineMappingFileStructuredAbstract.xml b/jcore-xml-mapper/src/test/resources/medlineMappingFileStructuredAbstract.xml index 9badb769f..eca924537 100644 --- a/jcore-xml-mapper/src/test/resources/medlineMappingFileStructuredAbstract.xml +++ b/jcore-xml-mapper/src/test/resources/medlineMappingFileStructuredAbstract.xml @@ -5,7 +5,8 @@ /MedlineCitation/Article/Abstract - de.julielab.jcore.reader.xmlmapper.mapper.StructuredAbstractParser + + de.julielab.jcore.reader.xmlmapper.mapper.StructuredAbstractParser From 29b8b7b0c3888108fa6b4aedf4ca599bda13bd85 Mon Sep 17 00:00:00 2001 From: khituras Date: Sun, 6 Jun 2021 14:14:28 +0200 Subject: [PATCH 058/269] Adapted some error messages. Removed fest reflect from the `StandardTypeBuilder` and the pom.xml and replaced it with default Java reflect. --- jcore-xml-mapper/README.md | 8 +++---- jcore-xml-mapper/pom.xml | 10 -------- .../xmlmapper/genericTypes/TypeFactory.java | 23 ++++++++----------- .../typeBuilder/StandardTypeBuilder.java | 9 +++++--- 4 files changed, 20 insertions(+), 30 deletions(-) diff --git a/jcore-xml-mapper/README.md b/jcore-xml-mapper/README.md index 6eacab174..e1fa47aac 100644 --- a/jcore-xml-mapper/README.md +++ b/jcore-xml-mapper/README.md @@ -35,7 +35,7 @@ The basic structure of the mapping file consists of the `` root elemen ``` ## Document Text -The CAS document text is populated with the `` mapping element. It defines an arbitrary number of `` elements of whose mapping values the document text will be comprised, in the order of the `` elements in the mapping file. Each document part is given a manually defined ID which can be referred to in order to create a UIMA annotation covering the respective document part text. 
The location of the actual character data in the mapped document XML files is specified via XPath. +The CAS document text is populated with the `` mapping element. It defines an arbitrary number of `` elements of whose mapping values the document text will be comprised, in the order of the `` elements in the mapping file. Each document part is given a mandatory, manually defined ID which can be referred to in order to create a UIMA annotation covering the respective document part text. The location of the actual character data in the mapped document XML files is specified via XPath. ```xml @@ -106,9 +106,9 @@ Finally, the `` element accepts the `` child element which ca ```xml - - 0 - + + 0 + ``` diff --git a/jcore-xml-mapper/pom.xml b/jcore-xml-mapper/pom.xml index ded5b9a9f..7264e7079 100644 --- a/jcore-xml-mapper/pom.xml +++ b/jcore-xml-mapper/pom.xml @@ -22,16 +22,6 @@ jcore-types ${jcore-types-version} - - org.easytesting - fest-reflect - 1.2 - - - org.easytesting - fest-util - 1.1.4 - de.julielab julie-xml-tools diff --git a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/genericTypes/TypeFactory.java b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/genericTypes/TypeFactory.java index f9408edad..0a36ccc70 100644 --- a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/genericTypes/TypeFactory.java +++ b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/genericTypes/TypeFactory.java @@ -65,11 +65,11 @@ public class TypeFactory { /** * creates a new instance of the TypeFactory * - * @param mappingFile + * @param mappingFileData The mapping file contents. 
*/ public TypeFactory(byte[] mappingFileData) { this.mappingFileData = mappingFileData; - types = new ArrayList(); + types = new ArrayList<>(); this.documentTextParser = new DocumentTextHandler(); } @@ -132,7 +132,7 @@ public List createTemplates() throws CollectionException { } } else { if (!nodeName.equals(ROOT)) { - LOGGER.warn("unknown tag in mapping file: " + nodeName + "!!"); + LOGGER.warn("unknown tag in mapping file (note that element names are case sensitive): " + nodeName); } } } @@ -156,7 +156,8 @@ private void fillDocumentParser(XMLEventReader reader) throws XMLStreamException id = Integer.parseInt(next.getValue()); documentTextParser.addPartOfDocumentTextXPath(id); } - } else { + } + else { LOGGER.error("no id for " + PART_OF_DOCUMENT_TEXT); throw new RuntimeException(); } @@ -172,7 +173,7 @@ private void fillDocumentParser(XMLEventReader reader) throws XMLStreamException if (xpath.length() > 0 && id >= 0) { documentTextParser.setXPathForPartOfDocumentText(id, xpath); } else { - LOGGER.error("Unkown data in " + DOCUMENT_TEXT + "/" + VALUE_X_PATH + " tag "); + LOGGER.error("Unknown data in " + DOCUMENT_TEXT + "/" + VALUE_X_PATH + " tag "); } } else if (nodeName.equals(EXTERNAL_PARSER)){ event = reader.nextEvent(); @@ -183,7 +184,7 @@ private void fillDocumentParser(XMLEventReader reader) throws XMLStreamException if (externalParserClassName.length() > 0 && id >= 0) { documentTextParser.setExternalParserForPartOfDocument(id, externalParserClassName); } else { - LOGGER.error("Unkown data in " + DOCUMENT_TEXT + "/" + VALUE_X_PATH + " tag "); + LOGGER.error("Unknown data in " + DOCUMENT_TEXT + "/" + VALUE_X_PATH + " tag "); } } else { @@ -242,15 +243,11 @@ private TypeTemplate parseType(XMLEventReader reader) throws XMLStreamException, type.addAdditionalData(event.asCharacters().getData().trim(), index); } } else { - LOGGER.warn("unknown tag in mapping file: " + nodeName + "!!"); + LOGGER.warn("unknown tag in mapping file (note that element names are case 
sensitive): " + nodeName); } } event = reader.nextEvent(); } - // reflection type anlegen - // iteration über alle features - // if(feature.type==null) - // über getter bestimmen return type; } @@ -273,7 +270,7 @@ private void parseOffset(TypeTemplate type, XMLEventReader reader) throws XMLStr } } } else { - LOGGER.error("Unknown element in mapping file: " + nodeName); + LOGGER.error("Unknown element in mapping file (note that element names are case sensitive): " + nodeName); } } } @@ -315,7 +312,7 @@ private FeatureTemplate parseFeature(XMLEventReader reader) throws XMLStreamExce FeatureTemplate newFeature = parseFeature(reader); feature.addFeature(newFeature); } else { - LOGGER.warn("unknown tag in mapping file: " + nodeName + "!!"); + LOGGER.warn("unknown tag in mapping file (note that element names are case sensitive): " + nodeName); } } event = reader.nextEvent(); diff --git a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/typeBuilder/StandardTypeBuilder.java b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/typeBuilder/StandardTypeBuilder.java index ca3bbec18..9fa1c46b2 100644 --- a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/typeBuilder/StandardTypeBuilder.java +++ b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/typeBuilder/StandardTypeBuilder.java @@ -27,7 +27,6 @@ import java.util.HashMap; -import static org.fest.reflect.core.Reflection.constructor; import static org.fest.reflect.core.Reflection.method; /** @@ -120,7 +119,11 @@ private Annotation buildSingleInstance(ConcreteType concreteType, JCas jcas) thr if (concreteType.getConcreteFeatures() != null) { // Create the UIMA type corresponding to the type description in // concreteType. 
- type = (Annotation) constructor().withParameterTypes(JCas.class).in(typeClass).newInstance(jcas); + try { + type = (Annotation) typeClass.getConstructor(JCas.class).newInstance(jcas); + } catch (Exception e){ + throw new CollectionException(e); + } // For each feature this type has, set the corret feature value. for (ConcreteFeature concreteFeature : concreteType.getConcreteFeatures()) { @@ -151,7 +154,7 @@ private Annotation buildSingleInstance(ConcreteType concreteType, JCas jcas) thr .invoke(parseValueStringToValueType(concreteFeature.getValue(), concreteFeature.getFullClassName())); } else if (concreteFeature.getFullClassName().equals("String") || concreteFeature.getFullClassName().equals("java.lang.String")) { featureClass = Class.forName(concreteFeature.getFullClassName()); - method(methodName).withParameterTypes(featureClass).in(type).invoke(concreteFeature.getValue()); + typeClass.getMethod(methodName, featureClass).invoke(type, concreteFeature.getValue()); } else { String featureClassName = concreteFeature.getFullClassName(); if (StringUtils.isBlank(featureClassName)) From 4e29844a7a2809cc65bb9dfc0ac16c21d320a721 Mon Sep 17 00:00:00 2001 From: khituras Date: Sun, 6 Jun 2021 15:18:31 +0200 Subject: [PATCH 059/269] Continued with the test for hash comparison. Not yet finished. 
--- .../jcore/reader/db/DBMultiplier.java | 2 + .../jcore/reader/xml/XMLDBMultiplier.java | 29 ++++-- .../jcore/reader/xml/XMLDBMultiplierTest.java | 97 +++++++++++++++++-- .../src/test/resources/test-mappingfile.xml | 17 ++++ 4 files changed, 127 insertions(+), 18 deletions(-) create mode 100644 jcore-xml-db-reader/src/test/resources/test-mappingfile.xml diff --git a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java index 195e30de7..17040c15e 100644 --- a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java +++ b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java @@ -9,6 +9,7 @@ import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.OperationalProperties; import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; @@ -34,6 +35,7 @@ "populate CASes with them. 
This component is a part of the Jena Document Information System, JeDIS.", vendor = "JULIE Lab Jena, Germany", copyright = "JULIE Lab Jena, Germany") @OperationalProperties(outputsNewCases = true) +@TypeCapability(inputs = {"de.julielab.jcore.types.casmultiplier.RowBatch"}) public abstract class DBMultiplier extends JCasMultiplier_ImplBase { protected DataBaseConnector dbc; diff --git a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java index d310158cf..d6c1f7186 100644 --- a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java +++ b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java @@ -17,6 +17,7 @@ import org.apache.uima.cas.FeatureStructure; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.StringArray; @@ -37,11 +38,13 @@ "CAS with them via the 'RowMapping' parameter. This component is part of the Jena Document Information System, " + "JeDIS." 
, vendor = "JULIE Lab Jena, Germany", copyright = "JULIE Lab Jena, Germany") +@TypeCapability(inputs = {"de.julielab.jcore.types.casmultiplier.RowBatch"}, outputs = {"de.julielab.jcore.types.casflow.ToVisit"}) public class XMLDBMultiplier extends DBMultiplier { public static final String PARAM_ROW_MAPPING = Initializer.PARAM_ROW_MAPPING; public static final String PARAM_MAPPING_FILE = Initializer.PARAM_MAPPING_FILE; public static final String PARAM_ADD_SHA_HASH = "AddShaHash"; public static final String PARAM_TABLE_DOCUMENT = "DocumentTable"; + public static final String PARAM_TABLE_DOCUMENT_SCHEMA = "DocumentTableSchema"; public static final String PARAM_TO_VISIT_KEYS = "ToVisitKeys"; private final static Logger log = LoggerFactory.getLogger(XMLDBMultiplier.class); @@ -59,6 +62,8 @@ public class XMLDBMultiplier extends DBMultiplier { @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT, mandatory = false, description = "For use with AnnotationDefinedFlowController. String parameter indicating the name of the " + "table where the XMI data and, thus, the hash is stored. The name must be schema qualified. Note that in this component, only the ToVisit annotation is created that determines which components to apply to a CAS with matching (unchanged) hash. The logic to actually control the CAS flow is contained in the AnnotationDefinedFlowController.") private String xmiStorageDataTable; + @ConfigurationParameter(name= PARAM_TABLE_DOCUMENT_SCHEMA, mandatory = false, description = "For use with AnnotationDefinedFlowController. The name of the schema that the document table - given with the "+PARAM_TABLE_DOCUMENT+" parameter - adheres to. Only the primary key part is required for hash value retrieval.") + private String xmiStorageDataTableSchema; @ConfigurationParameter(name = PARAM_TO_VISIT_KEYS, mandatory = false, description = "For use with AnnotationDefinedFlowController. 
The delegate AE keys of the AEs this CAS should still applied on although the hash has not changed. Can be null or empty indicating that no component should be applied to the CAS. This is, however, the task of the AnnotationDefinedFlowController.") private String[] toVisitKeys; @@ -75,6 +80,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept mappingFileStr = (String) aContext.getConfigParameterValue(PARAM_MAPPING_FILE); rowMappingArray = (String[]) aContext.getConfigParameterValue(PARAM_ROW_MAPPING); xmiStorageDataTable = (String) aContext.getConfigParameterValue(PARAM_TABLE_DOCUMENT); + xmiStorageDataTableSchema = (String) aContext.getConfigParameterValue(PARAM_TABLE_DOCUMENT_SCHEMA); documentItemToHash = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_ADD_SHA_HASH)).orElse("document_text"); toVisitKeys = (String[]) aContext.getConfigParameterValue(PARAM_TO_VISIT_KEYS); // We don't know yet which tables to read. Thus, we leave the row mapping out. @@ -82,6 +88,12 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept Initializer initializer = new Initializer(mappingFileStr, null, null); xmlMapper = initializer.getXmlMapper(); initialized = false; + + if (!(xmiStorageDataTable == null && xmiStorageDataTableSchema == null && documentItemToHash == null) && !(xmiStorageDataTable != null && xmiStorageDataTableSchema != null && documentItemToHash != null)) { + String errorMsg = String.format("From the parameters '%s', '%s' and '%s' some are specified and some aren't. To activate hash value comparison in order to add aggregate component keys for CAS visit, specify all those parameters. 
Otherwise, specify none.", PARAM_TABLE_DOCUMENT, PARAM_TABLE_DOCUMENT_SCHEMA, PARAM_ADD_SHA_HASH); + log.error(errorMsg); + throw new ResourceInitializationException(new IllegalArgumentException(errorMsg)); + } } @Override @@ -162,7 +174,6 @@ private void populateCas(JCas jCas, byte[][] documentData) throws AnalysisEngine } protected List> getAllRetrievedColumns() { - List> fields = new ArrayList>(); Pair>> numColumnsAndFields = dbc.getNumColumnsAndFields(tables.length > 1, schemaNames); return numColumnsAndFields.getRight().stream().map(HashMap::new).collect(Collectors.toList()); } @@ -189,20 +200,20 @@ private Map fetchCurrentHashesFromDatabase(RowBatch rowBatch) th String sql = null; // Query the database for the document IDs in the current RowBatch and retrieve hashes. try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { - FieldConfig activeTableFieldConfiguration = dbc.getActiveTableFieldConfiguration(); + FieldConfig xmiTableSchema = dbc.getFieldConfiguration(xmiStorageDataTableSchema); String idQuery = documentIds.stream() - .map(key -> Arrays.stream(key).map(part -> "%s='" + part + '"').toArray(String[]::new)) - .map(activeTableFieldConfiguration::expandPKNames).map(expandedKeys -> String.join(" AND ", expandedKeys)) + .map(key -> Arrays.stream(key).map(part -> "%s='" + part + "'").toArray(String[]::new)) + .map(xmiTableSchema::expandPKNames).map(expandedKeys -> String.join(" AND ", expandedKeys)) .collect(Collectors.joining(" OR ")); - sql = String.format("SELECT %s,%s FROM %s WHERE %s", activeTableFieldConfiguration.getPrimaryKeyString(), hashColumn, xmiStorageDataTable, idQuery); + sql = String.format("SELECT %s,%s FROM %s WHERE %s", xmiTableSchema.getPrimaryKeyString(), hashColumn, xmiStorageDataTable, idQuery); ResultSet rs = conn.createStatement().executeQuery(sql); while (rs.next()) { StringBuilder pkSb = new StringBuilder(); - for (int i = 0; i < activeTableFieldConfiguration.getPrimaryKey().length; i++) - 
pkSb.append(rs.getString(i)).append(','); + for (int i = 0; i < xmiTableSchema.getPrimaryKey().length; i++) + pkSb.append(rs.getString(i+1)).append(','); // Remove training comma - pkSb.deleteCharAt(pkSb.length()); - String hash = rs.getString(activeTableFieldConfiguration.getPrimaryKey().length); + pkSb.deleteCharAt(pkSb.length()-1); + String hash = rs.getString(xmiTableSchema.getPrimaryKey().length+1); id2hash.put(pkSb.toString(), hash); } } catch (SQLException e) { diff --git a/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java b/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java index 910c682ba..75dc1659e 100644 --- a/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java +++ b/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java @@ -4,10 +4,23 @@ import de.julielab.costosys.dbconnection.CoStoSysConnection; import de.julielab.costosys.dbconnection.DataBaseConnector; import de.julielab.jcore.db.test.DBTestUtils; +import de.julielab.jcore.types.casflow.ToVisit; +import de.julielab.jcore.types.casmultiplier.RowBatch; +import de.julielab.jcore.utility.JCoReTools; import org.apache.commons.codec.binary.Base64; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.configuration2.ex.ConfigurationException; import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.JCasIterator; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.factory.TypeSystemDescriptionFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.jcas.cas.StringArray; +import org.apache.uima.resource.metadata.TypeSystemDescription; import org.junit.jupiter.api.AfterAll; import 
org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -18,13 +31,14 @@ import java.nio.file.Path; import java.sql.PreparedStatement; import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collection; import java.util.List; import static org.assertj.core.api.Assertions.assertThat; + public class XMLDBMultiplierTest { - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer("postgres:11.12"); - private static String costosysConfig; private static final String SOURCE_XML_TABLE = "source_xml_table"; private static final String TARGET_XMI_TABLE = "target_xmi_table"; private static final String PMID_FIELD_NAME = "pmid"; @@ -34,6 +48,9 @@ public class XMLDBMultiplierTest { private static final String HASH_FIELD_NAME = "documentText_sha256"; private static final String MAX_XMI_ID_FIELD_NAME = "max_xmi_id"; private static final String SOFA_MAPPING_FIELD_NAME = "sofa_mapping"; + private static final String SUBSET_TABLE = "test_subset"; + public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer("postgres:11.12"); + private static String costosysConfig; @BeforeAll public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { @@ -44,22 +61,24 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf dbc.setActiveTableSchema("medline_2016_nozip"); costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2016_nozip", 1, postgres); new File(costosysConfig).deleteOnExit(); - try(CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { + try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { prepareSourceXMLTable(dbc, conn); prepareTargetXMITable(dbc, conn); } + dbc.defineSubset(SUBSET_TABLE, SOURCE_XML_TABLE, "Test subset"); assertThat(dbc.getNumRows(SOURCE_XML_TABLE)).isEqualTo(10); assertThat(dbc.getNumRows(TARGET_XMI_TABLE)).isEqualTo(5); + dbc.close(); } private static void 
prepareSourceXMLTable(DataBaseConnector dbc, CoStoSysConnection conn) throws SQLException { - String xmlFmt = "%d"; + String xmlFmt = "%dThis is document text number %d"; dbc.createTable(SOURCE_XML_TABLE, "Test table for hash comparison test."); String sql = String.format("INSERT INTO %s (%s,%s) VALUES (?,?)", SOURCE_XML_TABLE, PMID_FIELD_NAME, XML_FIELD_NAME); PreparedStatement ps = conn.prepareStatement(sql); for (int i = 0; i < 10; i++) { - String xml = String.format(xmlFmt, i); + String xml = String.format(xmlFmt, i, i); ps.setString(1, String.valueOf(i)); ps.setString(2, xml); ps.addBatch(); @@ -69,7 +88,7 @@ private static void prepareSourceXMLTable(DataBaseConnector dbc, CoStoSysConnect private static void prepareTargetXMITable(DataBaseConnector dbc, CoStoSysConnection conn) throws SQLException { // Note that the root is "xmi" and not "xml" - String xmlFmt = "%d"; + String xmlFmt = "%dThis is document text number %d"; dbc.createTable(TARGET_XMI_TABLE, "xmi_text", "Test table for hash comparison test."); dbc.assureColumnsExist(TARGET_XMI_TABLE, List.of(HASH_FIELD_NAME), "text"); String sql = String.format("INSERT INTO %s (%s,%s,%s,%s,%s) VALUES (?,XMLPARSE(CONTENT ?),?,?,?)", TARGET_XMI_TABLE, DOCID_FIELD_NAME, BASE_DOCUMENT_FIELD_NAME, HASH_FIELD_NAME, MAX_XMI_ID_FIELD_NAME, SOFA_MAPPING_FIELD_NAME); @@ -77,7 +96,7 @@ private static void prepareTargetXMITable(DataBaseConnector dbc, CoStoSysConnect // Note that we only add half of the documents compared to the source XML import. This way we test // if the code behaves right when the target document does not yet exist at all. 
for (int i = 0; i < 5; i++) { - String xml = String.format(xmlFmt, i); + String xml = String.format(xmlFmt, i, i); ps.setString(1, String.valueOf(i)); ps.setString(2, xml); ps.setString(3, getHash(xml)); @@ -99,9 +118,69 @@ private static String getHash(String str) { } @Test - public void testHashComparison() { - + public void testMultiplier() throws Exception { + JCas jCas = prepareCas(); + AnalysisEngine engine = AnalysisEngineFactory.createEngine(XMLDBMultiplier.class, XMLDBMultiplier.PARAM_MAPPING_FILE, Path.of("src", "test", "resources", "test-mappingfile.xml").toString()); + JCasIterator jCasIterator = engine.processAndOutputNewCASes(jCas); + List documentTexts = new ArrayList<>(); + while (jCasIterator.hasNext()) { + JCas newCas = jCasIterator.next(); + documentTexts.add(newCas.getDocumentText()); + System.out.println(newCas.getDocumentText()); + newCas.release(); + } + assertThat(documentTexts).containsExactly("This is document text number 0", "This is document text number 1", "This is document text number 2", "This is document text number 3", "This is document text number 4", "This is document text number 5", "This is document text number 6", "This is document text number 7", "This is document text number 8", "This is document text number 9"); } + /** + * Creates a JCas and adds a RowBatch for all 10 documents in the source XML table as well as the data table and subset table and schema names. + * + * @return A JCas prepared for the tests in this class. + * @throws UIMAException If some UIMA operation fails. 
+ */ + private JCas prepareCas() throws UIMAException { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", "de.julielab.jcore.types.jcore-casflow-types"); + RowBatch rowBatch = new RowBatch(jCas); + StringArray dataTable = new StringArray(jCas, 1); + dataTable.set(0, SOURCE_XML_TABLE); + rowBatch.setTables(dataTable); + StringArray tableSchema = new StringArray(jCas, 1); + tableSchema.set(0, "medline_2016_nozip"); + rowBatch.setTableSchemas(tableSchema); + rowBatch.setTableName(SUBSET_TABLE); + FSArray pks = new FSArray(jCas, 10); + // Read all documents + for (int i = 0; i < 10; i++) { + StringArray pk = new StringArray(jCas, 1); + pk.set(0, String.valueOf(i)); + pks = JCoReTools.addToFSArray(pks, pk); + } + rowBatch.setIdentifiers(pks); + rowBatch.setCostosysConfiguration(costosysConfig); + rowBatch.addToIndexes(); + return jCas; + } + @Test + public void testHashComparison() throws Exception { + JCas jCas = prepareCas(); + TypeSystemDescription tsDesc = TypeSystemDescriptionFactory.createTypeSystemDescription("de.julielab.jcore.types.jcore-document-meta-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types","de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-casflow-types"); + AnalysisEngine engine = AnalysisEngineFactory.createEngine(XMLDBMultiplier.class,tsDesc, + XMLDBMultiplier.PARAM_MAPPING_FILE, Path.of("src", "test", "resources", "test-mappingfile.xml").toString(), + XMLDBMultiplier.PARAM_ADD_SHA_HASH, "documentText", + XMLDBMultiplier.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE, + XMLDBMultiplier.PARAM_TABLE_DOCUMENT_SCHEMA, "xmi_text", + XMLDBMultiplier.PARAM_TO_VISIT_KEYS, "ThisIsTheVisitKey" + ); + JCasIterator jCasIterator = engine.processAndOutputNewCASes(jCas); + List documentTexts = new ArrayList<>(); + while (jCasIterator.hasNext()) { + JCas newCas = 
jCasIterator.next(); +// System.out.println(newCas.getTypeSystem()); + Collection select = JCasUtil.select(newCas, ToVisit.class); + System.out.println(select); + newCas.release(); + break; + } + } } diff --git a/jcore-xml-db-reader/src/test/resources/test-mappingfile.xml b/jcore-xml-db-reader/src/test/resources/test-mappingfile.xml new file mode 100644 index 000000000..22af9d7cc --- /dev/null +++ b/jcore-xml-db-reader/src/test/resources/test-mappingfile.xml @@ -0,0 +1,17 @@ + + + + /xml/text + + + + de.julielab.jcore.types.Header + + + docId + + /xml/docid + java.lang.String + + + \ No newline at end of file From be868d16274ab2b6112e27625056a422a9e8bfb0 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 7 Jun 2021 07:16:58 +0200 Subject: [PATCH 060/269] Finished the tests for the XMLDBMultiplier testing whether the ToVisit annotation addition for hash comparison is working right. --- .../jcore/reader/xml/XMLDBMultiplier.java | 4 +- .../jcore/reader/xml/XMLDBMultiplierTest.java | 56 +++++++++++++++---- jedis-parent/pom.xml | 2 +- 3 files changed, 49 insertions(+), 13 deletions(-) diff --git a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java index d6c1f7186..8cd4ce9b4 100644 --- a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java +++ b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java @@ -89,8 +89,8 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept xmlMapper = initializer.getXmlMapper(); initialized = false; - if (!(xmiStorageDataTable == null && xmiStorageDataTableSchema == null && documentItemToHash == null) && !(xmiStorageDataTable != null && xmiStorageDataTableSchema != null && documentItemToHash != null)) { - String errorMsg = String.format("From the parameters '%s', '%s' and '%s' some are specified and some aren't. 
To activate hash value comparison in order to add aggregate component keys for CAS visit, specify all those parameters. Otherwise, specify none.", PARAM_TABLE_DOCUMENT, PARAM_TABLE_DOCUMENT_SCHEMA, PARAM_ADD_SHA_HASH); + if (!(xmiStorageDataTable == null && xmiStorageDataTableSchema == null) && !(xmiStorageDataTable != null && xmiStorageDataTableSchema != null && documentItemToHash != null)) { + String errorMsg = String.format("From the parameters '%s' and '%s' some are specified and some aren't. To activate hash value comparison in order to add aggregate component keys for CAS visit, specify all those parameters. Otherwise, specify none.", PARAM_TABLE_DOCUMENT, PARAM_TABLE_DOCUMENT_SCHEMA); log.error(errorMsg); throw new ResourceInitializationException(new IllegalArgumentException(errorMsg)); } diff --git a/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java b/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java index 75dc1659e..a56950c00 100644 --- a/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java +++ b/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java @@ -62,6 +62,14 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2016_nozip", 1, postgres); new File(costosysConfig).deleteOnExit(); try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { + // We create two tables. One is the XML table the multiplier reads from and maps the contents to the JCas. + // The other is a simulation of an XMI table used to serialize CAS instances via the jcore-xmi-db-writer. + // We need that target table to test the hash value comparison mechanism: If a document does not exist + // in the target table or has a non-matching hash on its document text, proceed as normal. 
+ // But if the hash matches, we want to reserve the possibility to skip most part of the subsequent pipeline. + // For this, we could use the AnnnotationDefinedFlowController for jcore-flow-controllers. This controller + // looks for annotations of the ToVisit type that specify which exact components in an aggregate should + // be applied to the CAS carrying the ToVisit annotation. prepareSourceXMLTable(dbc, conn); prepareTargetXMITable(dbc, conn); } @@ -88,7 +96,7 @@ private static void prepareSourceXMLTable(DataBaseConnector dbc, CoStoSysConnect private static void prepareTargetXMITable(DataBaseConnector dbc, CoStoSysConnection conn) throws SQLException { // Note that the root is "xmi" and not "xml" - String xmlFmt = "%dThis is document text number %d"; + String documentTextFmt = "This is document text number %d"; dbc.createTable(TARGET_XMI_TABLE, "xmi_text", "Test table for hash comparison test."); dbc.assureColumnsExist(TARGET_XMI_TABLE, List.of(HASH_FIELD_NAME), "text"); String sql = String.format("INSERT INTO %s (%s,%s,%s,%s,%s) VALUES (?,XMLPARSE(CONTENT ?),?,?,?)", TARGET_XMI_TABLE, DOCID_FIELD_NAME, BASE_DOCUMENT_FIELD_NAME, HASH_FIELD_NAME, MAX_XMI_ID_FIELD_NAME, SOFA_MAPPING_FIELD_NAME); @@ -96,10 +104,14 @@ private static void prepareTargetXMITable(DataBaseConnector dbc, CoStoSysConnect // Note that we only add half of the documents compared to the source XML import. This way we test // if the code behaves right when the target document does not yet exist at all. for (int i = 0; i < 5; i++) { - String xml = String.format(xmlFmt, i, i); + String xml = String.format(documentTextFmt, i, i); ps.setString(1, String.valueOf(i)); ps.setString(2, xml); - ps.setString(3, getHash(xml)); + // For one document in the "target XMI" table we put in a wrong hash. Thus, this document should not trigger + // the "toVisit" mechanism. 
+ if (i != 3) + ps.setString(3, getHash(xml)); + else ps.setString(3, "someanotherhash"); ps.setInt(4, 0); ps.setString(5, "dummy"); ps.addBatch(); @@ -164,23 +176,47 @@ private JCas prepareCas() throws UIMAException { @Test public void testHashComparison() throws Exception { JCas jCas = prepareCas(); - TypeSystemDescription tsDesc = TypeSystemDescriptionFactory.createTypeSystemDescription("de.julielab.jcore.types.jcore-document-meta-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types","de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-casflow-types"); - AnalysisEngine engine = AnalysisEngineFactory.createEngine(XMLDBMultiplier.class,tsDesc, + TypeSystemDescription tsDesc = TypeSystemDescriptionFactory.createTypeSystemDescription("de.julielab.jcore.types.jcore-document-meta-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-casflow-types"); + AnalysisEngine engine = AnalysisEngineFactory.createEngine(XMLDBMultiplier.class, tsDesc, XMLDBMultiplier.PARAM_MAPPING_FILE, Path.of("src", "test", "resources", "test-mappingfile.xml").toString(), XMLDBMultiplier.PARAM_ADD_SHA_HASH, "documentText", XMLDBMultiplier.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE, XMLDBMultiplier.PARAM_TABLE_DOCUMENT_SCHEMA, "xmi_text", XMLDBMultiplier.PARAM_TO_VISIT_KEYS, "ThisIsTheVisitKey" - ); + ); JCasIterator jCasIterator = engine.processAndOutputNewCASes(jCas); - List documentTexts = new ArrayList<>(); + List toVisitKeys = new ArrayList<>(); + while (jCasIterator.hasNext()) { + JCas newCas = jCasIterator.next(); + Collection select = JCasUtil.select(newCas, ToVisit.class); + select.forEach(tv -> tv.getDelegateKeys().forEach(k -> toVisitKeys.add(k))); + newCas.release(); + } + // There are 4 documents in the target table with the correct hash so we expect the delegate key 5 times + 
assertThat(toVisitKeys).containsExactly("ThisIsTheVisitKey", "ThisIsTheVisitKey", "ThisIsTheVisitKey", "ThisIsTheVisitKey"); + } + + @Test + public void testHashComparison2() throws Exception { + JCas jCas = prepareCas(); + TypeSystemDescription tsDesc = TypeSystemDescriptionFactory.createTypeSystemDescription("de.julielab.jcore.types.jcore-document-meta-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-casflow-types"); + // In this test, we do not specify the keys to visit; the whole subsequent pipeline should be skipped. + // To indicate that, there should be ToVisit annotations but they should be null. + AnalysisEngine engine = AnalysisEngineFactory.createEngine(XMLDBMultiplier.class, tsDesc, + XMLDBMultiplier.PARAM_MAPPING_FILE, Path.of("src", "test", "resources", "test-mappingfile.xml").toString(), + XMLDBMultiplier.PARAM_ADD_SHA_HASH, "documentText", + XMLDBMultiplier.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE, + XMLDBMultiplier.PARAM_TABLE_DOCUMENT_SCHEMA, "xmi_text" + ); + JCasIterator jCasIterator = engine.processAndOutputNewCASes(jCas); + List emptyToVisitAnnotation = new ArrayList<>(); while (jCasIterator.hasNext()) { JCas newCas = jCasIterator.next(); -// System.out.println(newCas.getTypeSystem()); Collection select = JCasUtil.select(newCas, ToVisit.class); - System.out.println(select); + select.stream().filter(tv -> tv.getDelegateKeys() == null).forEach(emptyToVisitAnnotation::add); newCas.release(); - break; } + // There are 4 documents in the target table with the correct hash so we expect the delegate key 5 times + assertThat(emptyToVisitAnnotation).hasSize(4); } } diff --git a/jedis-parent/pom.xml b/jedis-parent/pom.xml index 48d783b6d..226e35c36 100644 --- a/jedis-parent/pom.xml +++ b/jedis-parent/pom.xml @@ -22,7 +22,7 @@ de.julielab jcore-db-test-utilities - 2.5.1-SNAPSHOT + 2.5.1 de.julielab From 
c5e1de6ba14b9b1570fd4dc75c644864ad76a5a3 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 7 Jun 2021 10:19:34 +0200 Subject: [PATCH 061/269] Minor change. --- jcore-flow-controllers/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-flow-controllers/pom.xml b/jcore-flow-controllers/pom.xml index d31933489..d17ecac74 100644 --- a/jcore-flow-controllers/pom.xml +++ b/jcore-flow-controllers/pom.xml @@ -44,7 +44,7 @@ org.jetbrains annotations - RELEASE + 21.0.1 compile From 100a4120666d862bd412c211b4456b7d67792c59 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 8 Jun 2021 07:07:32 +0200 Subject: [PATCH 062/269] Fixing a log of tests for jUnit 5 compatibility. --- jcore-ace-reader/pom.xml | 4 +- .../jcore/reader/ace/AceReaderTest.java | 163 ++++++++-------- .../jcore/reader/ace/data/out/PLACEHOLDER | 0 jcore-acronym-ae/pom.xml | 4 +- .../main/AcronymAnnotatorTest.java | 8 +- jcore-acronym-writer/pom.xml | 4 +- jcore-annotation-adder-ae/pom.xml | 4 +- .../AnnotationAdderAnnotatorTest.java | 2 +- jcore-banner-ae/pom.xml | 4 +- .../src/main/java/banner/annotation/BEAT.java | 2 +- .../jcore/ae/banner/BANNERAnnotatorTest.java | 4 +- .../jcore/ae/banner/ModelTrainTest.java | 4 +- .../dataset/JCoReEntityDatasetTest.java | 6 +- jcore-bc2gmformat-writer/pom.xml | 4 +- .../bc2gmformat/BC2GMFormatWriterTest.java | 6 +- jcore-biolemmatizer-ae/pom.xml | 6 +- .../ae/biolemmatizer/BioLemmatizerTest.java | 6 +- jcore-bionlpformat-consumer/pom.xml | 4 +- .../bionlpformat/main/SegmentConsumer.java | 3 - .../bionlpformat/main/EventConsumerTest.java | 14 +- .../utils/DocumentWriterTest.java | 6 +- .../bionlpformat/utils/EntityWriterTest.java | 10 +- .../utils/EventTriggerWriterTest.java | 10 +- .../bionlpformat/utils/EventWriterTest.java | 6 +- .../bionlpformat/utils/ProteinWriterTest.java | 10 +- jcore-bionlpformat-reader/pom.xml | 4 +- .../utils/AnnotationFileMapper.java | 2 +- .../main/CoreferenceReadingTest.java | 6 +- 
.../bionlp09event/main/EventReaderTest.java | 14 +- .../utils/AbstractFileMapperTest.java | 10 +- .../utils/AnnotationFileMapperTest.java | 10 +- .../utils/OntoFormatReaderTest.java | 2 +- jcore-biosem-ae/pom.xml | 4 +- .../jcore/ae/biosem/BioSemEventAnnotator.java | 2 +- .../ae/biosem/BioSemEventAnnotatorTest.java | 8 +- jcore-conll-consumer/pom.xml | 4 +- .../cas2conll/test/ConllConsumerTest.java | 4 +- jcore-coordination-baseline-ae/pom.xml | 4 +- .../main/ConjunctAnnotatorTest.java | 24 +-- .../main/CoordinationAnnotatorTest.java | 16 +- .../coordbaseline/main/EEEAnnotatorTest.java | 17 +- .../main/EllipsisAnnotatorTest.java | 17 +- jcore-cord19-reader/pom.xml | 4 +- .../cord19/Cord19MultiplierReaderTest.java | 2 +- .../jcore/reader/cord19/JsonFormatTest.java | 2 +- jcore-coreference-writer/pom.xml | 4 +- .../coreference/CoreferenceWriter.java | 6 - jcore-cpe-db-runner/pom.xml | 4 +- jcore-ct-reader/pom.xml | 4 +- .../reader/ct/ClinicalTrialsReaderTest.java | 4 +- .../reader/db/DBMultiplierReaderTest.java | 8 +- .../jcore/reader/db/DBMultiplierTest.java | 8 +- .../jcore/reader/db/DBReaderTest.java | 8 +- jcore-descriptor-creator/pom.xml | 4 +- .../jcore/misc/DescriptorCreatorTest.java | 21 +- .../jcore/reader/testreader/TestReader.java | 4 +- jcore-dta-reader/pom.xml | 6 +- .../jcore/reader/dta/DTAFileReaderTest.java | 4 +- .../jcore/reader/dta/util/DTAUtilsTest.java | 6 +- jcore-ec-code-ae/pom.xml | 4 +- .../jcore/ae/ec/ECNumberAnnotatorTest.java | 4 +- jcore-elasticsearch-consumer/pom.xml | 8 + .../es/AbstractFieldGeneratorTest.java | 2 +- .../consumer/es/ElasticSearchConsumerIT.java | 11 +- jcore-embedding-writer/pom.xml | 8 +- .../jcore/consumer/ew/DecoderTest.java | 2 +- .../consumer/ew/EmbeddingWriterTest.java | 2 +- jcore-event-flattener-ae/pom.xml | 102 +++++----- .../julielab/jules/ae/EventFlattenerTest.java | 182 +++++++++--------- jcore-feature-value-replacement-ae/pom.xml | 4 +- .../FeatureValueReplacementAnnotatorTest.java | 4 +- 
jcore-file-reader/pom.xml | 4 +- .../reader/file/main/FileReaderTest.java | 10 +- .../jcore/ae/flairner/NerTaggingResponse.java | 1 - .../jcore/ae/flairner/PythonConnector.java | 1 - .../src/test/resources/1681975.xmi | 6 +- jcore-flair-token-embedding-ae/pom.xml | 4 +- .../jcore/ae/fte/python/getEmbeddingScript.py | 14 +- .../jcore/ae/fte/EmbeddingScriptTest.java | 22 +-- .../fte/FlairTokenEmbeddingAnnotatorTest.java | 11 +- jcore-flow-controllers/pom.xml | 1 - jcore-iexml-consumer/pom.xml | 4 +- jcore-iexml-reader/pom.xml | 4 +- jcore-ign-reader/pom.xml | 4 +- .../jcore/reader/ign/IGNReaderTest.java | 4 +- jcore-iob-consumer/pom.xml | 4 +- .../consumer/cas2iob/utils/UIMAUtils.java | 1 + .../cas2iob/main/ToIOBConsumerTest.java | 2 +- jcore-jnet-ae/pom.xml | 4 +- .../ae/jnet/cli/JNETApplicationTest.java | 4 +- .../jcore/ae/jnet/tagger/NETaggerTest.java | 2 +- .../jcore/ae/jnet/uima/MiniTestapp.java | 8 +- jcore-jpos-ae/pom.xml | 4 +- .../ae/jpos/postagger/POSAnnotatorTest.java | 4 +- jcore-jsbd-ae/pom.xml | 9 +- .../jcore/ae/jsbd/Abstract2UnitPipeTest.java | 6 +- .../jcore/ae/jsbd/SentenceSplitterTest.java | 6 +- .../ae/jsbd/main/SentenceAnnotatorTest.java | 6 +- jcore-jtbd-ae/pom.xml | 4 +- .../julielab/jcore/ae/jtbd/TokenizerTest.java | 4 +- .../ae/jtbd/main/TokenAnnotatorTest.java | 2 +- .../pom.xml | 4 +- .../EntityEvaluatorConsumerTest.java | 6 +- jcore-likelihood-assignment-ae/pom.xml | 4 +- .../LikelihoodAssignmentAnnotatorTest.java | 4 +- jcore-likelihood-detection-ae/pom.xml | 4 +- .../LikelihoodDetectionAnnotatorTest.java | 4 +- jcore-line-multiplier/pom.xml | 4 +- .../multiplier/line/LineMultiplierTest.java | 4 +- .../multiplier/line/LineMultiplierTest.class | Bin 3011 -> 3039 bytes jcore-lingpipe-porterstemmer-ae/pom.xml | 4 +- .../LingpipePorterstemmerAnnotatorTest.java | 6 +- jcore-lingpipegazetteer-ae/pom.xml | 5 +- .../StringNormalizerForChunkingTest.java | 92 ++++----- .../desc/ConfigurableDescriptorTest.java | 4 +- 
.../uima/GazetteerAnnotatorTest.java | 64 +++--- .../uima/OverlappingChunkTest.java | 4 +- jcore-lingscope-ae/pom.xml | 4 +- .../ae/lingscope/LingscopePosAnnotator.java | 2 +- .../julielab/LingscopePosAnnotatorTest.java | 2 +- jcore-linnaeus-species-ae/pom.xml | 4 +- .../ae/linnaeus/LinnaeusMatcherProvider.java | 2 - .../LinnaeusSpeciesAnnotatorTest.java | 7 +- jcore-medxn-ae/pom.xml | 4 +- .../jcore/ae/medxn/MedAttrAnnotatorTest.java | 27 +-- jcore-msdoc-reader/pom.xml | 4 +- .../reader/msdoc/main/MSdocReaderTest.java | 14 +- jcore-mstparser-ae/pom.xml | 4 +- .../ae/mstparser/main/MSTParserTest.java | 20 +- jcore-muc7-reader/pom.xml | 4 +- jcore-muc7-reader/scripts/muc7_SGML2XML.py | 2 - .../jcore/reader/muc7/MUC7ReaderTest.java | 36 ++-- jcore-mutationfinder-ae/pom.xml | 13 +- .../mutationfinder/MutationAnnotatorTest.java | 11 +- .../nlp/ei/mutation/MutationFinderTest.java | 37 +++- .../ccp/nlp/ei/mutation/MutationTest.java | 9 +- .../nlp/ei/mutation/PointMutationTest.java | 23 ++- jcore-neo4j-relations-consumer/pom.xml | 5 + ...Neo4jRelationsConsumerIntegrationTest.java | 2 +- .../Neo4jRelationsConsumerTest.java | 2 +- jcore-opennlp-chunk-ae/pom.xml | 4 +- .../ae/opennlp/chunk/ChunkAnnotatorTest.java | 15 +- jcore-opennlp-parser-ae/pom.xml | 4 +- .../main/ParseAnnotatorTest.java | 14 +- jcore-opennlp-postag-ae/pom.xml | 4 +- .../opennlp/postag/PosTagAnnotatorTest.java | 6 +- .../opennlp/postag/PosTagDictCreatorTest.java | 6 +- jcore-opennlp-sentence-ae/pom.xml | 4 +- .../ae/jsentsplit/SentenceAnnotatorTest.java | 11 +- jcore-opennlp-token-ae/pom.xml | 4 +- .../ae/opennlp/token/TokenAnnotatorTest.java | 5 +- jcore-pmc-reader/pom.xml | 4 +- .../jcore/reader/pmc/NXMLURIIteratorTest.java | 4 +- .../jcore/reader/pmc/PMCMultiplierTest.java | 2 +- .../jcore/reader/pmc/PMCReaderTest.java | 8 +- .../pmc/parser/ContribGroupParserTest.java | 6 +- .../reader/pmc/parser/ContribParserTest.java | 4 +- .../reader/pmc/parser/FrontParserTest.java | 4 +- 
.../pmc/parser/NxmlElementParserTest.java | 4 +- .../reader/pmc/parser/SectionParserTest.java | 4 +- .../reader/pmc/parser/XRefParserTest.java | 6 +- jcore-ppd-writer/pom.xml | 4 +- .../jcore/consumer/ppd/PPDWriterTest.java | 6 +- jcore-pubtator-reader/pom.xml | 4 +- .../reader/pubtator/PubtatorReaderTest.java | 6 +- jcore-stanford-lemmatizer-ae/pom.xml | 4 +- .../lemma/StanfordLemmatizerTest.java | 7 +- jcore-topic-indexing-ae/pom.xml | 4 +- .../julielab/jcore/ae/TopicIndexingTest.java | 6 +- jcore-topics-writer/pom.xml | 4 +- .../consumer/topics/TopicsWriterTest.java | 10 +- jcore-txt-consumer/pom.xml | 4 +- .../txt/SentenceTokenConsumerTest.java | 4 +- jcore-utilities/pom.xml | 4 +- .../utility/JCoReAnnotationToolsTest.java | 36 ++-- .../JCoReCondensedDocumentTextTest.java | 4 +- .../utility/JCoReFSListIteratorTest.java | 4 +- .../jcore/utility/JCoReFeaturePathTest.java | 22 +-- .../jcore/utility/JCoReToolsTest.java | 22 +-- .../jcore/utility/index/ComparatorsTest.java | 6 +- .../index/JCoReCoverAnnotationIndexTest.java | 6 +- .../index/JCoReMapAnnotationIndexTest.java | 4 +- .../JCoReOverlapAnnotationIndexTest.java | 6 +- .../index/JCoReSetAnnotationIndexTest.java | 6 +- .../JCoReTreeMapAnnotationIndexTest.java | 4 +- .../utility/index/TermGeneratorsTest.java | 6 +- .../jcore/reader/xmi/CasPopulator.java | 1 - .../XmiDBMultiplierDifferentNsSchemaTest.java | 8 +- .../jcore/reader/xmi/XmiDBMultiplierTest.java | 8 +- .../xmi/XmiDBReaderBinaryFormatTest.java | 16 +- .../xmi/XmiDBReaderDifferentNsSchemaTest.java | 20 +- .../xmi/XmiDBReaderGzippedDataTest.java | 8 +- .../XmiDBReaderMonolithicDocumentsTest.java | 8 +- .../jcore/reader/xmi/XmiDBReaderTest.java | 12 +- jcore-xmi-db-writer/pom.xml | 6 + .../xmi/XmiDBWriterBinaryFormatTest.java | 20 +- .../XmiDBWriterMonolithicDocumentTest.java | 22 +-- .../jcore/consumer/xmi/XmiDBWriterTest.java | 10 +- jcore-xmi-reader/pom.xml | 4 +- .../reader/xmi/XmiCollectionReaderTest.java | 4 +- jcore-xmi-writer/pom.xml | 4 +- 
.../consumer/xmi/CasToXmiConsumerTest.java | 12 +- jcore-xml-db-reader/pom.xml | 1 - jcore-xml-mapper/pom.xml | 4 +- .../xmlmapper/genericTypes/TypeTemplate.java | 8 +- .../xmlmapper/mapper/DocumentTextHandler.java | 12 +- .../typeBuilder/StandardTypeBuilder.java | 7 +- .../jcore/reader/xmlmapper/EncodingTest.java | 4 +- .../jcore/reader/xmlmapper/XMLMapperTest.java | 4 +- .../jcore/reader/xmlmapper/XMLReaderTest.java | 110 +++++------ jcore-xml-reader/pom.xml | 4 +- .../jcore/reader/XMLMultiplierReaderTest.java | 14 +- .../jcore/reader/XMLMultiplierTest.java | 6 +- .../julielab/jcore/reader/XMLReaderTest.java | 82 ++++---- 214 files changed, 1144 insertions(+), 1076 deletions(-) create mode 100644 jcore-ace-reader/src/test/resources/de/julielab/jcore/reader/ace/data/out/PLACEHOLDER diff --git a/jcore-ace-reader/pom.xml b/jcore-ace-reader/pom.xml index fdf961ad1..b983bbb5a 100644 --- a/jcore-ace-reader/pom.xml +++ b/jcore-ace-reader/pom.xml @@ -32,8 +32,8 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine de.julielab diff --git a/jcore-ace-reader/src/test/java/de/julielab/jcore/reader/ace/AceReaderTest.java b/jcore-ace-reader/src/test/java/de/julielab/jcore/reader/ace/AceReaderTest.java index 465a384f7..b6bd606e4 100644 --- a/jcore-ace-reader/src/test/java/de/julielab/jcore/reader/ace/AceReaderTest.java +++ b/jcore-ace-reader/src/test/java/de/julielab/jcore/reader/ace/AceReaderTest.java @@ -21,7 +21,6 @@ import de.julielab.jcore.types.ArgumentMention; import de.julielab.jcore.types.EntityMention; import de.julielab.jcore.types.ace.*; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.metadata.AnalysisEngineMetaData; import org.apache.uima.cas.CAS; @@ -38,6 +37,8 @@ import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; import org.apache.uima.util.XMLSerializer; +import org.junit.jupiter.api.BeforeAll; +import 
org.junit.jupiter.api.Test; import org.w3c.dom.Node; import org.xml.sax.SAXException; @@ -50,7 +51,9 @@ import java.util.ArrayList; import java.util.Iterator; -public class AceReaderTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class AceReaderTest { /** * Path to the MedlineReader descriptor */ @@ -65,47 +68,46 @@ public class AceReaderTest extends TestCase { /** * Object to be tested */ - private CollectionReader aceReader; + private static CollectionReader aceReader; /** * Auxiliary collection reader */ - private CollectionReader testReader; + private static CollectionReader testReader; /** * CAS array list with CAS objects that where processed by the aceReader */ - private ArrayList casArrayList = new ArrayList(); + private static ArrayList casArrayList = new ArrayList(); /** * Auxiliary CAS objects */ - private CAS aceReaderCas; + private static CAS aceReaderCas; - private CAS testReaderCas; + private static CAS testReaderCas; - private JCas aceReaderJCas; + private static JCas aceReaderJCas; - private JCas testReaderJCas; + private static JCas testReaderJCas; - LOC entity1_1; + static LOC entity1_1; - LOC entity1_2; + static LOC entity1_2; - GPE entity2_1; + static GPE entity2_1; - GPE entity2_2; + static GPE entity2_2; - GPE entity2_3; + static GPE entity2_3; - GPE entity2_4; + static GPE entity2_4; /*----------------------------------------------------------------------------------------------*/ - @Override - protected void setUp() throws Exception { + @BeforeAll + protected static void setUp() throws Exception { aceReader = getCollectionReader(ACE_READER_DESCRIPTOR); processAllCases(); - super.setUp(); System.out.println("ALL CASes were processed"); } // of setUp @@ -118,7 +120,7 @@ protected void setUp() throws Exception { * @throws SAXException * @throws ParserConfigurationException */ - private void processAllCases() throws CASException, SAXException, ParserConfigurationException { + private static void 
processAllCases() throws CASException, SAXException, ParserConfigurationException { try { while (aceReader.hasNext()) { @@ -157,13 +159,13 @@ private void processAllCases() throws CASException, SAXException, ParserConfigur } // of processAllCases /*----------------------------------------------------------------------------------------------*/ - private void compareCASes() { - assertTrue("Invalid source file attributes!", checkSourceFile()); - assertTrue("Invalid generated Jules Components!", checkGeneratedJulesComponents()); + private static void compareCASes() { + assertTrue(checkSourceFile(), "Invalid source file attributes!"); + assertTrue(checkGeneratedJulesComponents(), "Invalid generated Jules Components!"); } // compareCASes /*----------------------------------------------------------------------------------------------*/ - private boolean checkGeneratedJulesComponents() { + private static boolean checkGeneratedJulesComponents() { System.out.println("CALL checkGeneratedJulesComponents()"); boolean julesComponentsEqual = true; @@ -185,7 +187,7 @@ private boolean checkGeneratedJulesComponents() { } // checkGeneratedJulesComponents /*----------------------------------------------------------------------------------------------*/ - private boolean checkJulesEntities() { + private static boolean checkJulesEntities() { System.out.println("CALL checkJulesEntities()"); boolean julesEntityEqual = true; @@ -237,7 +239,7 @@ private boolean checkJulesEntities() { } // of checkJulesEntities /*----------------------------------------------------------------------------------------------*/ - private boolean checkJulesRelations() { + private static boolean checkJulesRelations() { System.out.println("CALL checkJulesRelations()"); boolean juleRelationEqual = true; @@ -286,8 +288,8 @@ private boolean checkJulesRelations() { } // of checkJulesRelations /*----------------------------------------------------------------------------------------------*/ - private boolean 
checkJulesRelationArguments(de.julielab.jcore.types.RelationMention aceReaderRelation, - de.julielab.jcore.types.RelationMention testReaderRelation) { + private static boolean checkJulesRelationArguments(de.julielab.jcore.types.RelationMention aceReaderRelation, + de.julielab.jcore.types.RelationMention testReaderRelation) { System.out.println("CALL checkJulesRelationArguments()"); boolean julesRelationArgumentEqual = true; @@ -449,7 +451,7 @@ private boolean checkJulesEventArguments(de.julielab.jcore.types.EventMention ac } // of checkJulesEventArguments /*----------------------------------------------------------------------------------------------*/ - private boolean checkSourceFile() { + private static boolean checkSourceFile() { boolean sourceFileEqual = true; Iterator aceReaderIterator = getTypeIterator(aceReaderCas, de.julielab.jcore.types.ace.SourceFile.type); @@ -499,7 +501,7 @@ private boolean checkSourceFile() { } // checkSourceFile /*----------------------------------------------------------------------------------------------*/ - private boolean checkDocument() { + private static boolean checkDocument() { boolean documentEqual = true; Iterator aceReaderIterator = getTypeIterator(aceReaderCas, de.julielab.jcore.types.ace.Document.type); @@ -568,7 +570,7 @@ private boolean checkDocument() { } // of checkDocument /*----------------------------------------------------------------------------------------------*/ - private boolean checkEvents(Document aceReaderDocument, Document testReaderDocument) { + private static boolean checkEvents(Document aceReaderDocument, Document testReaderDocument) { System.out.println("CALL checkEvents()"); boolean eventEqual = true; @@ -641,7 +643,7 @@ private boolean checkEvents(Document aceReaderDocument, Document testReaderDocum } // of checkEvents /*----------------------------------------------------------------------------------------------*/ - private boolean checkEventMentions(Event aceReaderEvent, Event testReaderEvent) 
{ + private static boolean checkEventMentions(Event aceReaderEvent, Event testReaderEvent) { boolean eventMentionEqual = true; FSArray aceReaderEventMentionFSArray = aceReaderEvent.getMentions(); @@ -703,7 +705,7 @@ private boolean checkEventMentions(Event aceReaderEvent, Event testReaderEvent) } // checkEventMentions /*----------------------------------------------------------------------------------------------*/ - private boolean checkEventMentionArguments(EventMention aceReaderEventMention, EventMention testReaderEventMention) { + private static boolean checkEventMentionArguments(EventMention aceReaderEventMention, EventMention testReaderEventMention) { boolean eventMentionArgumentEqual = true; FSArray aceReaderEventMentionArgumentFSArray = aceReaderEventMention.getArguments(); @@ -740,7 +742,7 @@ private boolean checkEventMentionArguments(EventMention aceReaderEventMention, E } // of checkEventMentionArguments /*----------------------------------------------------------------------------------------------*/ - private boolean checkEventArguments(Event aceReaderEvent, Event testReaderEvent) { + private static boolean checkEventArguments(Event aceReaderEvent, Event testReaderEvent) { boolean eventArgumentEqual = true; FSArray aceReaderEventArgumentFSArray = aceReaderEvent.getArguments(); @@ -767,7 +769,7 @@ private boolean checkEventArguments(Event aceReaderEvent, Event testReaderEvent) } // of checkEventArguments /*----------------------------------------------------------------------------------------------*/ - private boolean checkRelations(Document aceReaderDocument, Document testReaderDocument) { + private static boolean checkRelations(Document aceReaderDocument, Document testReaderDocument) { boolean relationEqual = true; FSArray aceReaderRelationFSArray = aceReaderDocument.getRelations(); @@ -830,7 +832,7 @@ private boolean checkRelations(Document aceReaderDocument, Document testReaderDo } // of checkRelations 
/*----------------------------------------------------------------------------------------------*/ - private boolean checkRelationMentions(Relation aceReaderRelation, Relation testReaderRelation) { + private static boolean checkRelationMentions(Relation aceReaderRelation, Relation testReaderRelation) { boolean relationMentionEqual = true; FSArray aceReaderRelationMentionFSArray = aceReaderRelation.getMentions(); @@ -885,8 +887,8 @@ private boolean checkRelationMentions(Relation aceReaderRelation, Relation testR } // checkRelationMentions /*----------------------------------------------------------------------------------------------*/ - private boolean checkRelationMentionArguments(RelationMention aceReaderRelationMention, - RelationMention testReaderRelationMention) { + private static boolean checkRelationMentionArguments(RelationMention aceReaderRelationMention, + RelationMention testReaderRelationMention) { boolean relationMentionArgumentEqual = true; FSArray aceReaderRelationMentionArgumentFSArray = aceReaderRelationMention.getArguments(); @@ -925,7 +927,7 @@ private boolean checkRelationMentionArguments(RelationMention aceReaderRelationM } /*----------------------------------------------------------------------------------------------*/ - private boolean checkRelationArguments(Relation aceReaderRelation, Relation testReaderRelation) { + private static boolean checkRelationArguments(Relation aceReaderRelation, Relation testReaderRelation) { boolean relationArgumentEqual = true; FSArray aceReaderRelationArgumentFSArray = aceReaderRelation.getArguments(); @@ -952,7 +954,7 @@ private boolean checkRelationArguments(Relation aceReaderRelation, Relation test } // checkRelationArguments /*----------------------------------------------------------------------------------------------*/ - private boolean checkTimex2(Document aceReaderDocument, Document testReaderDocument) { + private static boolean checkTimex2(Document aceReaderDocument, Document testReaderDocument) { 
boolean timex2Equal = true; FSArray aceReaderTimex2FSArray = aceReaderDocument.getTimex2(); @@ -985,7 +987,7 @@ private boolean checkTimex2(Document aceReaderDocument, Document testReaderDocum } // checkTimex2 /*----------------------------------------------------------------------------------------------*/ - private boolean checkTimex2Mentions(Timex2 aceReaderTimex2, Timex2 testReaderTimex2) { + private static boolean checkTimex2Mentions(Timex2 aceReaderTimex2, Timex2 testReaderTimex2) { boolean timex2MentionEqual = true; FSArray aceReaderTimex2MentionFSArray = aceReaderTimex2.getMentions(); @@ -1017,7 +1019,7 @@ private boolean checkTimex2Mentions(Timex2 aceReaderTimex2, Timex2 testReaderTim } // of checkTimex2Mentions /*----------------------------------------------------------------------------------------------*/ - private boolean checkValues(Document aceReaderDocument, Document testReaderDocument) { + private static boolean checkValues(Document aceReaderDocument, Document testReaderDocument) { boolean valueEqual = true; FSArray aceReaderValueFSArray = aceReaderDocument.getValues(); @@ -1060,7 +1062,7 @@ private boolean checkValues(Document aceReaderDocument, Document testReaderDocum } // of checkValues /*----------------------------------------------------------------------------------------------*/ - private boolean checkValueMentions(Value aceReaderValue, Value testReaderValue) { + private static boolean checkValueMentions(Value aceReaderValue, Value testReaderValue) { boolean valueMentionEqual = true; FSArray aceReaderValueMentionFSArray = aceReaderValue.getMentions(); @@ -1093,7 +1095,7 @@ private boolean checkValueMentions(Value aceReaderValue, Value testReaderValue) } // of checkValueMentions /*----------------------------------------------------------------------------------------------*/ - private boolean checkEntities() { + private static boolean checkEntities() { boolean entityEqual = true; Iterator aceReaderIterator = getTypeIterator(aceReaderCas, 
de.julielab.jcore.types.ace.Entity.type); @@ -1176,7 +1178,7 @@ private boolean checkEntities() { } // checkEntities /*----------------------------------------------------------------------------------------------*/ - private boolean checkEntityAttributes(Entity aceReaderEntity, Entity testReaderEntity) { + private static boolean checkEntityAttributes(Entity aceReaderEntity, Entity testReaderEntity) { boolean entityAttributeEqual = true; FSArray aceReaderEntityAttributeFSArray = aceReaderEntity.getEntity_attributes(); FSArray testReaderEntityAttributeFSArray = testReaderEntity.getEntity_attributes(); @@ -1208,8 +1210,8 @@ private boolean checkEntityAttributes(Entity aceReaderEntity, Entity testReaderE } // of checkEntityAttributes /*----------------------------------------------------------------------------------------------*/ - private boolean checkEntityAttributesNames(EntityAttribute aceReaderEntityAttribute, - EntityAttribute testReaderEntityAttribute) { + private static boolean checkEntityAttributesNames(EntityAttribute aceReaderEntityAttribute, + EntityAttribute testReaderEntityAttribute) { boolean entityAttributesNamesEqual = true; FSArray aceReaderEntityAttributesNamesFSArray = aceReaderEntityAttribute.getNames(); FSArray testReaderEntityAttributesNamesFSArray = testReaderEntityAttribute.getNames(); @@ -1241,7 +1243,7 @@ private boolean checkEntityAttributesNames(EntityAttribute aceReaderEntityAttrib } // checkEntityAttributesNames /*----------------------------------------------------------------------------------------------*/ - private boolean checkEntityMentions(Entity aceReaderEntity, Entity testReaderEntity) { + private static boolean checkEntityMentions(Entity aceReaderEntity, Entity testReaderEntity) { boolean entityMentionEqual = true; FSArray aceReaderEntityMentionFSArray = aceReaderEntity.getEntity_mentions(); FSArray testReaderEntityMentionFSArray = testReaderEntity.getEntity_mentions(); @@ -1309,7 +1311,7 @@ private boolean 
checkEntityMentions(Entity aceReaderEntity, Entity testReaderEnt } // of checkEntityMentions /*----------------------------------------------------------------------------------------------*/ - private void buildSourceFile(JCas jcas) throws SAXException, IOException, ParserConfigurationException { + private static void buildSourceFile(JCas jcas) throws SAXException, IOException, ParserConfigurationException { de.julielab.jcore.types.ace.SourceFile sourceFile = new de.julielab.jcore.types.ace.SourceFile(jcas); sourceFile.setUri("XIN_ENG_20030624.0085.sgm"); @@ -1329,14 +1331,14 @@ private void buildSourceFile(JCas jcas) throws SAXException, IOException, Parser } // buildSourceFile /*----------------------------------------------------------------------------------------------*/ - private void setDocumentText(CAS testReaderCas2, org.w3c.dom.Document sgmDomDocument) { + private static void setDocumentText(CAS testReaderCas2, org.w3c.dom.Document sgmDomDocument) { Node documentNode = sgmDomDocument.getDocumentElement(); String documentText = documentNode.getTextContent(); testReaderCas2.setDocumentText(documentText); } // of setDocumentText /*----------------------------------------------------------------------------------------------*/ - private void buildDocument(JCas jcas, SourceFile sourceFile) { + private static void buildDocument(JCas jcas, SourceFile sourceFile) { de.julielab.jcore.types.ace.Document document = new de.julielab.jcore.types.ace.Document(jcas); document.setDocid("XIN_ENG_20030624.0085"); buildEntities(jcas, document); @@ -1401,7 +1403,7 @@ private void buildJulesEventArgs(JCas jcas, Transaction event1) { } // buildJulesEventArgs /*----------------------------------------------------------------------------------------------*/ - private void buildJulesRelations(JCas jcas, Document document) { + private static void buildJulesRelations(JCas jcas, Document document) { System.out.println("CALL buildJulesRelations()"); PART_WHOLE relation1_1 = new 
PART_WHOLE(jcas); relation1_1.setBegin(543); @@ -1490,7 +1492,7 @@ private void buildJulesRelations(JCas jcas, Document document) { } // of buildJulesRelations /*----------------------------------------------------------------------------------------------*/ - private void buildJulesEntities(JCas jcas, Document document) { + private static void buildJulesEntities(JCas jcas, Document document) { System.out.println("CALL buildJulesEntities()"); entity1_1 = new LOC(jcas); @@ -1562,7 +1564,7 @@ private void buildJulesEntities(JCas jcas, Document document) { } // of buildJulesEntities /*----------------------------------------------------------------------------------------------*/ - private void buildEvents(JCas jcas, Document document) { + private static void buildEvents(JCas jcas, Document document) { de.julielab.jcore.types.ace.Event event = new de.julielab.jcore.types.ace.Event(jcas); event.setGenericity("Specific"); @@ -1583,7 +1585,7 @@ private void buildEvents(JCas jcas, Document document) { } // of buildEvents /*----------------------------------------------------------------------------------------------*/ - private void buildEventMentions(JCas jcas, Event event) { + private static void buildEventMentions(JCas jcas, Event event) { de.julielab.jcore.types.ace.EventMention eventMention = new de.julielab.jcore.types.ace.EventMention(jcas); eventMention.setId("XIN_ENG_20030405.0080-EV2-1"); eventMention.setBegin(625); @@ -1612,7 +1614,7 @@ private void buildEventMentions(JCas jcas, Event event) { } // of buildEventMentions /*----------------------------------------------------------------------------------------------*/ - private void buildEventMentionArguments(JCas jcas, EventMention eventMention) { + private static void buildEventMentionArguments(JCas jcas, EventMention eventMention) { de.julielab.jcore.types.ace.EventMentionArgument eventMentionArgument1 = new de.julielab.jcore.types.ace.EventMentionArgument( jcas); 
eventMentionArgument1.setAce_role("Recipient"); @@ -1637,7 +1639,7 @@ private void buildEventMentionArguments(JCas jcas, EventMention eventMention) { } // of buildEventMentionArguments /*----------------------------------------------------------------------------------------------*/ - private void buildEventArguments(JCas jcas, Event event) { + private static void buildEventArguments(JCas jcas, Event event) { de.julielab.jcore.types.ace.EventArgument eventArgument1 = new de.julielab.jcore.types.ace.EventArgument(jcas); eventArgument1.setAce_role("Recipient"); eventArgument1.setRefid("XIN_ENG_20030405.0080-E1"); @@ -1656,7 +1658,7 @@ private void buildEventArguments(JCas jcas, Event event) { } // of buildEventArguments /*----------------------------------------------------------------------------------------------*/ - private void buildRelations(JCas jcas, Document document) { + private static void buildRelations(JCas jcas, Document document) { de.julielab.jcore.types.ace.Relation relation1 = new de.julielab.jcore.types.ace.Relation(jcas); relation1.setModality("Asserted"); relation1.setTense("Unspecified"); @@ -1685,7 +1687,7 @@ private void buildRelations(JCas jcas, Document document) { } // of buildRelations /*----------------------------------------------------------------------------------------------*/ - private void buildRelationMentions2(JCas jcas, Relation relation2) { + private static void buildRelationMentions2(JCas jcas, Relation relation2) { de.julielab.jcore.types.ace.RelationMention relationMention2_1 = new de.julielab.jcore.types.ace.RelationMention( jcas); relationMention2_1.setLexical_condition("Preposition"); @@ -1714,7 +1716,7 @@ private void buildRelationMentions2(JCas jcas, Relation relation2) { } // of buildRelationMentions2 /*----------------------------------------------------------------------------------------------*/ - private void buildRelationMentionArgument2_2(JCas jcas, RelationMention relationMention2_2) { + private static void 
buildRelationMentionArgument2_2(JCas jcas, RelationMention relationMention2_2) { de.julielab.jcore.types.ace.RelationMentionArgument argument1 = new de.julielab.jcore.types.ace.RelationMentionArgument( jcas); argument1.setAce_role("Arg-2"); @@ -1739,7 +1741,7 @@ private void buildRelationMentionArgument2_2(JCas jcas, RelationMention relation } // of buildRelationMentionArgument2_2 /*----------------------------------------------------------------------------------------------*/ - private void buildRelationMentionArguments2_1(JCas jcas, RelationMention relationMention1) { + private static void buildRelationMentionArguments2_1(JCas jcas, RelationMention relationMention1) { de.julielab.jcore.types.ace.RelationMentionArgument argument1 = new de.julielab.jcore.types.ace.RelationMentionArgument( jcas); argument1.setAce_role("Arg-2"); @@ -1764,7 +1766,7 @@ private void buildRelationMentionArguments2_1(JCas jcas, RelationMention relatio } // of buildRelationMentionArguments2_1 /*----------------------------------------------------------------------------------------------*/ - private void buildRelationArguments2(JCas jcas, Relation relation2) { + private static void buildRelationArguments2(JCas jcas, Relation relation2) { de.julielab.jcore.types.ace.RelationArgument argument1 = new de.julielab.jcore.types.ace.RelationArgument(jcas); argument1.setAce_role("Arg-2"); argument1.setRefid("XIN_ENG_20030624.0085-E1"); @@ -1782,7 +1784,7 @@ private void buildRelationArguments2(JCas jcas, Relation relation2) { } // of buildRelationArguments2 /*----------------------------------------------------------------------------------------------*/ - private void buildRelationMentions1(JCas jcas, Relation relation) { + private static void buildRelationMentions1(JCas jcas, Relation relation) { de.julielab.jcore.types.ace.RelationMention relationMention1 = new de.julielab.jcore.types.ace.RelationMention( jcas); relationMention1.setLexical_condition("Preposition"); @@ -1811,7 +1813,7 @@ private 
void buildRelationMentions1(JCas jcas, Relation relation) { } // buildRelationMentions /*----------------------------------------------------------------------------------------------*/ - private void buildRelationMentionArguments1_2(JCas jcas, RelationMention relationMention2) { + private static void buildRelationMentionArguments1_2(JCas jcas, RelationMention relationMention2) { de.julielab.jcore.types.ace.RelationMentionArgument argument1 = new de.julielab.jcore.types.ace.RelationMentionArgument( jcas); argument1.setAce_role("Arg-1"); @@ -1836,7 +1838,7 @@ private void buildRelationMentionArguments1_2(JCas jcas, RelationMention relatio } // buildRelationMentionArguments2 /*----------------------------------------------------------------------------------------------*/ - private void buildRelationMentionArguments1_1(JCas jcas, RelationMention relationMention1) { + private static void buildRelationMentionArguments1_1(JCas jcas, RelationMention relationMention1) { de.julielab.jcore.types.ace.RelationMentionArgument argument1 = new de.julielab.jcore.types.ace.RelationMentionArgument( jcas); argument1.setAce_role("Arg-1"); @@ -1861,7 +1863,7 @@ private void buildRelationMentionArguments1_1(JCas jcas, RelationMention relatio } // buildRelationMentionArguments1 /*----------------------------------------------------------------------------------------------*/ - private void buildRelationAgruments1(JCas jcas, Relation relation) { + private static void buildRelationAgruments1(JCas jcas, Relation relation) { de.julielab.jcore.types.ace.RelationArgument argument1 = new de.julielab.jcore.types.ace.RelationArgument(jcas); argument1.setAce_role("Arg-1"); argument1.setRefid("XIN_ENG_20030624.0085-E1"); @@ -1880,7 +1882,7 @@ private void buildRelationAgruments1(JCas jcas, Relation relation) { } // buildRelationAgruments /*----------------------------------------------------------------------------------------------*/ - private void buildTimex2(JCas jcas, Document document) { + 
private static void buildTimex2(JCas jcas, Document document) { de.julielab.jcore.types.ace.Timex2 timex2_1 = new de.julielab.jcore.types.ace.Timex2(jcas); timex2_1.setId("XIN_ENG_20030624.0085-T4"); buildTimex2Mentions1(jcas, timex2_1); @@ -1897,7 +1899,7 @@ private void buildTimex2(JCas jcas, Document document) { } // buildTimex2 /*----------------------------------------------------------------------------------------------*/ - private void buildTimex2Mentions2(JCas jcas, Timex2 timex2_2) { + private static void buildTimex2Mentions2(JCas jcas, Timex2 timex2_2) { de.julielab.jcore.types.ace.Timex2Mention timex2Mention = new de.julielab.jcore.types.ace.Timex2Mention(jcas); timex2Mention.setId("XIN_ENG_20030624.0085-T8-1"); timex2Mention.setBegin(1327); @@ -1911,7 +1913,7 @@ private void buildTimex2Mentions2(JCas jcas, Timex2 timex2_2) { } // buildTimex2Mentions2 /*----------------------------------------------------------------------------------------------*/ - private void buildTimex2Mentions1(JCas jcas, Timex2 timex2_1) { + private static void buildTimex2Mentions1(JCas jcas, Timex2 timex2_1) { de.julielab.jcore.types.ace.Timex2Mention timex2Mention = new de.julielab.jcore.types.ace.Timex2Mention(jcas); timex2Mention.setId("XIN_ENG_20030624.0085-T4-1"); timex2Mention.setBegin(327); @@ -1925,7 +1927,7 @@ private void buildTimex2Mentions1(JCas jcas, Timex2 timex2_1) { } // buildTimex2Mentions1 /*----------------------------------------------------------------------------------------------*/ - private void buildValues(JCas jcas, Document document) { + private static void buildValues(JCas jcas, Document document) { de.julielab.jcore.types.ace.Value value1 = new de.julielab.jcore.types.ace.Value(jcas); value1.setAce_type("Numeric"); value1.setAce_subtype("Money"); @@ -1948,7 +1950,7 @@ private void buildValues(JCas jcas, Document document) { } // buildValues /*----------------------------------------------------------------------------------------------*/ - private 
void buildValueMentuions2(JCas jcas, Value value2) { + private static void buildValueMentuions2(JCas jcas, Value value2) { de.julielab.jcore.types.ace.ValueMention valueMention = new de.julielab.jcore.types.ace.ValueMention(jcas); valueMention.setId("XIN_ENG_20030624.0085-V3-1"); valueMention.setBegin(1079); @@ -1962,7 +1964,7 @@ private void buildValueMentuions2(JCas jcas, Value value2) { } // buildValueMentuions2 /*----------------------------------------------------------------------------------------------*/ - private void buildValueMentions1(JCas jcas, Value value1) { + private static void buildValueMentions1(JCas jcas, Value value1) { de.julielab.jcore.types.ace.ValueMention valueMention = new de.julielab.jcore.types.ace.ValueMention(jcas); valueMention.setId("XIN_ENG_20030624.0085-V2-1"); valueMention.setBegin(826); @@ -1976,7 +1978,7 @@ private void buildValueMentions1(JCas jcas, Value value1) { } // buildValueMentions1 /*----------------------------------------------------------------------------------------------*/ - private void buildEntities(JCas jcas, de.julielab.jcore.types.ace.Document document) { + private static void buildEntities(JCas jcas, de.julielab.jcore.types.ace.Document document) { Entity entity1 = new Entity(jcas); entity1.setAce_class("USP"); entity1.setAce_type("LOC"); @@ -2003,14 +2005,14 @@ private void buildEntities(JCas jcas, de.julielab.jcore.types.ace.Document docum } // of buildEntities /*----------------------------------------------------------------------------------------------*/ - private void buildEntityAttributes1(JCas jcas, Entity entity1) { + private static void buildEntityAttributes1(JCas jcas, Entity entity1) { FSArray entityAttributeFSArray = new FSArray(jcas, 0); entityAttributeFSArray.addToIndexes(); entity1.setEntity_attributes(entityAttributeFSArray); } // buildEntityAttributes1 /*----------------------------------------------------------------------------------------------*/ - private void 
buildEntityAttributes2(JCas jcas, Entity entity2) { + private static void buildEntityAttributes2(JCas jcas, Entity entity2) { de.julielab.jcore.types.ace.EntityAttribute entityAttribute = new de.julielab.jcore.types.ace.EntityAttribute( jcas); @@ -2024,7 +2026,7 @@ private void buildEntityAttributes2(JCas jcas, Entity entity2) { } // ofbuildEntityAttributes2 /*----------------------------------------------------------------------------------------------*/ - private void buildEntityAttributeNames(JCas jcas, de.julielab.jcore.types.ace.EntityAttribute entityAttribute) { + private static void buildEntityAttributeNames(JCas jcas, de.julielab.jcore.types.ace.EntityAttribute entityAttribute) { FSArray nameFSArray = new FSArray(jcas, 4); de.julielab.jcore.types.ace.Name entityAttributeName1 = new de.julielab.jcore.types.ace.Name(jcas); @@ -2060,7 +2062,7 @@ private void buildEntityAttributeNames(JCas jcas, de.julielab.jcore.types.ace.En } // buildEntityAttributeNames /*----------------------------------------------------------------------------------------------*/ - private void buildEntityMentions1(JCas jcas, Entity entity) { + private static void buildEntityMentions1(JCas jcas, Entity entity) { de.julielab.jcore.types.ace.EntityMention entityMention1 = new de.julielab.jcore.types.ace.EntityMention(jcas); entityMention1.setMention_ldctype("PTV"); entityMention1.setMention_type("PRO"); @@ -2101,7 +2103,7 @@ private void buildEntityMentions1(JCas jcas, Entity entity) { } // of buildEntityMentions /*----------------------------------------------------------------------------------------------*/ - private void buildEntityMentions2(JCas jcas, Entity entity2) { + private static void buildEntityMentions2(JCas jcas, Entity entity2) { de.julielab.jcore.types.ace.EntityMention entityMention1 = new de.julielab.jcore.types.ace.EntityMention(jcas); entityMention1.setLdcatr("FALSE"); entityMention1.setAce_role("LOC"); @@ -2180,6 +2182,7 @@ private void buildEntityMentions2(JCas jcas, 
Entity entity2) { /** * Test if method getNextCas() has done its job */ + @Test public void testGetNextCas() { System.out.println("CALL testGetNextCas"); checkDocumentText(); @@ -2195,7 +2198,7 @@ public void checkDocumentText() { for (int i = 0; i < casArrayList.size(); i++) { String text = casArrayList.get(i).getDocumentText(); - assertTrue(((text == null) ? "null" : text), (text != null) && (!text.equals(""))); + assertTrue((text != null) && (!text.equals("")), ((text == null) ? "null" : text)); } // of for } // of checkDocumentText @@ -2209,7 +2212,7 @@ public void checkDocumentText() { * the type * @return the iterator */ - private Iterator getTypeIterator(CAS cas, int type) { + private static Iterator getTypeIterator(CAS cas, int type) { Iterator iterator = null; try { @@ -2221,7 +2224,7 @@ private Iterator getTypeIterator(CAS cas, int type) { } // getTypeIterator /*----------------------------------------------------------------------------------------------*/ - private void writeCasToXMI(CAS cas, int docs) throws CASException, IOException, SAXException { + private static void writeCasToXMI(CAS cas, int docs) throws CASException, IOException, SAXException { JFSIndexRepository indexes = cas.getJCas().getJFSIndexRepository(); Iterator documentIter = indexes.getAnnotationIndex(Document.type).iterator(); diff --git a/jcore-ace-reader/src/test/resources/de/julielab/jcore/reader/ace/data/out/PLACEHOLDER b/jcore-ace-reader/src/test/resources/de/julielab/jcore/reader/ace/data/out/PLACEHOLDER new file mode 100644 index 000000000..e69de29bb diff --git a/jcore-acronym-ae/pom.xml b/jcore-acronym-ae/pom.xml index dfd4fce45..b4fc4640d 100644 --- a/jcore-acronym-ae/pom.xml +++ b/jcore-acronym-ae/pom.xml @@ -38,8 +38,8 @@ ${jcore-utilities-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine
diff --git a/jcore-acronym-ae/src/test/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotatorTest.java b/jcore-acronym-ae/src/test/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotatorTest.java index c2c74ba6e..3721ee562 100644 --- a/jcore-acronym-ae/src/test/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotatorTest.java +++ b/jcore-acronym-ae/src/test/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotatorTest.java @@ -18,7 +18,6 @@ import de.julielab.jcore.types.Abbreviation; import de.julielab.jcore.types.AbbreviationLongform; import de.julielab.jcore.types.Sentence; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.cas.CAS; @@ -35,7 +34,7 @@ import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -45,12 +44,14 @@ import java.util.ArrayList; import java.util.Collection; +import static org.junit.jupiter.api.Assertions.*; + /** * The AcronymAnnotatorTest class * * @author jwermter */ -public class AcronymAnnotatorTest extends TestCase { +public class AcronymAnnotatorTest { private static final String DOCUMENT_TEXT = "[TAZ]Die Firma Kohl-kopf (FK-K) hat für die Straßenverkehrsordnung (StVO) " + "in der Bundesrepublik Deutschland(BRD) einen hochintelligenten Manager für die Chefetage " @@ -73,6 +74,7 @@ public class AcronymAnnotatorTest extends TestCase { private static final String ALL_TYPES_NAME = "de.julielab.jcore.types.jcore-all-types"; + @Test public void testProcess() throws ResourceInitializationException, InvalidXMLException, IOException, CASException { CAS cas = CasCreationUtils.createCas( diff --git a/jcore-acronym-writer/pom.xml b/jcore-acronym-writer/pom.xml index 035774709..7924aaae0 100644 --- a/jcore-acronym-writer/pom.xml 
+++ b/jcore-acronym-writer/pom.xml @@ -41,8 +41,8 @@ ${jcore-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe Acronym Writer diff --git a/jcore-annotation-adder-ae/pom.xml b/jcore-annotation-adder-ae/pom.xml index a8f6ce3bd..dc4379f04 100644 --- a/jcore-annotation-adder-ae/pom.xml +++ b/jcore-annotation-adder-ae/pom.xml @@ -41,8 +41,8 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine de.julielab diff --git a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java index 65c0de306..a7f76f786 100644 --- a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java +++ b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java @@ -13,7 +13,7 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ExternalResourceDescription; import org.assertj.core.data.Offset; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.util.ArrayList; diff --git a/jcore-banner-ae/pom.xml b/jcore-banner-ae/pom.xml index 9e47d8857..d50f90b07 100644 --- a/jcore-banner-ae/pom.xml +++ b/jcore-banner-ae/pom.xml @@ -59,8 +59,8 @@ julielab-java-utilities - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-banner-ae/src/main/java/banner/annotation/BEAT.java b/jcore-banner-ae/src/main/java/banner/annotation/BEAT.java index 113f6139f..38281692f 100644 --- a/jcore-banner-ae/src/main/java/banner/annotation/BEAT.java +++ b/jcore-banner-ae/src/main/java/banner/annotation/BEAT.java @@ -18,8 +18,8 @@ import java.awt.*; import java.awt.event.*; import java.io.IOException; -import java.util.*; import java.util.List; +import java.util.*; public class BEAT extends JFrame implements ActionListener, 
CaretListener { diff --git a/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java b/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java index 12e9e2776..489ecd37d 100644 --- a/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java +++ b/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java @@ -18,14 +18,14 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class BANNERAnnotatorTest { private final static Logger log = LoggerFactory.getLogger(BANNERAnnotatorTest.class); diff --git a/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/ModelTrainTest.java b/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/ModelTrainTest.java index 7604ae62f..9d5d4958c 100644 --- a/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/ModelTrainTest.java +++ b/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/ModelTrainTest.java @@ -12,11 +12,11 @@ import banner.eval.BANNER; import org.apache.commons.configuration.XMLConfiguration; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertTrue; public class ModelTrainTest { @Test diff --git a/jcore-banner-ae/src/test/java/de/julielab/jcore/banner/dataset/JCoReEntityDatasetTest.java b/jcore-banner-ae/src/test/java/de/julielab/jcore/banner/dataset/JCoReEntityDatasetTest.java index 35925ad84..843106130 100644 --- a/jcore-banner-ae/src/test/java/de/julielab/jcore/banner/dataset/JCoReEntityDatasetTest.java +++ 
b/jcore-banner-ae/src/test/java/de/julielab/jcore/banner/dataset/JCoReEntityDatasetTest.java @@ -11,14 +11,14 @@ package de.julielab.jcore.banner.dataset; import banner.tokenization.SimpleTokenizer; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.util.Set; import java.util.stream.Collectors; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; public class JCoReEntityDatasetTest { @Test diff --git a/jcore-bc2gmformat-writer/pom.xml b/jcore-bc2gmformat-writer/pom.xml index 8092a37ee..37c5a1de0 100644 --- a/jcore-bc2gmformat-writer/pom.xml +++ b/jcore-bc2gmformat-writer/pom.xml @@ -28,8 +28,8 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine de.julielab diff --git a/jcore-bc2gmformat-writer/src/test/java/de/julielab/jcore/consumer/bc2gmformat/BC2GMFormatWriterTest.java b/jcore-bc2gmformat-writer/src/test/java/de/julielab/jcore/consumer/bc2gmformat/BC2GMFormatWriterTest.java index 41faec637..3752d67b5 100644 --- a/jcore-bc2gmformat-writer/src/test/java/de/julielab/jcore/consumer/bc2gmformat/BC2GMFormatWriterTest.java +++ b/jcore-bc2gmformat-writer/src/test/java/de/julielab/jcore/consumer/bc2gmformat/BC2GMFormatWriterTest.java @@ -2,13 +2,13 @@ package de.julielab.jcore.consumer.bc2gmformat; import org.apache.uima.fit.factory.UimaContextFactory; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.lang.reflect.Method; import java.util.TreeMap; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; /** diff --git a/jcore-biolemmatizer-ae/pom.xml b/jcore-biolemmatizer-ae/pom.xml index 241617304..b5e089a8f 100644 --- a/jcore-biolemmatizer-ae/pom.xml +++ 
b/jcore-biolemmatizer-ae/pom.xml @@ -31,7 +31,11 @@ biolemmatizer-core 1.2 - junitjunit + + org.junit.jupiter + junit-jupiter-engine + + JCoRe BioLemmatizer JULIE Lab Jena, Germany diff --git a/jcore-biolemmatizer-ae/src/test/java/de/julielab/jcore/ae/biolemmatizer/BioLemmatizerTest.java b/jcore-biolemmatizer-ae/src/test/java/de/julielab/jcore/ae/biolemmatizer/BioLemmatizerTest.java index ada58be07..241aadaee 100644 --- a/jcore-biolemmatizer-ae/src/test/java/de/julielab/jcore/ae/biolemmatizer/BioLemmatizerTest.java +++ b/jcore-biolemmatizer-ae/src/test/java/de/julielab/jcore/ae/biolemmatizer/BioLemmatizerTest.java @@ -10,10 +10,10 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; /** * Unit tests for jcore-de.julielab.jcore.ae.biolemmatizer-ae. 
* @author diff --git a/jcore-bionlpformat-consumer/pom.xml b/jcore-bionlpformat-consumer/pom.xml index d868129aa..a2cdae928 100644 --- a/jcore-bionlpformat-consumer/pom.xml +++ b/jcore-bionlpformat-consumer/pom.xml @@ -29,8 +29,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe BioNLP Format Consumer diff --git a/jcore-bionlpformat-consumer/src/main/java/de/julielab/jcore/consumer/bionlpformat/main/SegmentConsumer.java b/jcore-bionlpformat-consumer/src/main/java/de/julielab/jcore/consumer/bionlpformat/main/SegmentConsumer.java index 287a79921..f09c3a48a 100644 --- a/jcore-bionlpformat-consumer/src/main/java/de/julielab/jcore/consumer/bionlpformat/main/SegmentConsumer.java +++ b/jcore-bionlpformat-consumer/src/main/java/de/julielab/jcore/consumer/bionlpformat/main/SegmentConsumer.java @@ -22,9 +22,6 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.context.annotation.Configuration; import java.io.*; import java.util.Iterator; diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/main/EventConsumerTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/main/EventConsumerTest.java index 6668a969d..8a6659cfb 100644 --- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/main/EventConsumerTest.java +++ b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/main/EventConsumerTest.java @@ -13,16 +13,16 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import 
org.junit.jupiter.api.Test; import java.io.*; import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class EventConsumerTest { private static final String EVENT_E8 = "E8 Phosphorylation:T14 Theme:T17 Site:T13"; @@ -44,7 +44,7 @@ public class EventConsumerTest { private AnalysisEngine consumer; private FilenameFilter filter; - @Before + @BeforeEach public void setUp() throws Exception { cas = JCasFactory.createJCas("src/test/resources/types/jcore-all-types"); consumer = AnalysisEngineFactory.createEngine(BioEventConsumer.class, @@ -113,7 +113,7 @@ public boolean accept(File file, String name) { }; } - @After + @AfterEach public void tearDown() { File dataDirectory = new File(TARGET_DIRECTORY); diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/DocumentWriterTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/DocumentWriterTest.java index bdd89cc38..12e2baa53 100644 --- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/DocumentWriterTest.java +++ b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/DocumentWriterTest.java @@ -10,8 +10,8 @@ import de.julielab.jcore.types.Title; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.io.Writer; @@ -28,7 +28,7 @@ public class DocumentWriterTest { private DocumentWriter documentWriter; private Writer writer; - @Before + @BeforeEach public void setUp() throws Exception{ cas = JCasFactory.createJCas("src/test/resources/types/jcore-all-types"); 
cas.setDocumentText(DOCUMENT_TITLE + "\n" + DOCUMENT_ABSTRACT); diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EntityWriterTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EntityWriterTest.java index d98cb4722..29cd9e064 100644 --- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EntityWriterTest.java +++ b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EntityWriterTest.java @@ -9,15 +9,15 @@ import de.julielab.jcore.types.EntityMention; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.io.Writer; import static org.easymock.classextension.EasyMock.*; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; public class EntityWriterTest { private static final String ENTITY_T13 = "T13 Entity 322 330 tyrosine\n"; @@ -30,7 +30,7 @@ public class EntityWriterTest { private Writer writer; private EntityMention entityT13; - @Before + @BeforeEach public void setUp() throws Exception{ cas = JCasFactory.createJCas("src/test/resources/types/jcore-semantics-biology-types"); diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventTriggerWriterTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventTriggerWriterTest.java index 317dd0cef..2a04a48f1 100644 --- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventTriggerWriterTest.java +++ 
b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventTriggerWriterTest.java @@ -9,15 +9,15 @@ import de.julielab.jcore.types.EventTrigger; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.io.Writer; import static org.easymock.classextension.EasyMock.*; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; public class EventTriggerWriterTest { private static final String TRIGGER_T1 = "T1 Negative_regulation 12 19 inhibit\n"; @@ -28,7 +28,7 @@ public class EventTriggerWriterTest { private Writer writer; private EventTrigger triggerT1; - @Before + @BeforeEach public void setUp() throws Exception{ cas = JCasFactory.createJCas("src/test/resources/types/jcore-semantics-biology-types"); diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventWriterTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventWriterTest.java index 5d8b717cf..58052dc0b 100644 --- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventWriterTest.java +++ b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventWriterTest.java @@ -10,8 +10,8 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.FileWriter; import java.io.IOException; @@ -36,7 +36,7 @@ public class EventWriterTest { private Gene proteinT17; private 
EntityMention entityT13; - @Before + @BeforeEach public void setUp() throws Exception{ cas = JCasFactory.createJCas("src/test/resources/types/jcore-semantics-biology-types"); diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/ProteinWriterTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/ProteinWriterTest.java index 3871f07ff..2cdc5be50 100644 --- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/ProteinWriterTest.java +++ b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/ProteinWriterTest.java @@ -9,16 +9,16 @@ import de.julielab.jcore.types.Gene; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.FileWriter; import java.io.IOException; import java.io.Writer; import static org.easymock.classextension.EasyMock.*; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; public class ProteinWriterTest { @@ -35,7 +35,7 @@ public class ProteinWriterTest { private static final String DOCUMENT_TEXT = "Interferons inhibit activation of STAT6 by interleukin 4 in human monocytes by inducing SOCS-1 gene expression.\n" + "Interferons (IFNs) inhibit induction by IL-4 of multiple genes in human monocytes. However, the mechanism by which IFNs mediate this inhibition has not been defined. IL-4 activates gene expression by inducing tyrosine phosphorylation, homodimerization, and nuclear translocation of the latent transcription factor, STAT6 (signal transducer and activator of transcription-6). 
STAT6-responsive elements are characteristically present in the promoters of IL-4-inducible genes. Because STAT6 activation is essential for IL-4-induced gene expression, we examined the ability of type I and type II IFNs to regulate activation of STAT6 by IL-4 in primary human monocytes. Pretreatment of monocytes with IFN-beta or IFN-gamma, but not IL-1, IL-2, macrophage colony-stimulating factor, granulocyte/macrophage colony-stimulating factor, IL-6, or transforming growth factor beta suppressed activation of STAT6 by IL-4. This inhibition was associated with decreased tyrosine phosphorylation and nuclear translocation of STAT6 and was not evident unless the cells were preincubated with IFN for at least 1 hr before IL-4 stimulation. Furthermore, inhibition by IFN could be blocked by cotreatment with actinomycin D and correlated temporally with induction of the JAK/STAT inhibitory gene, SOCS-1. Forced expression of SOCS-1 in a macrophage cell line, RAW264, markedly suppressed trans-activation of an IL-4-inducible reporter as well as IL-6- and IFN-gamma-induced reporter gene activity. 
These findings demonstrate that IFNs inhibit IL-4-induced activation of STAT6 and STAT6-dependent gene expression, at least in part, by inducing expression of SOCS-1."; - @Before + @BeforeEach public void setUp() throws Exception{ cas = JCasFactory.createJCas("src/test/resources/types/jcore-semantics-biology-types"); diff --git a/jcore-bionlpformat-reader/pom.xml b/jcore-bionlpformat-reader/pom.xml index 65fcefb66..94aa1584f 100644 --- a/jcore-bionlpformat-reader/pom.xml +++ b/jcore-bionlpformat-reader/pom.xml @@ -41,8 +41,8 @@ ${jcore-utilities-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-bionlpformat-reader/src/main/java/de/julielab/jcore/reader/bionlpformat/utils/AnnotationFileMapper.java b/jcore-bionlpformat-reader/src/main/java/de/julielab/jcore/reader/bionlpformat/utils/AnnotationFileMapper.java index 70efe8571..5a265d736 100644 --- a/jcore-bionlpformat-reader/src/main/java/de/julielab/jcore/reader/bionlpformat/utils/AnnotationFileMapper.java +++ b/jcore-bionlpformat-reader/src/main/java/de/julielab/jcore/reader/bionlpformat/utils/AnnotationFileMapper.java @@ -17,8 +17,8 @@ import java.io.BufferedReader; import java.io.IOException; - import java.util.*; import java.util.List; + import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; diff --git a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/CoreferenceReadingTest.java b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/CoreferenceReadingTest.java index ce2926f00..1b2a68ac9 100644 --- a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/CoreferenceReadingTest.java +++ b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/CoreferenceReadingTest.java @@ -16,8 +16,8 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import 
org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; import org.xml.sax.SAXException; import java.io.FileOutputStream; @@ -25,7 +25,7 @@ import java.io.OutputStream; // Ignore because the data path does generally not exist; a fix should only contain some test data, not the whole dataset -@Ignore +@Disabled public class CoreferenceReadingTest { @Test public void testCoreferenceReading() throws UIMAException, IOException, diff --git a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/EventReaderTest.java b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/EventReaderTest.java index 9c7aea226..68c64fc94 100644 --- a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/EventReaderTest.java +++ b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/EventReaderTest.java @@ -17,23 +17,23 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.XMLInputSource; -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; import java.util.Set; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; // This test's EventReaderTest.xml descriptor points to local directories of Ekaterina Buyko and as such, the test doesn't work this way. However it might, if the data is made available as proper test data. 
-@Ignore +@Disabled public class EventReaderTest { private static final String DESCRIPTOR_FILE = "src/test/resources/de/julielab/jcore/reader/bionlpformat/desc/EventReaderTest.xml"; private CollectionReader collectionReader; - @Before + @BeforeEach public void setUp() throws Exception { CollectionReaderDescription readerDescription = (CollectionReaderDescription) UIMAFramework .getXMLParser().parseCollectionReaderDescription( diff --git a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AbstractFileMapperTest.java b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AbstractFileMapperTest.java index 25685ec01..c95a9d148 100644 --- a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AbstractFileMapperTest.java +++ b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AbstractFileMapperTest.java @@ -18,15 +18,15 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.XMLInputSource; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.BufferedReader; import static org.easymock.EasyMock.expect; import static org.easymock.classextension.EasyMock.*; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class AbstractFileMapperTest { @@ -34,7 +34,7 @@ public class AbstractFileMapperTest { private JCas cas; private TextFileMapper abstractFileMapper; - @Before + @BeforeEach public void setUp() throws Exception { CollectionReaderDescription readerDescription = UIMAFramework.getXMLParser() .parseCollectionReaderDescription(new XMLInputSource(DESCRIPTOR_FILE)); diff --git 
a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AnnotationFileMapperTest.java b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AnnotationFileMapperTest.java index 46bf09ee6..85b582ed0 100644 --- a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AnnotationFileMapperTest.java +++ b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AnnotationFileMapperTest.java @@ -28,8 +28,8 @@ import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.XMLInputSource; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.BufferedReader; import java.util.HashMap; @@ -37,8 +37,8 @@ import static org.easymock.EasyMock.expect; import static org.easymock.classextension.EasyMock.*; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -68,7 +68,7 @@ public class AnnotationFileMapperTest { private Gene t3; private Map mappedProteins; - @Before + @BeforeEach public void setUp() throws Exception { CollectionReaderDescription readerDescription = (CollectionReaderDescription) UIMAFramework.getXMLParser().parseCollectionReaderDescription(new XMLInputSource(DESCRIPTOR_FILE)); CollectionReader collectionReader = UIMAFramework.produceCollectionReader(readerDescription); diff --git a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/OntoFormatReaderTest.java b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/OntoFormatReaderTest.java index 2abfcc03d..24a3d7805 100644 --- 
a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/OntoFormatReaderTest.java +++ b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/OntoFormatReaderTest.java @@ -11,7 +11,7 @@ package de.julielab.jcore.reader.bionlp09event.utils; import de.julielab.jcore.reader.bionlpformat.utils.OntoFormatReader; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; diff --git a/jcore-biosem-ae/pom.xml b/jcore-biosem-ae/pom.xml index eec6bc55f..42cfd7194 100644 --- a/jcore-biosem-ae/pom.xml +++ b/jcore-biosem-ae/pom.xml @@ -52,8 +52,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java b/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java index 9a9f16a35..2b4011ff0 100644 --- a/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java +++ b/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java @@ -31,8 +31,8 @@ import utils.BioSemException; import utils.DBUtils; -import java.util.*; import java.util.List; +import java.util.*; import java.util.Map.Entry; public class BioSemEventAnnotator extends JCasAnnotator_ImplBase { diff --git a/jcore-biosem-ae/src/test/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotatorTest.java b/jcore-biosem-ae/src/test/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotatorTest.java index ae49970cd..da7a683de 100644 --- a/jcore-biosem-ae/src/test/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotatorTest.java +++ b/jcore-biosem-ae/src/test/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotatorTest.java @@ -22,15 +22,15 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ExternalResourceDescription; -import org.junit.Test; +import org.junit.jupiter.api.Test; import 
java.io.File; import java.io.FileInputStream; import java.util.Collections; import java.util.List; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class BioSemEventAnnotatorTest { @Test @@ -53,7 +53,7 @@ public void testProcess() throws Exception { if (testOutputFile.exists()) testOutputFile.delete(); - assertTrue("Test document was not found by the BioNLP ST reader.", bioNlpSTReader.hasNext()); + assertTrue(bioNlpSTReader.hasNext(), "Test document was not found by the BioNLP ST reader."); bioNlpSTReader.getNext(jCas.getCas()); engine.process(jCas); bioNlpSTWriter.process(jCas); diff --git a/jcore-conll-consumer/pom.xml b/jcore-conll-consumer/pom.xml index 4ba6ef20c..bbab62b95 100644 --- a/jcore-conll-consumer/pom.xml +++ b/jcore-conll-consumer/pom.xml @@ -24,8 +24,8 @@ logback-classic - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-conll-consumer/src/test/java/de/julielab/jcore/consumer/cas2conll/test/ConllConsumerTest.java b/jcore-conll-consumer/src/test/java/de/julielab/jcore/consumer/cas2conll/test/ConllConsumerTest.java index cb66ca825..ad46ef663 100644 --- a/jcore-conll-consumer/src/test/java/de/julielab/jcore/consumer/cas2conll/test/ConllConsumerTest.java +++ b/jcore-conll-consumer/src/test/java/de/julielab/jcore/consumer/cas2conll/test/ConllConsumerTest.java @@ -21,7 +21,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.BufferedReader; import java.io.File; @@ -30,7 +30,7 @@ import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertTrue; public class ConllConsumerTest { diff --git a/jcore-coordination-baseline-ae/pom.xml 
b/jcore-coordination-baseline-ae/pom.xml index ea88c0b43..0b54fac37 100644 --- a/jcore-coordination-baseline-ae/pom.xml +++ b/jcore-coordination-baseline-ae/pom.xml @@ -37,8 +37,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/ConjunctAnnotatorTest.java b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/ConjunctAnnotatorTest.java index fdca4b78e..6eb0c2ee6 100644 --- a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/ConjunctAnnotatorTest.java +++ b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/ConjunctAnnotatorTest.java @@ -7,7 +7,6 @@ package de.julielab.jcore.ae.coordbaseline.main; import de.julielab.jcore.types.*; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.cas.FSIterator; @@ -17,13 +16,16 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; +import static org.junit.jupiter.api.Assertions.assertTrue; -public class ConjunctAnnotatorTest extends TestCase + +public class ConjunctAnnotatorTest { private static final Logger LOGGER = LoggerFactory.getLogger(ConjunctAnnotatorTest.class); private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties"; @@ -32,19 +34,8 @@ public class ConjunctAnnotatorTest extends TestCase private static final String coordinationLabels2 = "antecedent,conjunct,conjunction,conjunct,antecedent,antecedent"; private static final String TEST_DESC = "src/test/resources/desc/ConjunctAnnotatorTest.xml"; - - - - - - 
-/*--------------------------------------------------------------------------------*/ - protected void setUp() throws Exception - { - super.setUp(); - } // of setUp -/*--------------------------------------------------------------------------------*/ - public void initCas(JCas jcas) + + public void initCas(JCas jcas) { jcas.reset(); @@ -558,6 +549,7 @@ public void initCas(JCas jcas) } // of initCas /*--------------------------------------------------------------------------------*/ + @Test public void testProcess() { XMLInputSource descriptor = null; @@ -595,7 +587,7 @@ public void testProcess() try { ae.process(jcas, null); - assertTrue("Invalid JCas!", checkJCas(jcas)); + assertTrue(checkJCas(jcas), "Invalid JCas!"); } // of try catch (Exception e) { diff --git a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/CoordinationAnnotatorTest.java b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/CoordinationAnnotatorTest.java index dd5416a7a..32662f928 100644 --- a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/CoordinationAnnotatorTest.java +++ b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/CoordinationAnnotatorTest.java @@ -22,7 +22,6 @@ package de.julielab.jcore.ae.coordbaseline.main; import de.julielab.jcore.types.*; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.cas.FSIterator; @@ -32,13 +31,16 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; +import static org.junit.jupiter.api.Assertions.assertTrue; -public class CoordinationAnnotatorTest extends TestCase + +public class 
CoordinationAnnotatorTest { private static final Logger LOGGER = LoggerFactory.getLogger(CoordinationAnnotatorTest.class); @@ -55,11 +57,7 @@ public class CoordinationAnnotatorTest extends TestCase private static final String TEST_DESC = "src/test/resources/desc/CoordinationAnnotatorTest.xml"; - protected void setUp() throws Exception - { - super.setUp(); - } // of setUp - + public void initCas(JCas jcas) { jcas.reset(); @@ -562,7 +560,7 @@ public void initCas(JCas jcas) } // of initCas - + @Test public void testProcess() { XMLInputSource descriptor = null; @@ -598,7 +596,7 @@ public void testProcess() try { ae.process(jcas, null); - assertTrue("Invalid JCas!", checkJCas(jcas)); + assertTrue(checkJCas(jcas), "Invalid JCas!"); } // of try catch (Exception e) diff --git a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EEEAnnotatorTest.java b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EEEAnnotatorTest.java index a010c3178..4203cdc16 100644 --- a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EEEAnnotatorTest.java +++ b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EEEAnnotatorTest.java @@ -7,7 +7,6 @@ package de.julielab.jcore.ae.coordbaseline.main; import de.julielab.jcore.types.*; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.cas.FSIterator; @@ -17,13 +16,16 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; +import static org.junit.jupiter.api.Assertions.assertTrue; -public class EEEAnnotatorTest extends TestCase + +public class EEEAnnotatorTest { private static final Logger 
LOGGER = LoggerFactory.getLogger(EEEAnnotatorTest.class); private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties"; @@ -33,12 +35,7 @@ public class EEEAnnotatorTest extends TestCase private static final String EEE2 = "simple upstream and downstream sequence elements"; private static final String TEST_DESC = "src/test/resources/desc/EEEAnnotatorTest.xml"; -/*--------------------------------------------------------------------------------*/ - protected void setUp() throws Exception - { - super.setUp(); - } // of setUp -/*--------------------------------------------------------------------------------*/ + public void initCas(JCas jcas) { jcas.reset(); @@ -538,6 +535,8 @@ public void initCas(JCas jcas) entity3.addToIndexes(); } // of initCas /*--------------------------------------------------------------------------------*/ + + @Test public void testProcess() { XMLInputSource descriptor = null; @@ -575,7 +574,7 @@ public void testProcess() try { ae.process(jcas, null); - assertTrue("Invalid JCas!", checkJCas(jcas)); + assertTrue(checkJCas(jcas), "Invalid JCas!"); } // of try diff --git a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EllipsisAnnotatorTest.java b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EllipsisAnnotatorTest.java index 749371a51..94d697619 100644 --- a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EllipsisAnnotatorTest.java +++ b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EllipsisAnnotatorTest.java @@ -7,7 +7,6 @@ package de.julielab.jcore.ae.coordbaseline.main; import de.julielab.jcore.types.*; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.cas.FSIterator; @@ -17,12 +16,15 @@ import org.apache.uima.resource.ResourceInitializationException; import 
org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; -public class EllipsisAnnotatorTest extends TestCase +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class EllipsisAnnotatorTest { private static final Logger LOGGER = LoggerFactory.getLogger(EllipsisAnnotatorTest.class); private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties"; @@ -30,12 +32,7 @@ public class EllipsisAnnotatorTest extends TestCase private static final String ellipsis1 = "X cells, Y cells, and Z cells"; private static final String ellipsis2 = "simple upstream sequence elements and simple downstream sequence elements"; private static final String TEST_DESC = "src/test/resources/desc/EllipsisAnnotatorTest.xml"; -/*--------------------------------------------------------------------------------*/ - protected void setUp() throws Exception - { - super.setUp(); - } // of setUp -/*--------------------------------------------------------------------------------*/ + public void initCas(JCas jcas) { jcas.reset(); @@ -697,6 +694,8 @@ public void initCas(JCas jcas) c26.addToIndexes(); } // of initCas /*---------------------------------------------------------------------------*/ + + @Test public void testProcess() { XMLInputSource descriptor = null; @@ -734,7 +733,7 @@ public void testProcess() try { ae.process(jcas, null); - assertTrue("Invalid JCas!", checkJCas(jcas)); + assertTrue(checkJCas(jcas), "Invalid JCas!"); } // of try catch (Exception e) { diff --git a/jcore-cord19-reader/pom.xml b/jcore-cord19-reader/pom.xml index b77f93e91..d9f7736d7 100644 --- a/jcore-cord19-reader/pom.xml +++ b/jcore-cord19-reader/pom.xml @@ -48,8 +48,8 @@ ${jcore-utilities-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine org.assertj diff --git 
a/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReaderTest.java b/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReaderTest.java index f7a8e8fcf..0453a1cde 100644 --- a/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReaderTest.java +++ b/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReaderTest.java @@ -15,7 +15,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.nio.file.Path; import java.util.Collection; diff --git a/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/JsonFormatTest.java b/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/JsonFormatTest.java index dba932cac..5e39b79d0 100644 --- a/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/JsonFormatTest.java +++ b/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/JsonFormatTest.java @@ -4,7 +4,7 @@ import de.julielab.jcore.reader.cord19.jsonformat.Affiliation; import de.julielab.jcore.reader.cord19.jsonformat.Author; import de.julielab.jcore.reader.cord19.jsonformat.Cord19Document; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.nio.file.Path; diff --git a/jcore-coreference-writer/pom.xml b/jcore-coreference-writer/pom.xml index ee4c26044..1bafb6e13 100644 --- a/jcore-coreference-writer/pom.xml +++ b/jcore-coreference-writer/pom.xml @@ -41,8 +41,8 @@ ${jcore-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe Coreference Writer diff --git a/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java b/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java index 32613e57d..c85dcfa82 100644 --- 
a/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java +++ b/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java @@ -1,7 +1,6 @@ package de.julielab.jcore.consumer.coreference; import de.julielab.java.utilities.FileUtilities; -import de.julielab.jcore.types.Abbreviation; import de.julielab.jcore.types.CorefExpression; import de.julielab.jcore.types.CorefRelation; import de.julielab.jcore.utility.JCoReTools; @@ -15,17 +14,12 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.cas.FSArray; -import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import java.io.File; import java.io.IOException; import java.io.OutputStream; -import java.util.HashMap; import java.util.Iterator; -import java.util.Map; -import java.util.Spliterators; @ResourceMetaData(name = "JCoRe Coreference Writer", description = "Writes co-reference annotation to a text file.") public class CoreferenceWriter extends JCasAnnotator_ImplBase { diff --git a/jcore-cpe-db-runner/pom.xml b/jcore-cpe-db-runner/pom.xml index d84ab5a84..62e879169 100644 --- a/jcore-cpe-db-runner/pom.xml +++ b/jcore-cpe-db-runner/pom.xml @@ -71,8 +71,8 @@ ${project.parent.version} - junit - junit + org.junit.jupiter + junit-jupiter-engine https://github.com/JULIELab/jcore-base/tree/master/jcore-cpe-db-runner diff --git a/jcore-ct-reader/pom.xml b/jcore-ct-reader/pom.xml index ac50c8cdb..0630444f6 100644 --- a/jcore-ct-reader/pom.xml +++ b/jcore-ct-reader/pom.xml @@ -41,8 +41,8 @@ ${jcore-utilities-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe Clinical Trials Reader diff --git a/jcore-ct-reader/src/test/java/de/julielab/jcore/reader/ct/ClinicalTrialsReaderTest.java 
b/jcore-ct-reader/src/test/java/de/julielab/jcore/reader/ct/ClinicalTrialsReaderTest.java index b1aa75967..140b19874 100644 --- a/jcore-ct-reader/src/test/java/de/julielab/jcore/reader/ct/ClinicalTrialsReaderTest.java +++ b/jcore-ct-reader/src/test/java/de/julielab/jcore/reader/ct/ClinicalTrialsReaderTest.java @@ -11,13 +11,13 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.Collection; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; /** * Unit tests for jcore-ct-reader. diff --git a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierReaderTest.java b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierReaderTest.java index 579613897..11aa0d9ab 100644 --- a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierReaderTest.java +++ b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierReaderTest.java @@ -12,22 +12,22 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.BeforeClass; import org.junit.ClassRule; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import org.testcontainers.containers.PostgreSQLContainer; import java.io.IOException; import java.sql.SQLException; import static de.julielab.jcore.reader.db.TableReaderConstants.*; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class DBMultiplierReaderTest { @ClassRule public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); - @BeforeClass + @BeforeAll public static void setup() throws SQLException { DataBaseConnector dbc = 
DBTestUtils.getDataBaseConnector(postgres); try (final CoStoSysConnection ignore = dbc.obtainOrReserveConnection()) { diff --git a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierTest.java b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierTest.java index 2816f9535..fa378c49e 100644 --- a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierTest.java +++ b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierTest.java @@ -19,9 +19,9 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.BeforeClass; import org.junit.ClassRule; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.testcontainers.containers.PostgreSQLContainer; @@ -32,14 +32,14 @@ import java.sql.SQLException; import static de.julielab.jcore.reader.db.TableReaderConstants.*; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class DBMultiplierTest { private final static Logger log = LoggerFactory.getLogger(DBMultiplierTest.class); @ClassRule public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); - @BeforeClass + @BeforeAll public static void setup() throws SQLException, IOException { DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); dbc.reserveConnection(); diff --git a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBReaderTest.java b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBReaderTest.java index 8ed7c86bf..015d3e3f5 100644 --- a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBReaderTest.java +++ b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBReaderTest.java @@ -12,9 +12,9 @@ import org.apache.uima.fit.factory.CollectionReaderFactory; import 
org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.BeforeClass; import org.junit.ClassRule; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.testcontainers.containers.PostgreSQLContainer; @@ -25,14 +25,14 @@ import java.sql.SQLException; import static de.julielab.jcore.reader.db.TableReaderConstants.*; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class DBReaderTest { @ClassRule public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); - @BeforeClass + @BeforeAll public static void setup() throws SQLException { DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); dbc.reserveConnection(); diff --git a/jcore-descriptor-creator/pom.xml b/jcore-descriptor-creator/pom.xml index aae843561..0c410747d 100644 --- a/jcore-descriptor-creator/pom.xml +++ b/jcore-descriptor-creator/pom.xml @@ -46,8 +46,8 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-descriptor-creator/src/test/java/de/julielab/jcore/misc/DescriptorCreatorTest.java b/jcore-descriptor-creator/src/test/java/de/julielab/jcore/misc/DescriptorCreatorTest.java index 534fadda9..41a146892 100644 --- a/jcore-descriptor-creator/src/test/java/de/julielab/jcore/misc/DescriptorCreatorTest.java +++ b/jcore-descriptor-creator/src/test/java/de/julielab/jcore/misc/DescriptorCreatorTest.java @@ -1,27 +1,24 @@ package de.julielab.jcore.misc; -import static java.util.stream.Collectors.joining; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import de.julielab.java.utilities.IOStreamUtilities; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileInputStream; import 
java.io.IOException; import java.nio.file.Path; -import java.util.Arrays; -import java.util.Optional; import java.util.stream.Stream; -import de.julielab.java.utilities.IOStreamUtilities; -import org.apache.commons.io.FileUtils; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.Test; +import static java.util.stream.Collectors.joining; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; public class DescriptorCreatorTest { - @BeforeClass - @AfterClass + @BeforeAll + @AfterAll public static void shutdown() throws IOException { //FileUtils.deleteDirectory(new File(Arrays.asList("src", "test", "resources", "de").stream().collect(joining(File.separator)))); } diff --git a/jcore-descriptor-creator/src/test/java/de/julielab/jcore/reader/testreader/TestReader.java b/jcore-descriptor-creator/src/test/java/de/julielab/jcore/reader/testreader/TestReader.java index 937c00e4d..36e70a5cc 100644 --- a/jcore-descriptor-creator/src/test/java/de/julielab/jcore/reader/testreader/TestReader.java +++ b/jcore-descriptor-creator/src/test/java/de/julielab/jcore/reader/testreader/TestReader.java @@ -1,12 +1,12 @@ package de.julielab.jcore.reader.testreader; -import java.io.IOException; - import org.apache.uima.cas.CAS; import org.apache.uima.collection.CollectionException; import org.apache.uima.collection.CollectionReader_ImplBase; import org.apache.uima.util.Progress; +import java.io.IOException; + public class TestReader extends CollectionReader_ImplBase { @Override diff --git a/jcore-dta-reader/pom.xml b/jcore-dta-reader/pom.xml index b47f53e66..7f5b51af2 100644 --- a/jcore-dta-reader/pom.xml +++ b/jcore-dta-reader/pom.xml @@ -70,7 +70,11 @@ org.slf4j slf4j-api - junitjunit + + org.junit.jupiter + junit-jupiter-engine + + Reader for DTA files (German digital humanities corpus) http://www.julielab.de diff --git 
a/jcore-dta-reader/src/test/java/de/julielab/jcore/reader/dta/DTAFileReaderTest.java b/jcore-dta-reader/src/test/java/de/julielab/jcore/reader/dta/DTAFileReaderTest.java index eadb4101b..0e2b0f995 100644 --- a/jcore-dta-reader/src/test/java/de/julielab/jcore/reader/dta/DTAFileReaderTest.java +++ b/jcore-dta-reader/src/test/java/de/julielab/jcore/reader/dta/DTAFileReaderTest.java @@ -24,13 +24,13 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileInputStream; import java.util.*; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class DTAFileReaderTest { diff --git a/jcore-dta-reader/src/test/java/de/julielab/jcore/reader/dta/util/DTAUtilsTest.java b/jcore-dta-reader/src/test/java/de/julielab/jcore/reader/dta/util/DTAUtilsTest.java index 3f9780106..b971e6cfc 100644 --- a/jcore-dta-reader/src/test/java/de/julielab/jcore/reader/dta/util/DTAUtilsTest.java +++ b/jcore-dta-reader/src/test/java/de/julielab/jcore/reader/dta/util/DTAUtilsTest.java @@ -14,14 +14,14 @@ import de.julielab.jcore.reader.dta.DTAFileReaderTest.Version; import de.julielab.jcore.types.extensions.dta.DTABelletristik; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class DTAUtilsTest { diff --git a/jcore-ec-code-ae/pom.xml b/jcore-ec-code-ae/pom.xml index 05cc496a5..6f0c55f60 100644 --- a/jcore-ec-code-ae/pom.xml +++ b/jcore-ec-code-ae/pom.xml @@ -28,8 +28,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git 
a/jcore-ec-code-ae/src/test/java/de/julielab/jcore/ae/ec/ECNumberAnnotatorTest.java b/jcore-ec-code-ae/src/test/java/de/julielab/jcore/ae/ec/ECNumberAnnotatorTest.java index 8408f9d56..3960c59a9 100644 --- a/jcore-ec-code-ae/src/test/java/de/julielab/jcore/ae/ec/ECNumberAnnotatorTest.java +++ b/jcore-ec-code-ae/src/test/java/de/julielab/jcore/ae/ec/ECNumberAnnotatorTest.java @@ -18,9 +18,9 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class ECNumberAnnotatorTest { @Test diff --git a/jcore-elasticsearch-consumer/pom.xml b/jcore-elasticsearch-consumer/pom.xml index 540e2f7d1..a4fed0dc9 100644 --- a/jcore-elasticsearch-consumer/pom.xml +++ b/jcore-elasticsearch-consumer/pom.xml @@ -68,6 +68,10 @@ icu4j 55.1 + + org.apache.commons + commons-lang3 + org.mapdb mapdb @@ -89,6 +93,10 @@ logback-classic test + + org.junit.jupiter + junit-jupiter + JULIE Lab Jena, Germany diff --git a/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/AbstractFieldGeneratorTest.java b/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/AbstractFieldGeneratorTest.java index 68292673e..588489b8c 100644 --- a/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/AbstractFieldGeneratorTest.java +++ b/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/AbstractFieldGeneratorTest.java @@ -367,6 +367,6 @@ public void testAddArray() { builder.registerTypeAdapter(PreanalyzedFieldValue.class, new PreanalyzedFieldValue.PreanalyzedFieldValueGsonAdapter()); Gson gson = builder.create(); - assertEquals("{\"field\":[\"eins\",\"zwei\"]}", gson.toJson(d)); + assertEquals("{\"field\":[\"eins\",\"zwei\"]}", gson.toJson(d)); } } diff --git 
a/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/ElasticSearchConsumerIT.java b/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/ElasticSearchConsumerIT.java index 849c005df..36a71fbe0 100644 --- a/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/ElasticSearchConsumerIT.java +++ b/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/ElasticSearchConsumerIT.java @@ -1,17 +1,15 @@ package de.julielab.jcore.consumer.es; -import de.julielab.java.utilities.IOStreamUtilities; import de.julielab.jcore.consumer.es.preanalyzed.Document; import de.julielab.jcore.consumer.es.preanalyzed.RawToken; import de.julielab.jcore.types.Header; import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.cas.CASException; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.BeforeClass; import org.junit.ClassRule; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.testcontainers.containers.GenericContainer; @@ -19,12 +17,11 @@ import org.testcontainers.containers.output.Slf4jLogConsumer; import org.testcontainers.shaded.com.fasterxml.jackson.databind.ObjectMapper; -import java.net.HttpURLConnection; import java.net.URL; import java.time.Duration; import java.util.Map; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class ElasticSearchConsumerIT { public static final String TEST_INDEX = "testindex"; @@ -39,7 +36,7 @@ public class ElasticSearchConsumerIT { .withStartupTimeout(Duration.ofMinutes(2)) .withEnv("cluster.name", TEST_CLUSTER); - @BeforeClass + @BeforeAll public static void setup() { Slf4jLogConsumer toStringConsumer = new Slf4jLogConsumer(log); es.followOutput(toStringConsumer, 
OutputFrame.OutputType.STDOUT); diff --git a/jcore-embedding-writer/pom.xml b/jcore-embedding-writer/pom.xml index d5d5304a6..d294419fd 100644 --- a/jcore-embedding-writer/pom.xml +++ b/jcore-embedding-writer/pom.xml @@ -28,14 +28,18 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine de.julielab jcore-utilities ${jcore-utilities-version} + + org.apache.commons + commons-lang3 + org.assertj assertj-core diff --git a/jcore-embedding-writer/src/test/java/de/julielab/jcore/consumer/ew/DecoderTest.java b/jcore-embedding-writer/src/test/java/de/julielab/jcore/consumer/ew/DecoderTest.java index 85ed94597..d34cdd780 100644 --- a/jcore-embedding-writer/src/test/java/de/julielab/jcore/consumer/ew/DecoderTest.java +++ b/jcore-embedding-writer/src/test/java/de/julielab/jcore/consumer/ew/DecoderTest.java @@ -2,7 +2,7 @@ import org.apache.commons.lang3.tuple.Pair; import org.assertj.core.data.Offset; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.*; import java.nio.ByteBuffer; diff --git a/jcore-embedding-writer/src/test/java/de/julielab/jcore/consumer/ew/EmbeddingWriterTest.java b/jcore-embedding-writer/src/test/java/de/julielab/jcore/consumer/ew/EmbeddingWriterTest.java index 6a80fbcbe..8d19cf1ce 100644 --- a/jcore-embedding-writer/src/test/java/de/julielab/jcore/consumer/ew/EmbeddingWriterTest.java +++ b/jcore-embedding-writer/src/test/java/de/julielab/jcore/consumer/ew/EmbeddingWriterTest.java @@ -10,7 +10,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.DoubleArray; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileInputStream; diff --git a/jcore-event-flattener-ae/pom.xml b/jcore-event-flattener-ae/pom.xml index 423a141b9..91788a532 100644 --- a/jcore-event-flattener-ae/pom.xml +++ b/jcore-event-flattener-ae/pom.xml @@ -1,48 +1,56 @@ - - 4.0.0 - - de.julielab - jcore-base - 2.6.0-SNAPSHOT - - 
jcore-event-flattener-ae - JCoRe Event Flattener AE - This component reads de.julielab.jcore.types.EventMention annotations and converts event structures into de.julielab.jcore.types.ext.FlattenedRelation annotation. The purpose of FlattenedRelations is to represent complex event structures in a more simple manner. This can be helpful for visualization or further processing. - - - org.slf4j - slf4j-api - - - com.google.guava - guava - 18.0 - test - - - de.julielab - jcore-types - ${jcore-types-version} - - - ch.qos.logback - logback-classic - test - - - de.julielab - jcore-descriptor-creator - - junitjunit - - JULIE Lab Jena, Germany - http://www.julielab.de - - https://github.com/JULIELab/jcore-base/tree/master/jcore-event-flattener-ae - - - BSD-2-Clause - https://opensource.org/licenses/BSD-2-Clause - - + + 4.0.0 + + de.julielab + jcore-base + 2.6.0-SNAPSHOT + + jcore-event-flattener-ae + JCoRe Event Flattener AE + This component reads de.julielab.jcore.types.EventMention annotations and converts event structures + into de.julielab.jcore.types.ext.FlattenedRelation annotation. The purpose of FlattenedRelations is to represent + complex event structures in a more simple manner. This can be helpful for visualization or further processing. 
+ + + + org.slf4j + slf4j-api + + + com.google.guava + guava + 18.0 + test + + + de.julielab + jcore-types + ${jcore-types-version} + + + ch.qos.logback + logback-classic + test + + + de.julielab + jcore-descriptor-creator + + + org.junit.jupiter + junit-jupiter-engine + + + + JULIE Lab Jena, Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-event-flattener-ae + + + BSD-2-Clause + https://opensource.org/licenses/BSD-2-Clause + + diff --git a/jcore-event-flattener-ae/src/test/java/de/julielab/jules/ae/EventFlattenerTest.java b/jcore-event-flattener-ae/src/test/java/de/julielab/jules/ae/EventFlattenerTest.java index ea1c0d4c3..8af8ce297 100644 --- a/jcore-event-flattener-ae/src/test/java/de/julielab/jules/ae/EventFlattenerTest.java +++ b/jcore-event-flattener-ae/src/test/java/de/julielab/jules/ae/EventFlattenerTest.java @@ -13,108 +13,108 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.FileInputStream; import java.util.Set; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class EventFlattenerTest { - @SuppressWarnings("unused") - private final static Logger log = LoggerFactory - .getLogger(EventFlattenerTest.class); + @SuppressWarnings("unused") + private final static Logger log = LoggerFactory + .getLogger(EventFlattenerTest.class); - @Test - public void testProcess() throws Exception, SecurityException { - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-all-types"); - XmiCasDeserializer.deserialize(new FileInputStream( - "src/test/resources/21499307.xmi"), jCas - .getCas()); + @Test + public void testProcess() throws Exception, 
SecurityException { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-all-types"); + XmiCasDeserializer.deserialize(new FileInputStream( + "src/test/resources/21499307.xmi"), jCas + .getCas()); - AnalysisEngine flattener = AnalysisEngineFactory - .createEngine(EventFlattener.class); - flattener.process(jCas); + AnalysisEngine flattener = AnalysisEngineFactory + .createEngine(EventFlattener.class); + flattener.process(jCas); - FSIterator sentit = jCas.getAnnotationIndex(Sentence.type) - .iterator(); - int sentenceCounter = 1; - // we are interested in the 8th sentence because there is the only complex event structure there - Sentence interestingSent = null; - while (sentit.hasNext()) { - Sentence s = (Sentence) sentit.next(); - switch (sentenceCounter) { - case 3: - assertEquals("Wrong number of flattened events in sentence " - + s.getCoveredText(), 2, countEventsInSentence(s)); - break; - case 5: - assertEquals("Wrong number of flattened events in sentence " - + s.getCoveredText(), 1, countEventsInSentence(s)); - break; - case 6: - assertEquals("Wrong number of flattened events in sentence " - + s.getCoveredText(), 2, countEventsInSentence(s)); - break; - case 7: - assertEquals("Wrong number of flattened events in sentence " - + s.getCoveredText(), 4, countEventsInSentence(s)); - break; - case 8: - assertEquals("Wrong number of flattened events in sentence " - + s.getCoveredText(), 6, countEventsInSentence(s)); - interestingSent = s; - break; - case 9: - assertEquals("Wrong number of flattened events in sentence " - + s.getCoveredText(), 1, countEventsInSentence(s)); - break; - default: - assertEquals("Wrong number of flattened events in sentence " - + s.getCoveredText(), 0, countEventsInSentence(s)); - } - sentenceCounter++; - } - FSIterator flateventit = jCas - .getAnnotationIndex(FlattenedRelation.type).subiterator(interestingSent); - while (flateventit.hasNext()) { - FlattenedRelation fr = (FlattenedRelation) flateventit.next(); - if 
(fr.getId().equals("FE" + 13)) { - // All arguments there? - Set expectedArguments = Sets.newHashSet("anti-apoptotic Bcl-2", "CSN5"); - for (int i = 0; i < fr.getArguments().size(); ++i) - assertTrue("Unexpected argument: " + fr.getArguments(i).getCoveredText(), expectedArguments.remove(fr.getArguments(i).getCoveredText())); - assertTrue("Expected arguments not found in relation: " + expectedArguments, expectedArguments.isEmpty()); - // Arguments correctly divided into agents and patients? - assertEquals(1, fr.getAgents().size()); - assertEquals(1, fr.getPatients().size()); - assertEquals("CSN5", fr.getAgents(0).getCoveredText()); - assertEquals("anti-apoptotic Bcl-2", fr.getPatients(0).getCoveredText()); - // All participating (sub-)events there? - assertEquals(3, fr.getRelations().size()); - Set expectedRelations = Sets.newHashSet("depletion", "caused", "expression"); - for (int i = 0; i < fr.getRelations().size(); ++i) - assertTrue("Unexpected relation: " + fr.getRelations(i).getCoveredText(), expectedRelations.remove(fr.getRelations(i).getCoveredText())); - assertTrue(expectedRelations.isEmpty()); - } - } - - } + FSIterator sentit = jCas.getAnnotationIndex(Sentence.type) + .iterator(); + int sentenceCounter = 1; + // we are interested in the 8th sentence because there is the only complex event structure there + Sentence interestingSent = null; + while (sentit.hasNext()) { + Sentence s = (Sentence) sentit.next(); + switch (sentenceCounter) { + case 3: + assertEquals(2, countEventsInSentence(s), "Wrong number of flattened events in sentence " + + s.getCoveredText()); + break; + case 5: + assertEquals(1, countEventsInSentence(s), "Wrong number of flattened events in sentence " + + s.getCoveredText()); + break; + case 6: + assertEquals(2, countEventsInSentence(s), "Wrong number of flattened events in sentence " + + s.getCoveredText()); + break; + case 7: + assertEquals(4, countEventsInSentence(s), "Wrong number of flattened events in sentence " + + 
s.getCoveredText()); + break; + case 8: + assertEquals(6, countEventsInSentence(s), "Wrong number of flattened events in sentence " + + s.getCoveredText()); + interestingSent = s; + break; + case 9: + assertEquals(1, countEventsInSentence(s), "Wrong number of flattened events in sentence " + + s.getCoveredText()); + break; + default: + assertEquals(0, countEventsInSentence(s), "Wrong number of flattened events in sentence " + + s.getCoveredText()); + } + sentenceCounter++; + } + FSIterator flateventit = jCas + .getAnnotationIndex(FlattenedRelation.type).subiterator(interestingSent); + while (flateventit.hasNext()) { + FlattenedRelation fr = (FlattenedRelation) flateventit.next(); + if (fr.getId().equals("FE" + 13)) { + // All arguments there? + Set expectedArguments = Sets.newHashSet("anti-apoptotic Bcl-2", "CSN5"); + for (int i = 0; i < fr.getArguments().size(); ++i) + assertTrue(expectedArguments.remove(fr.getArguments(i).getCoveredText()), "Unexpected argument: " + fr.getArguments(i).getCoveredText()); + assertTrue(expectedArguments.isEmpty(), "Expected arguments not found in relation: " + expectedArguments); + // Arguments correctly divided into agents and patients? + assertEquals(1, fr.getAgents().size()); + assertEquals(1, fr.getPatients().size()); + assertEquals("CSN5", fr.getAgents(0).getCoveredText()); + assertEquals("anti-apoptotic Bcl-2", fr.getPatients(0).getCoveredText()); + // All participating (sub-)events there? 
+ assertEquals(3, fr.getRelations().size()); + Set expectedRelations = Sets.newHashSet("depletion", "caused", "expression"); + for (int i = 0; i < fr.getRelations().size(); ++i) + assertTrue(expectedRelations.remove(fr.getRelations(i).getCoveredText()), "Unexpected relation: " + fr.getRelations(i).getCoveredText()); + assertTrue(expectedRelations.isEmpty()); + } + } - private int countEventsInSentence(Sentence s) throws CASRuntimeException, - CASException { - FSIterator flateventit = s.getCAS().getJCas() - .getAnnotationIndex(FlattenedRelation.type).subiterator(s); - int count = 0; - while (flateventit.hasNext()) { - @SuppressWarnings("unused") - Annotation annotation = (Annotation) flateventit.next(); - count++; - } - return count; - } + } + + private int countEventsInSentence(Sentence s) throws CASRuntimeException, + CASException { + FSIterator flateventit = s.getCAS().getJCas() + .getAnnotationIndex(FlattenedRelation.type).subiterator(s); + int count = 0; + while (flateventit.hasNext()) { + @SuppressWarnings("unused") + Annotation annotation = (Annotation) flateventit.next(); + count++; + } + return count; + } } diff --git a/jcore-feature-value-replacement-ae/pom.xml b/jcore-feature-value-replacement-ae/pom.xml index f3e120d76..8395dfefd 100644 --- a/jcore-feature-value-replacement-ae/pom.xml +++ b/jcore-feature-value-replacement-ae/pom.xml @@ -34,8 +34,8 @@ jcore-descriptor-creator - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-feature-value-replacement-ae/src/test/java/de/julielab/jcore/ae/fvr/FeatureValueReplacementAnnotatorTest.java b/jcore-feature-value-replacement-ae/src/test/java/de/julielab/jcore/ae/fvr/FeatureValueReplacementAnnotatorTest.java index efb4df831..81958daf1 100644 --- a/jcore-feature-value-replacement-ae/src/test/java/de/julielab/jcore/ae/fvr/FeatureValueReplacementAnnotatorTest.java +++ b/jcore-feature-value-replacement-ae/src/test/java/de/julielab/jcore/ae/fvr/FeatureValueReplacementAnnotatorTest.java @@ 
-10,11 +10,11 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.resource.ExternalResourceDescription; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class FeatureValueReplacementAnnotatorTest { @Test diff --git a/jcore-file-reader/pom.xml b/jcore-file-reader/pom.xml index 0de264d3b..179cc5647 100644 --- a/jcore-file-reader/pom.xml +++ b/jcore-file-reader/pom.xml @@ -26,8 +26,8 @@ julielab-java-utilities - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-file-reader/src/test/java/de/julielab/jcore/reader/file/main/FileReaderTest.java b/jcore-file-reader/src/test/java/de/julielab/jcore/reader/file/main/FileReaderTest.java index f5f2f9cd7..f1e440d04 100644 --- a/jcore-file-reader/src/test/java/de/julielab/jcore/reader/file/main/FileReaderTest.java +++ b/jcore-file-reader/src/test/java/de/julielab/jcore/reader/file/main/FileReaderTest.java @@ -29,16 +29,16 @@ import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class FileReaderTest { @@ -107,7 +107,7 @@ public class FileReaderTest { private static final String FILE_ARTIFACT_4 = "data/onlyToken/8563171.txt"; - @BeforeClass + @BeforeAll public static void setUp() throws Exception { writeArtifact(ARTIFACT_1, FILE_ARTIFACT_1); diff --git 
a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/NerTaggingResponse.java b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/NerTaggingResponse.java index f28e7bd22..b876a0731 100644 --- a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/NerTaggingResponse.java +++ b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/NerTaggingResponse.java @@ -1,7 +1,6 @@ package de.julielab.jcore.ae.flairner; import java.util.List; -import java.util.stream.Stream; /** *

A class to assemble the response from FLAIR for a tagging request. The found entities are returned as diff --git a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/PythonConnector.java b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/PythonConnector.java index 2ba03c82c..f7a09ba7b 100644 --- a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/PythonConnector.java +++ b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/PythonConnector.java @@ -1,7 +1,6 @@ package de.julielab.jcore.ae.flairner; import de.julielab.jcore.types.Sentence; -import org.apache.commons.lang3.tuple.Pair; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import java.io.IOException; diff --git a/jcore-flair-ner-ae/src/test/resources/1681975.xmi b/jcore-flair-ner-ae/src/test/resources/1681975.xmi index 467d07936..04b9b74fa 100644 --- a/jcore-flair-ner-ae/src/test/resources/1681975.xmi +++ b/jcore-flair-ner-ae/src/test/resources/1681975.xmi @@ -1 +1,5 @@ -1681975 \ No newline at end of file + +1681975 \ No newline at end of file diff --git a/jcore-flair-token-embedding-ae/pom.xml b/jcore-flair-token-embedding-ae/pom.xml index 483998eda..251ffedb9 100644 --- a/jcore-flair-token-embedding-ae/pom.xml +++ b/jcore-flair-token-embedding-ae/pom.xml @@ -33,8 +33,8 @@ 1.0.1 - junit - junit + org.junit.jupiter + junit-jupiter-engine com.google.code.gson diff --git a/jcore-flair-token-embedding-ae/src/main/resources/de/julielab/jcore/ae/fte/python/getEmbeddingScript.py b/jcore-flair-token-embedding-ae/src/main/resources/de/julielab/jcore/ae/fte/python/getEmbeddingScript.py index a262f84af..43095851a 100644 --- a/jcore-flair-token-embedding-ae/src/main/resources/de/julielab/jcore/ae/fte/python/getEmbeddingScript.py +++ b/jcore-flair-token-embedding-ae/src/main/resources/de/julielab/jcore/ae/fte/python/getEmbeddingScript.py @@ -1,15 +1,11 @@ -import os -from flair.models import SequenceTagger +import json +import sys +import time 
from flair.data import Sentence -from typing import List - -from flair.embeddings import WordEmbeddings, CharacterEmbeddings, BytePairEmbeddings, FlairEmbeddings, BertEmbeddings, ELMoEmbeddings from flair.embeddings import StackedEmbeddings - -import sys -import json +from flair.embeddings import WordEmbeddings, CharacterEmbeddings, BytePairEmbeddings, FlairEmbeddings, BertEmbeddings, \ + ELMoEmbeddings from struct import * -import time def decodeString(buffer): diff --git a/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/EmbeddingScriptTest.java b/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/EmbeddingScriptTest.java index ee2ff04ae..d62ad9b4e 100644 --- a/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/EmbeddingScriptTest.java +++ b/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/EmbeddingScriptTest.java @@ -5,8 +5,8 @@ import de.julielab.ipc.javabridge.ResultDecoders; import de.julielab.ipc.javabridge.StdioBridge; import org.assertj.core.data.Offset; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.HashMap; @@ -20,11 +20,11 @@ public class EmbeddingScriptTest { private static final String SCRIPT_PATH = "src/main/resources/de/julielab/jcore/ae/fte/python/getEmbeddingScript.py"; private static String pythonCommand; - @BeforeClass + @BeforeAll public static void setup() { pythonCommand = System.getenv("PYTHON"); if (pythonCommand == null) - pythonCommand = "python3.6"; + pythonCommand = "python"; } @Test @@ -49,11 +49,7 @@ public void testPythonEmbeddingScriptSimple() throws Exception { final double[][] vectors = response.map(ResultDecoders.decodeVectors).findAny().get(); bridge.stop(); - assertThat(vectors).hasSize(10); - for (double[] vector : vectors) { - // The vectors should all have a dimensionality of 1024 - 
assertThat(vector.length).isEqualTo(1024); - } + assertThat(vectors).hasDimensions(10, 1024); // Those values were output using print(token.embedding.numpy(), file=sys.stderr) in the script assertThat(vectors[0][0]).isCloseTo(1.8812446e-01, Offset.offset(0.000001)); @@ -86,11 +82,7 @@ public void testPythonEmbeddingScriptSpecificVectorsResponse() throws Exception final double[][] vectors = response.map(ResultDecoders.decodeVectors).findAny().get(); bridge.stop(); - assertThat(vectors).hasSize(2); - for (int i = 0; i < vectors.length; i++) { - // The vectors should all have a dimensionality of 1024 - assertThat(vectors[i].length).isEqualTo(1024); - } + assertThat(vectors).hasDimensions(2, 1024); // Those values were output using print(token.embedding.numpy(), file=sys.stderr) in the script assertThat(vectors[0][0]).isCloseTo(-0.16511102, Offset.offset(0.000001)); @@ -128,7 +120,7 @@ public void testPythonEmbeddingScriptMultipleSentences() throws Exception { final double[][] vectors = response.map(ResultDecoders.decodeVectors).findAny().get(); bridge.stop(); - assertThat(vectors).hasSize(12); + assertThat(vectors.length).isEqualTo(12); } } diff --git a/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotatorTest.java b/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotatorTest.java index 200bb491c..d67615d3e 100644 --- a/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotatorTest.java +++ b/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotatorTest.java @@ -8,7 +8,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Collection; @@ -29,7 +29,9 @@ public void testEmbeddingAnnotator() throws Exception { addTokens(jCas); final String embeddingPath = 
"flair:src/test/resources/gene_small_best_lm.pt"; - final AnalysisEngine engine = AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.fte.desc.jcore-flair-token-embedding-ae", FlairTokenEmbeddingAnnotator.PARAM_EMBEDDING_PATH, embeddingPath); + final AnalysisEngine engine = AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.fte.desc.jcore-flair-token-embedding-ae", + FlairTokenEmbeddingAnnotator.PARAM_EMBEDDING_PATH, embeddingPath, + FlairTokenEmbeddingAnnotator.PARAM_PYTHON_EXECUTABLE, "python"); engine.process(jCas); @@ -58,7 +60,10 @@ public void testEmbeddingAnnotatorWithFilterAnnotation() throws Exception { new Gene(jCas, 75, 91).addToIndexes(); final String embeddingPath = "flair:src/test/resources/gene_small_best_lm.pt"; - final AnalysisEngine engine = AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.fte.desc.jcore-flair-token-embedding-ae", FlairTokenEmbeddingAnnotator.PARAM_EMBEDDING_PATH, embeddingPath, FlairTokenEmbeddingAnnotator.PARAM_COMPUTATION_FILTER, "de.julielab.jcore.types.Gene"); + final AnalysisEngine engine = AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.fte.desc.jcore-flair-token-embedding-ae", + FlairTokenEmbeddingAnnotator.PARAM_EMBEDDING_PATH, embeddingPath, + FlairTokenEmbeddingAnnotator.PARAM_COMPUTATION_FILTER, "de.julielab.jcore.types.Gene", + FlairTokenEmbeddingAnnotator.PARAM_PYTHON_EXECUTABLE, "python"); engine.process(jCas); diff --git a/jcore-flow-controllers/pom.xml b/jcore-flow-controllers/pom.xml index d17ecac74..fe3e3ff4e 100644 --- a/jcore-flow-controllers/pom.xml +++ b/jcore-flow-controllers/pom.xml @@ -24,7 +24,6 @@ org.junit.jupiter junit-jupiter-engine - test ch.qos.logback diff --git a/jcore-iexml-consumer/pom.xml b/jcore-iexml-consumer/pom.xml index 8924c020c..5d7a199a2 100644 --- a/jcore-iexml-consumer/pom.xml +++ b/jcore-iexml-consumer/pom.xml @@ -77,8 +77,8 @@ 2.6.0-SNAPSHOT - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-iexml-reader/pom.xml 
b/jcore-iexml-reader/pom.xml index 2ce284fda..3d1e90378 100644 --- a/jcore-iexml-reader/pom.xml +++ b/jcore-iexml-reader/pom.xml @@ -78,8 +78,8 @@ 2.6.0-SNAPSHOT - junit - junit + org.junit.jupiter + junit-jupiter-engine Reader for IEXML files as used in the Mantra project/challenge diff --git a/jcore-ign-reader/pom.xml b/jcore-ign-reader/pom.xml index 423a3fbce..8bd754eaa 100644 --- a/jcore-ign-reader/pom.xml +++ b/jcore-ign-reader/pom.xml @@ -35,8 +35,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-ign-reader/src/test/java/de/julielab/jcore/reader/ign/IGNReaderTest.java b/jcore-ign-reader/src/test/java/de/julielab/jcore/reader/ign/IGNReaderTest.java index 11e48e537..e0fdec94c 100644 --- a/jcore-ign-reader/src/test/java/de/julielab/jcore/reader/ign/IGNReaderTest.java +++ b/jcore-ign-reader/src/test/java/de/julielab/jcore/reader/ign/IGNReaderTest.java @@ -19,11 +19,11 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Collection; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class IGNReaderTest { private static final String READER_DESCRIPTOR = "de.julielab.jcore.reader.ign.desc.jcore-ign-reader"; diff --git a/jcore-iob-consumer/pom.xml b/jcore-iob-consumer/pom.xml index e09d8591a..b1a21c3b7 100644 --- a/jcore-iob-consumer/pom.xml +++ b/jcore-iob-consumer/pom.xml @@ -34,8 +34,8 @@ 1.0.7 - junit - junit + org.junit.jupiter + junit-jupiter-engine commons-io diff --git a/jcore-iob-consumer/src/main/java/de/julielab/jcore/consumer/cas2iob/utils/UIMAUtils.java b/jcore-iob-consumer/src/main/java/de/julielab/jcore/consumer/cas2iob/utils/UIMAUtils.java index 3e6affd02..fa06059f6 100644 --- a/jcore-iob-consumer/src/main/java/de/julielab/jcore/consumer/cas2iob/utils/UIMAUtils.java +++ 
b/jcore-iob-consumer/src/main/java/de/julielab/jcore/consumer/cas2iob/utils/UIMAUtils.java @@ -9,6 +9,7 @@ package de.julielab.jcore.consumer.cas2iob.utils; import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.text.AnnotationIndex; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; diff --git a/jcore-iob-consumer/src/test/java/de/julielab/jcore/consumer/cas2iob/main/ToIOBConsumerTest.java b/jcore-iob-consumer/src/test/java/de/julielab/jcore/consumer/cas2iob/main/ToIOBConsumerTest.java index fefa5975a..e1d926452 100644 --- a/jcore-iob-consumer/src/test/java/de/julielab/jcore/consumer/cas2iob/main/ToIOBConsumerTest.java +++ b/jcore-iob-consumer/src/test/java/de/julielab/jcore/consumer/cas2iob/main/ToIOBConsumerTest.java @@ -30,7 +30,7 @@ import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileInputStream; diff --git a/jcore-jnet-ae/pom.xml b/jcore-jnet-ae/pom.xml index ea8a89340..31f7e544b 100644 --- a/jcore-jnet-ae/pom.xml +++ b/jcore-jnet-ae/pom.xml @@ -117,8 +117,8 @@ julielab-java-utilities - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/cli/JNETApplicationTest.java b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/cli/JNETApplicationTest.java index cdfe60693..4cc449a62 100644 --- a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/cli/JNETApplicationTest.java +++ b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/cli/JNETApplicationTest.java @@ -7,11 +7,11 @@ package de.julielab.jcore.ae.jnet.cli; import org.junit.After; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertTrue; public class JNETApplicationTest { private 
static final String PREFIX = "src/test/resources/de/julielab/jcore/ae/jnet/cli/"; diff --git a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/tagger/NETaggerTest.java b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/tagger/NETaggerTest.java index f21a11d09..e05e6a6c1 100644 --- a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/tagger/NETaggerTest.java +++ b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/tagger/NETaggerTest.java @@ -2,7 +2,7 @@ import cc.mallet.types.Instance; import cc.mallet.types.InstanceList; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.InputStream; import java.util.ArrayList; diff --git a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/MiniTestapp.java b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/MiniTestapp.java index 1b1ed323f..006328391 100644 --- a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/MiniTestapp.java +++ b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/MiniTestapp.java @@ -35,8 +35,8 @@ import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.XMLInputSource; import org.apache.uima.util.XMLSerializer; -import org.junit.After; -import org.junit.Test; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; @@ -47,7 +47,7 @@ import java.io.IOException; import java.nio.charset.Charset; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class MiniTestapp { @@ -61,7 +61,7 @@ public class MiniTestapp { private static final String ANNOTATOR_DESC = PREFIX + "EntityAnnotatorTest.xml"; - @After + @AfterEach public void clean() { if (new File(TEST_XMI_OUT).isFile()) { new File(TEST_XMI_OUT).delete(); diff --git a/jcore-jpos-ae/pom.xml b/jcore-jpos-ae/pom.xml index 87cbc7fc5..4f195e62d 100644 --- a/jcore-jpos-ae/pom.xml +++ b/jcore-jpos-ae/pom.xml 
@@ -114,8 +114,8 @@ 2.1.2 - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-jpos-ae/src/test/java/de/julielab/jcore/ae/jpos/postagger/POSAnnotatorTest.java b/jcore-jpos-ae/src/test/java/de/julielab/jcore/ae/jpos/postagger/POSAnnotatorTest.java index c7a03c06d..50c639d51 100644 --- a/jcore-jpos-ae/src/test/java/de/julielab/jcore/ae/jpos/postagger/POSAnnotatorTest.java +++ b/jcore-jpos-ae/src/test/java/de/julielab/jcore/ae/jpos/postagger/POSAnnotatorTest.java @@ -17,9 +17,9 @@ import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class POSAnnotatorTest { diff --git a/jcore-jsbd-ae/pom.xml b/jcore-jsbd-ae/pom.xml index 964b14ef9..e21b02e2b 100644 --- a/jcore-jsbd-ae/pom.xml +++ b/jcore-jsbd-ae/pom.xml @@ -76,7 +76,6 @@ org.assertj assertj-core - 3.9.1 de.julielab @@ -103,6 +102,10 @@ mallet 2.0.8 + + org.apache.commons + commons-lang3 + de.julielab jcore-utilities @@ -112,6 +115,10 @@ de.julielab jcore-descriptor-creator + + org.junit.jupiter + junit-jupiter + JULIE Lab Jena, Germany diff --git a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/Abstract2UnitPipeTest.java b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/Abstract2UnitPipeTest.java index 3d7f63cc7..91ffa9f45 100644 --- a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/Abstract2UnitPipeTest.java +++ b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/Abstract2UnitPipeTest.java @@ -15,8 +15,8 @@ import cc.mallet.types.Token; import cc.mallet.types.TokenSequence; import org.assertj.core.data.Offset; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.List; @@ -27,7 +27,7 @@ public 
class Abstract2UnitPipeTest { protected static Pipe pipe; - @Before + @BeforeEach public void init() { pipe = new Abstract2UnitPipe(false); } diff --git a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/SentenceSplitterTest.java b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/SentenceSplitterTest.java index 8715c714b..a3ce21a17 100644 --- a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/SentenceSplitterTest.java +++ b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/SentenceSplitterTest.java @@ -18,7 +18,7 @@ import cc.mallet.pipe.Pipe; import cc.mallet.types.Instance; import cc.mallet.types.InstanceList; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,8 +26,8 @@ import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * Test for the class {@link SentenceSplitter} diff --git a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java index 5506d38b8..0f0870ae8 100644 --- a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java +++ b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java @@ -34,7 +34,7 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -44,7 +44,7 @@ import java.util.stream.Collectors; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class 
SentenceAnnotatorTest { /** @@ -224,7 +224,7 @@ public void testSentenceDelimiterTypes() throws Exception { while (it.hasNext()) { Annotation sentence = it.next(); Range sentenceRange = Range.between(sentence.getBegin(), sentence.getEnd()); - assertTrue("Range " + sentenceRange + " was not expected", expectedSpans.remove(sentenceRange)); + assertTrue(expectedSpans.remove(sentenceRange), "Range " + sentenceRange + " was not expected"); } assertTrue(expectedSpans.isEmpty()); } diff --git a/jcore-jtbd-ae/pom.xml b/jcore-jtbd-ae/pom.xml index 0c7e7d127..54671bfc1 100644 --- a/jcore-jtbd-ae/pom.xml +++ b/jcore-jtbd-ae/pom.xml @@ -91,8 +91,8 @@ 2.0.8 - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe Token Annotator diff --git a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/TokenizerTest.java b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/TokenizerTest.java index c953307c1..e99c1f2f2 100644 --- a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/TokenizerTest.java +++ b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/TokenizerTest.java @@ -24,7 +24,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,7 +39,7 @@ import java.util.List; import java.util.stream.Collectors; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; /** * Test for the class {@link Tokenizer} diff --git a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotatorTest.java b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotatorTest.java index 4e3dfe9b3..37d8571f9 100644 --- a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotatorTest.java +++ b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotatorTest.java @@ -26,7 +26,7 @@ import 
org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/jcore-julielab-entity-evaluator-consumer/pom.xml b/jcore-julielab-entity-evaluator-consumer/pom.xml index 35ae8b960..e0e543814 100644 --- a/jcore-julielab-entity-evaluator-consumer/pom.xml +++ b/jcore-julielab-entity-evaluator-consumer/pom.xml @@ -45,8 +45,8 @@ julielab-java-utilities - junit - junit + org.junit.jupiter + junit-jupiter-engine org.apache.commons diff --git a/jcore-julielab-entity-evaluator-consumer/src/test/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumerTest.java b/jcore-julielab-entity-evaluator-consumer/src/test/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumerTest.java index 69010da56..ca29657b9 100644 --- a/jcore-julielab-entity-evaluator-consumer/src/test/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumerTest.java +++ b/jcore-julielab-entity-evaluator-consumer/src/test/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumerTest.java @@ -22,7 +22,7 @@ import org.apache.uima.jcas.cas.DoubleArray; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.cas.StringArray; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.ByteArrayInputStream; import java.io.File; @@ -34,8 +34,8 @@ import java.util.zip.GZIPInputStream; import static de.julielab.jcore.consumer.entityevaluator.EntityEvaluatorConsumer.*; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class EntityEvaluatorConsumerTest { diff --git a/jcore-likelihood-assignment-ae/pom.xml b/jcore-likelihood-assignment-ae/pom.xml index 
d053fef46..0ab512b9b 100644 --- a/jcore-likelihood-assignment-ae/pom.xml +++ b/jcore-likelihood-assignment-ae/pom.xml @@ -33,8 +33,8 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe Likelihood Assignment AE diff --git a/jcore-likelihood-assignment-ae/src/test/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotatorTest.java b/jcore-likelihood-assignment-ae/src/test/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotatorTest.java index 5caf84f55..6fe9746f5 100644 --- a/jcore-likelihood-assignment-ae/src/test/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotatorTest.java +++ b/jcore-likelihood-assignment-ae/src/test/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotatorTest.java @@ -12,14 +12,14 @@ import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.Iterator; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; /** diff --git a/jcore-likelihood-detection-ae/pom.xml b/jcore-likelihood-detection-ae/pom.xml index eb4aaa51e..1bee1538d 100644 --- a/jcore-likelihood-detection-ae/pom.xml +++ b/jcore-likelihood-detection-ae/pom.xml @@ -42,8 +42,8 @@ julielab-java-utilities - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe Likelihood Detection AE diff --git a/jcore-likelihood-detection-ae/src/test/java/de/julielab/jcore/ae/likelihooddetection/LikelihoodDetectionAnnotatorTest.java b/jcore-likelihood-detection-ae/src/test/java/de/julielab/jcore/ae/likelihooddetection/LikelihoodDetectionAnnotatorTest.java index 864b0c431..814ce9755 100644 --- 
a/jcore-likelihood-detection-ae/src/test/java/de/julielab/jcore/ae/likelihooddetection/LikelihoodDetectionAnnotatorTest.java +++ b/jcore-likelihood-detection-ae/src/test/java/de/julielab/jcore/ae/likelihooddetection/LikelihoodDetectionAnnotatorTest.java @@ -11,7 +11,7 @@ import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -19,7 +19,7 @@ import java.util.ArrayList; import java.util.Iterator; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; /** diff --git a/jcore-line-multiplier/pom.xml b/jcore-line-multiplier/pom.xml index f81a228ca..2bd30a4d4 100644 --- a/jcore-line-multiplier/pom.xml +++ b/jcore-line-multiplier/pom.xml @@ -29,8 +29,8 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine org.assertj diff --git a/jcore-line-multiplier/src/test/java/de/julielab/jcore/multiplier/line/LineMultiplierTest.java b/jcore-line-multiplier/src/test/java/de/julielab/jcore/multiplier/line/LineMultiplierTest.java index 23b7e9ea3..5ecd2c19a 100644 --- a/jcore-line-multiplier/src/test/java/de/julielab/jcore/multiplier/line/LineMultiplierTest.java +++ b/jcore-line-multiplier/src/test/java/de/julielab/jcore/multiplier/line/LineMultiplierTest.java @@ -5,13 +5,13 @@ import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.List; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * Unit tests for jcore-line-multiplier. 
*/ diff --git a/jcore-line-multiplier/target/test-classes/de/julielab/jcore/multiplier/line/LineMultiplierTest.class b/jcore-line-multiplier/target/test-classes/de/julielab/jcore/multiplier/line/LineMultiplierTest.class index e654ed056e0b31d6e11df028b557de2de00bdb72..f32ad510b6fe0df92dcaa8afe7ba2bbfd652919f 100644 GIT binary patch delta 64 zcmX>seqVe;A(Oa_PkvFlepYE-W(kli$Sg@M(oZbN++4#nkwr)WLvHe1cFD~e?6X+_ D1M?S4 delta 47 vcmcaFepq}%ArrTNPkvFlepYE-X36Fbrim*V{vh6QORa2_N6QUm>>|& diff --git a/jcore-lingpipe-porterstemmer-ae/pom.xml b/jcore-lingpipe-porterstemmer-ae/pom.xml index 6df6ba486..615c960a0 100644 --- a/jcore-lingpipe-porterstemmer-ae/pom.xml +++ b/jcore-lingpipe-porterstemmer-ae/pom.xml @@ -22,8 +22,8 @@ 4.1.2-JL1.0 - junit - junit + org.junit.jupiter + junit-jupiter-engine https://github.com/JULIELab/jcore-base/tree/master/jcore-lingpipe-porterstemmer-ae diff --git a/jcore-lingpipe-porterstemmer-ae/src/test/java/de/julielab/jcore/ae/lingpipe/porterstemmer/LingpipePorterstemmerAnnotatorTest.java b/jcore-lingpipe-porterstemmer-ae/src/test/java/de/julielab/jcore/ae/lingpipe/porterstemmer/LingpipePorterstemmerAnnotatorTest.java index 58eb08a15..5bc2d85dd 100644 --- a/jcore-lingpipe-porterstemmer-ae/src/test/java/de/julielab/jcore/ae/lingpipe/porterstemmer/LingpipePorterstemmerAnnotatorTest.java +++ b/jcore-lingpipe-porterstemmer-ae/src/test/java/de/julielab/jcore/ae/lingpipe/porterstemmer/LingpipePorterstemmerAnnotatorTest.java @@ -16,10 +16,10 @@ import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class LingpipePorterstemmerAnnotatorTest { @Test diff --git a/jcore-lingpipegazetteer-ae/pom.xml 
b/jcore-lingpipegazetteer-ae/pom.xml index 686f9ae80..3941b37bd 100644 --- a/jcore-lingpipegazetteer-ae/pom.xml +++ b/jcore-lingpipegazetteer-ae/pom.xml @@ -52,15 +52,14 @@ org.apache.commons commons-lang3 - 3.4 org.assertj assertj-core - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/StringNormalizerForChunkingTest.java b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/StringNormalizerForChunkingTest.java index a1bbadf8c..06cc79ca0 100644 --- a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/StringNormalizerForChunkingTest.java +++ b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/StringNormalizerForChunkingTest.java @@ -9,11 +9,11 @@ import de.julielab.jcore.ae.lingpipegazetteer.utils.StringNormalizerForChunking; import de.julielab.jcore.ae.lingpipegazetteer.utils.StringNormalizerForChunking.NormalizedString; import org.apache.commons.lang3.Range; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.List; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class StringNormalizerForChunkingTest { @Test @@ -23,25 +23,25 @@ public void testTextNormalization() { term = "\"Call\" - postponed"; ns = StringNormalizerForChunking.normalizeString(term); - assertEquals("Term normalization was not correct", "Call postponed", ns.string); + assertEquals( "Call postponed", ns.string, "Term normalization was not correct"); term = "\"Light-for-dates\" with signs of fetal malnutrition, 1,000-1,249 grams"; ns = StringNormalizerForChunking.normalizeString(term); - assertEquals("Term normalization was not correct", - "Lightfordates with signs of fetal malnutrition 10001249 grams", ns.string); + assertEquals("Lightfordates with signs of fetal malnutrition 10001249 grams", + ns.string, "Term 
normalization was not correct"); term = "#Tarsal &/or metatarsal bones"; ns = StringNormalizerForChunking.normalizeString(term); - assertEquals("Term normalization was not correct", "Tarsal or metatarsal bones", ns.string); + assertEquals( "Tarsal or metatarsal bones", ns.string, "Term normalization was not correct"); term = "% it = jCas.getAnnotationIndex(EntityMention.type).iterator(); - assertTrue("There are no entity annotations in the CAS.", it.hasNext()); + assertTrue(it.hasNext(), "There are no entity annotations in the CAS."); EntityMention em = (EntityMention) it.next(); - assertEquals("Start wrong: ", new Integer(0), new Integer(em.getBegin())); - assertEquals("End wrong: ", new Integer(5), new Integer(em.getEnd())); - assertEquals("Wrong type: ", "SHP-1", em.getSpecificType()); + assertEquals( new Integer(0), new Integer(em.getBegin()), "Start wrong: "); + assertEquals( new Integer(5), new Integer(em.getEnd()), "End wrong: "); + assertEquals( "SHP-1", em.getSpecificType(), "Wrong type: "); - assertTrue("The secnond entity annotations is missing.", it.hasNext()); + assertTrue(it.hasNext(), "The secnond entity annotations is missing."); em = (EntityMention) it.next(); - assertEquals("Start wrong: ", new Integer(10), new Integer(em.getBegin())); - assertEquals("End wrong: ", new Integer(45), new Integer(em.getEnd())); - assertEquals("Wrong type: ", "KLRG2", em.getSpecificType()); + assertEquals( new Integer(10), new Integer(em.getBegin()), "Start wrong: "); + assertEquals( new Integer(45), new Integer(em.getEnd()), "End wrong: "); + assertEquals( "KLRG2", em.getSpecificType(), "Wrong type: "); - assertFalse("There are too many annotations.", it.hasNext()); + assertFalse(it.hasNext(), "There are too many annotations."); jCas.reset(); jCas.setDocumentText( @@ -314,13 +316,13 @@ public void testAnnotatorWithTextNormalization() annotator.process(jCas); it = jCas.getAnnotationIndex(EntityMention.type).iterator(); - assertTrue("There are no entity annotations in 
the CAS.", it.hasNext()); + assertTrue(it.hasNext(), "There are no entity annotations in the CAS."); em = (EntityMention) it.next(); - assertEquals("Start wrong: ", new Integer(17), new Integer(em.getBegin())); - assertEquals("End wrong: ", new Integer(103), new Integer(em.getEnd())); - assertEquals("Wrong type: ", "CHEM", em.getSpecificType()); + assertEquals( new Integer(17), new Integer(em.getBegin()), "Start wrong: "); + assertEquals( new Integer(103), new Integer(em.getEnd()), "End wrong: "); + assertEquals( "CHEM", em.getSpecificType(), "Wrong type: "); - assertFalse("There are too many annotations.", it.hasNext()); + assertFalse(it.hasNext(), "There are too many annotations."); jCas.reset(); jCas.setDocumentText( @@ -328,13 +330,13 @@ public void testAnnotatorWithTextNormalization() annotator.process(jCas); it = jCas.getAnnotationIndex(EntityMention.type).iterator(); - assertTrue("There are no entity annotations in the CAS.", it.hasNext()); + assertTrue(it.hasNext(), "There are no entity annotations in the CAS."); em = (EntityMention) it.next(); - assertEquals("Start wrong: ", new Integer(17), new Integer(em.getBegin())); - assertEquals("End wrong: ", new Integer(103), new Integer(em.getEnd())); - assertEquals("Wrong type: ", "CHEM", em.getSpecificType()); + assertEquals( new Integer(17), new Integer(em.getBegin()), "Start wrong: "); + assertEquals( new Integer(103), new Integer(em.getEnd()), "End wrong: "); + assertEquals( "CHEM", em.getSpecificType(), "Wrong type: "); - assertFalse("There are too many annotations.", it.hasNext()); + assertFalse(it.hasNext(), "There are too many annotations."); jCas.reset(); jCas.setDocumentText( @@ -342,7 +344,7 @@ public void testAnnotatorWithTextNormalization() annotator.process(jCas); it = jCas.getAnnotationIndex(EntityMention.type).iterator(); - assertFalse("There is an annotation in CAS although there shouldnt be.", it.hasNext()); + assertFalse(it.hasNext(), "There is an annotation in CAS although there shouldnt 
be."); jCas.reset(); jCas.setDocumentText("Test-dosing unit KLRg1 killer cell lectin like receptor G2 Parkinson's Disease"); @@ -354,7 +356,7 @@ public void testAnnotatorWithTextNormalization() System.out.println(it.next().getCoveredText()); counter++; } - assertEquals("Wrong entity count: ", new Integer(4), counter); + assertEquals( new Integer(4), counter, "Wrong entity count: "); } @@ -378,10 +380,10 @@ public void testAnnotatorWithPluralNormalization() annotator.process(jCas); Collection entityMentions = JCasUtil.select(jCas, EntityMention.class); - assertEquals("Expected a single entity", 2, entityMentions.size()); + assertEquals( 2, entityMentions.size(), "Expected a single entity"); Iterator iterator = entityMentions.iterator(); - assertEquals("Unexpected covered entity text", "lipoprotein", iterator.next().getCoveredText()); - assertEquals("Unexpected covered entity text", "lipoproteins", iterator.next().getCoveredText()); + assertEquals( "lipoprotein", iterator.next().getCoveredText(), "Unexpected covered entity text"); + assertEquals( "lipoproteins", iterator.next().getCoveredText(), "Unexpected covered entity text"); } @Test @@ -416,7 +418,7 @@ public void testAnnotateAcronymsWithFullFormEntity() throws Exception { it.next(); counter++; } - assertEquals("Wrong entity count: ", new Integer(1), counter); + assertEquals( new Integer(1), counter, "Wrong entity count: "); jCas.reset(); jCas.setDocumentText( @@ -454,7 +456,7 @@ public void testAnnotateAcronymsWithFullFormEntity() throws Exception { } assertEquals("GENE", next.getSpecificType()); } - assertEquals("Wrong entity count: ", new Integer(1), counter); + assertEquals( new Integer(1), counter, "Wrong entity count: "); } @Test @@ -599,9 +601,9 @@ public void testGroupOvecrlappingChunks() { assertEquals(1, bestChunkList.size()); Chunk bestChunk = bestChunkList.get(0); assertFalse( + bestChunks.contains(bestChunk), "Duplicate best chunk: " + bestChunk + " (\"" - + 
chunkedText.subSequence(bestChunk.start(), bestChunk.end()) + "\")", - bestChunks.contains(bestChunk)); + + chunkedText.subSequence(bestChunk.start(), bestChunk.end()) + "\")"); bestChunks.add(bestChunk); } } diff --git a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/OverlappingChunkTest.java b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/OverlappingChunkTest.java index 078f62ecb..c700ff26f 100644 --- a/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/OverlappingChunkTest.java +++ b/jcore-lingpipegazetteer-ae/src/test/java/de/julielab/jcore/ae/lingpipegazetteer/uima/OverlappingChunkTest.java @@ -3,12 +3,12 @@ import com.aliasi.chunk.Chunk; import com.aliasi.chunk.ChunkFactory; import de.julielab.jcore.ae.lingpipegazetteer.chunking.OverlappingChunk; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class OverlappingChunkTest { @Test diff --git a/jcore-lingscope-ae/pom.xml b/jcore-lingscope-ae/pom.xml index 4c5a15b41..60cdb8dd6 100644 --- a/jcore-lingscope-ae/pom.xml +++ b/jcore-lingscope-ae/pom.xml @@ -47,8 +47,8 @@ jcore-descriptor-creator - junit - junit + org.junit.jupiter + junit-jupiter-engine de.julielab diff --git a/jcore-lingscope-ae/src/main/java/de/julielab/jcore/ae/lingscope/LingscopePosAnnotator.java b/jcore-lingscope-ae/src/main/java/de/julielab/jcore/ae/lingscope/LingscopePosAnnotator.java index ff34b56ad..aaaae3656 100644 --- a/jcore-lingscope-ae/src/main/java/de/julielab/jcore/ae/lingscope/LingscopePosAnnotator.java +++ b/jcore-lingscope-ae/src/main/java/de/julielab/jcore/ae/lingscope/LingscopePosAnnotator.java @@ -22,8 +22,8 @@ import java.io.File; import java.io.IOException; -import java.util.*; import java.util.List; +import java.util.*; import 
java.util.function.Supplier; import java.util.stream.Collectors; diff --git a/jcore-lingscope-ae/src/test/java/de/julielab/LingscopePosAnnotatorTest.java b/jcore-lingscope-ae/src/test/java/de/julielab/LingscopePosAnnotatorTest.java index 1e5d75496..7089675df 100644 --- a/jcore-lingscope-ae/src/test/java/de/julielab/LingscopePosAnnotatorTest.java +++ b/jcore-lingscope-ae/src/test/java/de/julielab/LingscopePosAnnotatorTest.java @@ -8,7 +8,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.stream.Collectors; diff --git a/jcore-linnaeus-species-ae/pom.xml b/jcore-linnaeus-species-ae/pom.xml index 68c29ba14..354a3c751 100644 --- a/jcore-linnaeus-species-ae/pom.xml +++ b/jcore-linnaeus-species-ae/pom.xml @@ -41,8 +41,8 @@ jcore-descriptor-creator - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-linnaeus-species-ae/src/main/java/de/julielab/jcore/ae/linnaeus/LinnaeusMatcherProvider.java b/jcore-linnaeus-species-ae/src/main/java/de/julielab/jcore/ae/linnaeus/LinnaeusMatcherProvider.java index bdccc500e..0bf56eb18 100644 --- a/jcore-linnaeus-species-ae/src/main/java/de/julielab/jcore/ae/linnaeus/LinnaeusMatcherProvider.java +++ b/jcore-linnaeus-species-ae/src/main/java/de/julielab/jcore/ae/linnaeus/LinnaeusMatcherProvider.java @@ -1,7 +1,5 @@ package de.julielab.jcore.ae.linnaeus; -import org.apache.uima.resource.DataResource; -import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.SharedResourceObject; import uk.ac.man.entitytagger.matching.Matcher; diff --git a/jcore-linnaeus-species-ae/src/test/java/de/julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotatorTest.java b/jcore-linnaeus-species-ae/src/test/java/de/julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotatorTest.java index 58a46dec9..16bcd3e2c 100644 --- 
a/jcore-linnaeus-species-ae/src/test/java/de/julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotatorTest.java +++ b/jcore-linnaeus-species-ae/src/test/java/de/julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotatorTest.java @@ -20,11 +20,10 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ExternalResourceDescription; -import org.apache.uima.resource.metadata.ExternalResourceBinding; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class LinnaeusSpeciesAnnotatorTest { @Test diff --git a/jcore-medxn-ae/pom.xml b/jcore-medxn-ae/pom.xml index aac277c21..0eaff3697 100644 --- a/jcore-medxn-ae/pom.xml +++ b/jcore-medxn-ae/pom.xml @@ -25,8 +25,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-medxn-ae/src/test/java/de/julielab/jcore/ae/medxn/MedAttrAnnotatorTest.java b/jcore-medxn-ae/src/test/java/de/julielab/jcore/ae/medxn/MedAttrAnnotatorTest.java index 110de0875..4f4e08302 100644 --- a/jcore-medxn-ae/src/test/java/de/julielab/jcore/ae/medxn/MedAttrAnnotatorTest.java +++ b/jcore-medxn-ae/src/test/java/de/julielab/jcore/ae/medxn/MedAttrAnnotatorTest.java @@ -21,16 +21,17 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.JFSIndexRepository; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; +import static org.junit.jupiter.api.Assertions.assertTrue; + public class MedAttrAnnotatorTest { private static final String AE_DESCRIPTOR = 
"de.julielab.jcore.ae.medxn.desc.jcore-medxn-ae-attributes-german"; @@ -66,11 +67,11 @@ private void check(String[] goldlines, JCas tcas) { Boolean lengthEqual = (goldlines.length == menCount); - Assert.assertTrue("Expression count differs; should be '" + - Integer.toString(goldlines.length) + "' but is '" + menCount.toString() +"'.", - lengthEqual); + assertTrue(lengthEqual, + "Expression count differs; should be '" + + goldlines.length + "' but is '" + menCount.toString() +"'."); Boolean arrayEqual = (goldlines.equals(actLines.toArray(new String[actLines.size()]))); - Assert.assertTrue("Expressions differ", arrayEqual); + assertTrue(arrayEqual, "Expressions differ"); } private void reset() { @@ -78,7 +79,7 @@ private void reset() { } - @Before + @BeforeEach public void initializeComponents() throws IOException, UIMAException { if (setUpIsDone) { return; @@ -90,7 +91,7 @@ public void initializeComponents() throws IOException, UIMAException { setUpIsDone = true; } - @Ignore + @Disabled @Test public void testDuration() { String text; @@ -113,7 +114,7 @@ public void testDuration() { } } - @Ignore + @Disabled @Test public void testDose() { String text; @@ -136,7 +137,7 @@ public void testDose() { } } - @Ignore + @Disabled @Test public void testFrequency() { String text; @@ -159,7 +160,7 @@ public void testFrequency() { } } - @Ignore + @Disabled @Test public void testModus() { String text; diff --git a/jcore-msdoc-reader/pom.xml b/jcore-msdoc-reader/pom.xml index 74d9d3daa..c162caa94 100644 --- a/jcore-msdoc-reader/pom.xml +++ b/jcore-msdoc-reader/pom.xml @@ -46,8 +46,8 @@ 3.16 - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-msdoc-reader/src/test/java/de/julielab/jcore/reader/msdoc/main/MSdocReaderTest.java b/jcore-msdoc-reader/src/test/java/de/julielab/jcore/reader/msdoc/main/MSdocReaderTest.java index d3945a6db..68942199c 100644 --- a/jcore-msdoc-reader/src/test/java/de/julielab/jcore/reader/msdoc/main/MSdocReaderTest.java +++ 
b/jcore-msdoc-reader/src/test/java/de/julielab/jcore/reader/msdoc/main/MSdocReaderTest.java @@ -27,17 +27,17 @@ import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class MSdocReaderTest { /** @@ -69,7 +69,7 @@ public class MSdocReaderTest { private static final String DOC_DUMMY_NAME = "dummy.doc"; private static final String DOC_DUMMY_FILE = "src/test/resources/" + DOC_DUMMY_NAME; - @BeforeClass + @BeforeAll public static void setUp() throws Exception { /** * Create dummies of *.doc-files. @@ -161,7 +161,7 @@ private static void writeArtifact(String file_name) throws IOException { } } - @AfterClass + @AfterAll public static void tearDown() throws Exception { /** * Delete dummies from setUp. 
diff --git a/jcore-mstparser-ae/pom.xml b/jcore-mstparser-ae/pom.xml index 83f9017af..08d948e99 100644 --- a/jcore-mstparser-ae/pom.xml +++ b/jcore-mstparser-ae/pom.xml @@ -80,8 +80,8 @@ provided - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-mstparser-ae/src/test/java/de/julielab/jcore/ae/mstparser/main/MSTParserTest.java b/jcore-mstparser-ae/src/test/java/de/julielab/jcore/ae/mstparser/main/MSTParserTest.java index 297b93cb5..46a6fe3a9 100644 --- a/jcore-mstparser-ae/src/test/java/de/julielab/jcore/ae/mstparser/main/MSTParserTest.java +++ b/jcore-mstparser-ae/src/test/java/de/julielab/jcore/ae/mstparser/main/MSTParserTest.java @@ -19,7 +19,6 @@ import de.julielab.jcore.types.DependencyRelation; import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; @@ -36,7 +35,8 @@ import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; -import org.junit.Ignore; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; @@ -46,12 +46,15 @@ import java.io.FileOutputStream; import java.io.IOException; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + /** * This is the JUnit test for the MST Parser Annotator. 
* * @author Lichtenwald */ -public class MSTParserTest extends TestCase { +public class MSTParserTest { private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties"; public static final String PARAM_MAX_NUM_TOKENS = "MaxNumTokens"; @@ -68,7 +71,7 @@ public class MSTParserTest extends TestCase { /*--------------------------------------------------------------------------------------------*/ - @Ignore + @Disabled // public void testCAS() throws Exception { // // String[] heads = new String[] { "have", "Migrants", "drown", "coast", "off", "40", "40", "migrants", "have", // // "have", "drowned", "Sea", "Sea", "in", "drowned", "coast", "coast", "off", "coast", "of", "drowned", @@ -174,6 +177,7 @@ public class MSTParserTest extends TestCase { // jcas.reset(); // } // of initCas + @Test public void testThreads() throws Exception { try { int count = 3; @@ -188,7 +192,7 @@ public void testThreads() throws Exception { x.run(); Thread.sleep(5000); } catch (RuntimeException e) { - fail("Errorin Threads"); + fail("Error in Threads"); } } @@ -230,6 +234,7 @@ public void testThreads() throws Exception { * @throws AnalysisEngineProcessException * @throws SAXException */ + @Test public void testProcess() throws IOException, InvalidXMLException, ResourceInitializationException, CASException, AnalysisEngineProcessException, SAXException { XMLInputSource descriptor = new XMLInputSource(DESCRIPTOR_MST_PARSER); @@ -245,9 +250,10 @@ public void testProcess() throws IOException, InvalidXMLException, ResourceIniti FileOutputStream fos = new FileOutputStream(OUTPUT_DIR + File.separator + "test.xmi"); XmiCasSerializer.serialize(jcas.getCas(), fos); - assertTrue("Invalid JCas!", checkAnnotations(jcas, null)); + assertTrue(checkAnnotations(jcas, null), "Invalid JCas!"); } // of testProcess + @Test public void testProcessWithNumTokensRestriction() throws IOException, InvalidXMLException, ResourceInitializationException, CASException, AnalysisEngineProcessException, 
SAXException, ResourceConfigurationException { @@ -263,7 +269,7 @@ public void testProcessWithNumTokensRestriction() ae.process(jcas); FileOutputStream fos = new FileOutputStream(OUTPUT_DIR + File.separator + "test.xmi"); XmiCasSerializer.serialize(jcas.getCas(), fos); - assertTrue("Invalid JCas!", checkAnnotations(jcas, MAX_NUM_TOKENS)); + assertTrue(checkAnnotations(jcas, MAX_NUM_TOKENS), "Invalid JCas!"); } /** diff --git a/jcore-muc7-reader/pom.xml b/jcore-muc7-reader/pom.xml index a1461b459..b06e1cbbf 100644 --- a/jcore-muc7-reader/pom.xml +++ b/jcore-muc7-reader/pom.xml @@ -22,8 +22,8 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-muc7-reader/scripts/muc7_SGML2XML.py b/jcore-muc7-reader/scripts/muc7_SGML2XML.py index b015b9342..9dbed485a 100644 --- a/jcore-muc7-reader/scripts/muc7_SGML2XML.py +++ b/jcore-muc7-reader/scripts/muc7_SGML2XML.py @@ -5,9 +5,7 @@ # - `

`: needs to be closed with `

` import re -import os import sys -import glob def close_paragraphs(line): diff --git a/jcore-muc7-reader/src/test/java/de/julielab/jcore/reader/muc7/MUC7ReaderTest.java b/jcore-muc7-reader/src/test/java/de/julielab/jcore/reader/muc7/MUC7ReaderTest.java index 77f12db5e..b2e97da26 100644 --- a/jcore-muc7-reader/src/test/java/de/julielab/jcore/reader/muc7/MUC7ReaderTest.java +++ b/jcore-muc7-reader/src/test/java/de/julielab/jcore/reader/muc7/MUC7ReaderTest.java @@ -11,7 +11,6 @@ import de.julielab.jcore.types.muc7.ENAMEX; import de.julielab.jcore.types.muc7.NUMEX; import de.julielab.jcore.types.muc7.TIMEX; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.metadata.AnalysisEngineMetaData; import org.apache.uima.cas.CAS; @@ -24,6 +23,8 @@ import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import org.xml.sax.SAXException; import javax.xml.parsers.ParserConfigurationException; @@ -31,7 +32,9 @@ import java.util.ArrayList; import java.util.Iterator; -public class MUC7ReaderTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class MUC7ReaderTest { /** * Path to the MedlineReader descriptor */ @@ -40,12 +43,11 @@ public class MUC7ReaderTest extends TestCase { /** * Object to be tested */ - private CollectionReader muc7Reader; + private static CollectionReader muc7Reader; + private static CAS cas; + - - private CAS cas; - /** * Test data */ @@ -87,12 +89,11 @@ public class MUC7ReaderTest extends TestCase { /** * * CAS array with CAS objects that where processed by the muc7Reader */ - private ArrayList cases = new ArrayList(); + private static ArrayList cases = new ArrayList(); - @Override - protected void setUp() throws Exception { - super.setUp(); + @BeforeAll + protected static void setUp() throws Exception { 
muc7Reader = produceCollectionReader(MUC7_READER_DESCRIPTOR); processAllCases(); } @@ -105,7 +106,7 @@ protected void setUp() throws Exception { * @throws SAXException * @throws ParserConfigurationException */ - private void processAllCases() throws CASException, SAXException, ParserConfigurationException { + private static void processAllCases() throws CASException, SAXException, ParserConfigurationException { try { while (muc7Reader.hasNext()) { cas = CasCreationUtils.createCas((AnalysisEngineMetaData) muc7Reader.getMetaData()); @@ -123,20 +124,21 @@ private void processAllCases() throws CASException, SAXException, ParserConfigur } /** * Test if method getNextCas() has done its job - */ + */ + @Test public void testGetNextCas() { //check for a TIMEX entity String[] timexData = getTimexData(DOC_ID); - assertTrue("TIMEX", checkTimex(timexData)); + assertTrue(checkTimex(timexData), "TIMEX"); //check for a ENAMEX entity String[] enamexData = getEnamexData(DOC_ID); - assertTrue("ENAMEX", checkEnamex(enamexData)); + assertTrue(checkEnamex(enamexData), "ENAMEX"); //check for a NUMEX entity String[] numexData = getNumexData(DOC_ID); - assertTrue("NUMEX", checkNumex(numexData)); + assertTrue(checkNumex(numexData), "NUMEX"); //TODO coreference doesn't works as of now //check for a coref chain @@ -337,7 +339,7 @@ private void buildCorefChain(int corefID, ArrayList corefChain, JCas jca /** * Gets an Iterator over the the CAS for the specific type * - * @param cas (the CAS) + * @param jcas (the CAS) * @param type (the type) * @return the iterator */ @@ -371,7 +373,7 @@ private String[] toStringArray(ArrayList stringArray) { * @throws InvalidXMLException * @throws ResourceInitializationException */ - private CollectionReader produceCollectionReader(String descriptor) throws InvalidXMLException, IOException, ResourceInitializationException { + private static CollectionReader produceCollectionReader(String descriptor) throws InvalidXMLException, IOException, 
ResourceInitializationException { CollectionReader collectionReader; ResourceSpecifier spec; spec = UIMAFramework.getXMLParser().parseResourceSpecifier(new XMLInputSource(descriptor)); diff --git a/jcore-mutationfinder-ae/pom.xml b/jcore-mutationfinder-ae/pom.xml index 62b3a5d5b..b6d707627 100644 --- a/jcore-mutationfinder-ae/pom.xml +++ b/jcore-mutationfinder-ae/pom.xml @@ -23,8 +23,8 @@ 2.0.8 - junit - junit + org.junit.jupiter + junit-jupiter-engine de.julielab @@ -35,7 +35,16 @@ de.julielab jcore-descriptor-creator + + org.slf4j + slf4j-api + + + org.assertj + assertj-core + + BSD-2-Clause diff --git a/jcore-mutationfinder-ae/src/test/java/de/julielab/jcore/ae/mutationfinder/MutationAnnotatorTest.java b/jcore-mutationfinder-ae/src/test/java/de/julielab/jcore/ae/mutationfinder/MutationAnnotatorTest.java index 5291c51fa..c877fdc14 100644 --- a/jcore-mutationfinder-ae/src/test/java/de/julielab/jcore/ae/mutationfinder/MutationAnnotatorTest.java +++ b/jcore-mutationfinder-ae/src/test/java/de/julielab/jcore/ae/mutationfinder/MutationAnnotatorTest.java @@ -6,12 +6,11 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.hamcrest.CoreMatchers; -import org.junit.Assert; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Collection; +import static org.assertj.core.api.Assertions.assertThat; public class MutationAnnotatorTest { @Test @@ -21,8 +20,8 @@ public void testAnnotator() throws Exception { jCas.setDocumentText("A covalently bound catalytic intermediate in Escherichia coli asparaginase: crystal structure of a Thr-89-Val mutant."); annotator.process(jCas); final Collection mutations = JCasUtil.select(jCas, PointMutation.class); - Assert.assertThat(mutations.size(), CoreMatchers.is(1)); - Assert.assertThat(mutations.stream().findAny().get().getCoveredText(), CoreMatchers.equalTo("Thr-89-Val")); - 
Assert.assertThat(mutations.stream().findAny().get().getSpecificType(), CoreMatchers.equalTo("T89V")); + assertThat(mutations).hasSize(1); + assertThat(mutations.stream().findAny().get().getCoveredText()).isEqualTo("Thr-89-Val"); + assertThat(mutations.stream().findAny().get().getSpecificType()).isEqualTo("T89V"); } } diff --git a/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/MutationFinderTest.java b/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/MutationFinderTest.java index 51aa04218..4bc918ef2 100644 --- a/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/MutationFinderTest.java +++ b/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/MutationFinderTest.java @@ -1,24 +1,27 @@ package edu.uchsc.ccp.nlp.ei.mutation; -import junit.framework.TestCase; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileInputStream; import java.util.*; +import static org.junit.jupiter.api.Assertions.assertEquals; + /* * Copyright (c) 2007 Regents of the University of Colorado * Please refer to the licensing agreement at MUTATIONFINDER_HOME/doc/license.txt */ -public class MutationFinderTest extends TestCase { +public class MutationFinderTest { - private List regularExpressions; + private static List regularExpressions; - private MutationFinder mf; + private static MutationFinder mf; - @Override - protected void setUp() throws Exception { + @BeforeAll + protected static void setUp() { /* The first four default regular expressions */ regularExpressions = new ArrayList(); regularExpressions @@ -32,8 +35,6 @@ protected void setUp() throws Exception { .add("(^|[\\s\\(\\[\\'\"/,\\-])(?P(CYS|ILE|SER|GLN|MET|ASN|PRO|LYS|ASP|THR|PHE|ALA|GLY|HIS|LEU|ARG|TRP|VAL|GLU|TYR)|(GLUTAMINE|GLUTAMIC ACID|LEUCINE|VALINE|ISOLEUCINE|LYSINE|ALANINE|GLYCINE|ASPARTATE|METHIONINE|THREONINE|HISTIDINE|ASPARTIC 
ACID|ARGININE|ASPARAGINE|TRYPTOPHAN|PROLINE|PHENYLALANINE|CYSTEINE|SERINE|GLUTAMATE|TYROSINE))(?P[1-9][0-9]*) to (?P(CYS|ILE|SER|GLN|MET|ASN|PRO|LYS|ASP|THR|PHE|ALA|GLY|HIS|LEU|ARG|TRP|VAL|GLU|TYR)|(GLUTAMINE|GLUTAMIC ACID|LEUCINE|VALINE|ISOLEUCINE|LYSINE|ALANINE|GLYCINE|ASPARTATE|METHIONINE|THREONINE|HISTIDINE|ASPARTIC ACID|ARGININE|ASPARAGINE|TRYPTOPHAN|PROLINE|PHENYLALANINE|CYSTEINE|SERINE|GLUTAMATE|TYROSINE))(?=([.,\\s)\\]\\'\":;\\-?!/]|$))"); mf = new MutationFinder(new HashSet(regularExpressions)); - - super.setUp(); } /** @@ -41,6 +42,7 @@ protected void setUp() throws Exception { * * @throws Exception */ + @Test public void testConstructor() throws Exception { mf = new MutationFinder(new HashSet()); mf = new MutationFinder(new HashSet(regularExpressions)); @@ -62,6 +64,7 @@ public void testConstructor() throws Exception { * * @throws Exception */ + @Test public void testExtractMappingsFromPythonRegex() throws Exception { Map groupMappings = MutationFinder.extractMappingsFromPythonRegex(regularExpressions.get(0)); assertEquals(new Integer(2), groupMappings.get(MutationFinder.WT_RES)); @@ -80,6 +83,7 @@ public void testExtractMappingsFromPythonRegex() throws Exception { * * @throws Exception */ + @Test public void testRemoveTagsFromPythonRegex() throws Exception { String regex0WithoutTags = "(^|[\\s\\(\\[\\'\"/,\\-])([CISQMNPKDTFAGHLRWVEY])([1-9][0-9]+)([CISQMNPKDTFAGHLRWVEY])(?=([.,\\s)\\]\\'\":;\\-?!/]|$))[CASE_SENSITIVE]"; assertEquals(regex0WithoutTags, MutationFinder.removeTagsFromPythonRegex(regularExpressions.get(0))); @@ -95,6 +99,7 @@ public void testRemoveTagsFromPythonRegex() throws Exception { * * @throws Exception */ + @Test public void testExtractionNoMutations() throws Exception { Map> mutations = mf.extractMutations(""); assertEquals(0, mutations.size()); @@ -117,6 +122,7 @@ public void testExtractionNoMutations() throws Exception { * * @throws Exception */ + @Test public void testExtractSingleMutation() throws Exception { Map> mutations = 
mf.extractMutations("S42T"); Set expectedPMs = new HashSet(); @@ -141,6 +147,7 @@ public void testExtractSingleMutation() throws Exception { * * @throws Exception */ + @Test public void testExtractMultipleMutations() throws Exception { Map> mutations = mf.extractMutations("S42T and W36Y"); Set expectedPMs = new HashSet(); @@ -173,6 +180,7 @@ public void testExtractMultipleMutations() throws Exception { * * @throws Exception */ + @Test public void testExtractMultipleMutationsWithPositiveLookahead() throws Exception { Map> mutations = mf.extractMutations("S42T W36Y"); Set expectedPMs = new HashSet(); @@ -191,6 +199,7 @@ public void testExtractMultipleMutationsWithPositiveLookahead() throws Exception * * @throws Exception */ + @Test public void testExtractionSpanCalculations() throws Exception { Map> mutations = mf.extractMutations("S42T and W36Y"); Mutation expectedPM = new PointMutation(42, "S", "T"); @@ -248,6 +257,7 @@ public void testExtractionSpanCalculations() throws Exception { * * @throws Exception */ + @Test public void testExtractionOfVariousFormats() throws Exception { Map> mutations = mf.extractMutations("The A42G mutation was made."); Mutation expectedPM = new PointMutation(42, "A", "G"); @@ -296,6 +306,7 @@ public void testExtractionOfVariousFormats() throws Exception { * * @throws Exception */ + @Test public void testRegexCaseInsensitiveFlag() throws Exception { Map> mutations = mf.extractMutations("a64t"); assertEquals(0, mutations.size()); @@ -323,6 +334,7 @@ public void testRegexCaseInsensitiveFlag() throws Exception { * * @throws Exception */ + @Test public void testCaseInsensitiveCases() throws Exception { Map> mutations = mf.extractMutations("ala64gly"); assertEquals(1, mutations.size()); @@ -346,6 +358,7 @@ public void testCaseInsensitiveCases() throws Exception { * * @throws Exception */ + @Test public void testPostProcessing() throws Exception { Map> mutations = mf.extractMutations("A64G"); assertEquals(1, mutations.size()); @@ -366,6 +379,7 
@@ public void testPostProcessing() throws Exception { * * @throws Exception */ + @Test public void testVariedDigitLength() throws Exception { Map> mutations = mf.extractMutations("ala64gly"); assertEquals(1, mutations.size()); @@ -388,6 +402,7 @@ public void testVariedDigitLength() throws Exception { * * @throws Exception */ + @Test public void testUnacceptableGeneralWordBoundaries() throws Exception { String startCharacters = "abcdefghijklmnopqrstuvwxyz0123456789~@#$%^&*_+=])"; String endCharacters = "abcdefghijklmnopqrstuvwxyz0123456789~@#$%^&*_+=(['"; @@ -408,6 +423,7 @@ public void testUnacceptableGeneralWordBoundaries() throws Exception { * * @throws Exception */ + @Test public void testAcceptableGeneralWordBoundaries() throws Exception { char[] endCharacters = { '.', ',', ' ', '\t', '\n', ')', ']', '"', '\'', ':', ';', '?', '!', '/', '-' }; char[] startCharacters = { ' ', '\t', '\n', '"', '\'', '(', '[', '/', ',', '-' }; @@ -429,6 +445,7 @@ public void testAcceptableGeneralWordBoundaries() throws Exception { * * @throws Exception */ + @Test public void testMixOneAndThreeLetterStrings() throws Exception { Map> mutations = mf.extractMutations("A64Gly"); assertEquals(0, mutations.size()); @@ -442,6 +459,7 @@ public void testMixOneAndThreeLetterStrings() throws Exception { * * @throws Exception */ + @Test public void testFullNameMethods() throws Exception { Map> mutations = mf.extractMutations("alanine64-->Gly"); assertEquals(1, mutations.size()); @@ -455,6 +473,7 @@ public void testFullNameMethods() throws Exception { * * @throws Exception */ + @Test public void testOneLetterAbbreviationFailsNon_wNmFormat() throws Exception { Map> mutations = mf.extractMutations("A64-->glycine"); assertEquals(0, mutations.size()); @@ -471,6 +490,7 @@ public void testOneLetterAbbreviationFailsNon_wNmFormat() throws Exception { * * @throws Exception */ + @Test public void testTextBasedMatches() throws Exception { String[] mutationTexts = { "Ala64 to Gly", "Alanine64 to Glycine", 
"Ala64 to Glycine", "alanine64 to Gly", "The Ala64 to Gly substitution", "The Ala64 to glycine substitution", "The Ala64 to Gly substitution" }; @@ -490,6 +510,7 @@ public void testTextBasedMatches() throws Exception { * * @throws Exception */ + @Test public void testTextMatchSpacing() throws Exception { Map> mutations = mf.extractMutations("TheAla40toGlymutation"); assertEquals(0, mutations.size()); diff --git a/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/MutationTest.java b/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/MutationTest.java index 671baf314..465898ff9 100644 --- a/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/MutationTest.java +++ b/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/MutationTest.java @@ -1,19 +1,23 @@ package edu.uchsc.ccp.nlp.ei.mutation; -import junit.framework.TestCase; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; /* * Copyright (c) 2007 Regents of the University of Colorado * Please refer to the licensing agreement at MUTATIONFINDER_HOME/doc/license.txt */ -public class MutationTest extends TestCase { +public class MutationTest { /** * Test the the constructor works for input of both int's and String's * * @throws Exception */ + @Test public void testConstructor() throws Exception { Mutation m = new Mutation(42); assertEquals(42, m.getPosition()); @@ -32,6 +36,7 @@ public void testConstructor() throws Exception { * * @throws Exception */ + @Test public void testUnsupportedMethods() throws Exception { Mutation m = new Mutation(42); try { diff --git a/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/PointMutationTest.java b/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/PointMutationTest.java index ec5704846..73bb0df0b 100644 --- 
a/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/PointMutationTest.java +++ b/jcore-mutationfinder-ae/src/test/java/edu/uchsc/ccp/nlp/ei/mutation/PointMutationTest.java @@ -1,23 +1,26 @@ package edu.uchsc.ccp.nlp.ei.mutation; -import junit.framework.TestCase; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.util.HashMap; import java.util.Map; import java.util.Set; +import static org.junit.jupiter.api.Assertions.*; + /* * Copyright (c) 2007 Regents of the University of Colorado * Please refer to the licensing agreement at MUTATIONFINDER_HOME/doc/license.txt */ -public class PointMutationTest extends TestCase { - private PointMutation pointMutation; +public class PointMutationTest { + private static PointMutation pointMutation; - private Map aminoAcidCodeLookup; + private static Map aminoAcidCodeLookup; - @Override - protected void setUp() throws Exception { + @BeforeAll + protected static void setUp() throws Exception { pointMutation = new PointMutation(42, "W", "G"); aminoAcidCodeLookup = new HashMap(); @@ -84,7 +87,6 @@ protected void setUp() throws Exception { aminoAcidCodeLookup.put("D", "D"); aminoAcidCodeLookup.put("E", "E"); - super.setUp(); } /** @@ -92,6 +94,7 @@ protected void setUp() throws Exception { * * @throws Exception */ + @Test public void testConstructor() throws Exception { PointMutation pm = new PointMutation(42, "A", "C"); assertEquals(42, pm.getPosition()); @@ -125,6 +128,7 @@ public void testConstructor() throws Exception { * * @throws Exception */ + @Test public void testHashcode() throws Exception { PointMutation pm = new PointMutation(42, "W", "G"); assertEquals((pm.getClass().getName() + pm.toString()).hashCode(), pm.hashCode()); @@ -135,6 +139,7 @@ public void testHashcode() throws Exception { * * @throws Exception */ + @Test public void testInvalidInit() throws Exception { PointMutation pm; try { @@ -178,6 +183,7 @@ public void testInvalidInit() throws Exception { * * @throws 
Exception */ + @Test public void testEquals() throws Exception { PointMutation pm = new PointMutation(42, "W", "G"); assertTrue(pointMutation.equals(pm)); @@ -200,6 +206,7 @@ public void testEquals() throws Exception { * * @throws Exception */ + @Test public void testNormalizationOfResidue() throws Exception { Set residuesToNormalize = aminoAcidCodeLookup.keySet(); for (String residue : residuesToNormalize) { @@ -212,6 +219,7 @@ public void testNormalizationOfResidue() throws Exception { * * @throws Exception */ + @Test public void testNormalizationOfInvalidResidue() throws Exception { try { pointMutation.normalizeResidueIdentity(""); @@ -260,6 +268,7 @@ public void testNormalizationOfInvalidResidue() throws Exception { * Test the static method which enables creation of a PointMutation object from a String in the wNm format * @throws Exception */ + @Test public void testCreateNewPointMutationFrom_wNm() throws Exception { PointMutation pm = PointMutation.createPointMutationFrom_wNm("W42G"); assertEquals(pointMutation, pm); diff --git a/jcore-neo4j-relations-consumer/pom.xml b/jcore-neo4j-relations-consumer/pom.xml index 92fc5f29b..dca3293f1 100644 --- a/jcore-neo4j-relations-consumer/pom.xml +++ b/jcore-neo4j-relations-consumer/pom.xml @@ -64,6 +64,11 @@ de.julielab jcore-descriptor-creator + + junit + junit + 4.13.1 + JCoRe Neo4j Relations Consumer diff --git a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java index 9a1fc1905..174a19537 100644 --- a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java +++ b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerIntegrationTest.java @@ -32,7 +32,7 @@ import static 
de.julielab.neo4j.plugins.constants.semedico.SemanticRelationConstants.PROP_DOC_IDS; import static de.julielab.neo4j.plugins.datarepresentation.constants.ConceptConstants.PROP_SRC_IDS; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.neo4j.configuration.GraphDatabaseSettings.DEFAULT_DATABASE_NAME; /** diff --git a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java index 28ba51f74..6e242d25d 100644 --- a/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java +++ b/jcore-neo4j-relations-consumer/src/test/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumerTest.java @@ -14,7 +14,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.factory.UimaContextFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.lang.reflect.Method; import java.util.List; diff --git a/jcore-opennlp-chunk-ae/pom.xml b/jcore-opennlp-chunk-ae/pom.xml index 1ec6f1917..c89174100 100644 --- a/jcore-opennlp-chunk-ae/pom.xml +++ b/jcore-opennlp-chunk-ae/pom.xml @@ -53,8 +53,8 @@ julielab-java-utilities - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-opennlp-chunk-ae/src/test/java/de/julielab/jcore/ae/opennlp/chunk/ChunkAnnotatorTest.java b/jcore-opennlp-chunk-ae/src/test/java/de/julielab/jcore/ae/opennlp/chunk/ChunkAnnotatorTest.java index b87f1ab61..08be7f7ab 100644 --- a/jcore-opennlp-chunk-ae/src/test/java/de/julielab/jcore/ae/opennlp/chunk/ChunkAnnotatorTest.java +++ b/jcore-opennlp-chunk-ae/src/test/java/de/julielab/jcore/ae/opennlp/chunk/ChunkAnnotatorTest.java @@ -21,7 +21,6 @@ 
import de.julielab.jcore.types.PennBioIEPOSTag; import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.fit.factory.AnalysisEngineFactory; @@ -33,6 +32,7 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,13 +41,13 @@ import java.util.function.BiConsumer; import java.util.stream.Collectors; -public class ChunkAnnotatorTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class ChunkAnnotatorTest { private static final Logger LOGGER = LoggerFactory.getLogger(ChunkAnnotatorTest.class); - protected void setUp() throws Exception { - super.setUp(); - } String text = "A study on the Prethcamide hydroxylation system in rat hepatic microsomes ."; @@ -84,6 +84,7 @@ private void initCas(JCas jcas) { } } + @Test public void testProcess() { XMLInputSource chunkerXML = null; @@ -134,7 +135,7 @@ public void testProcess() { assertEquals(chunks, predictedChunks); } - + @Test public void testProcessWithDefaultMappings() { XMLInputSource chunkerXML = null; @@ -185,7 +186,7 @@ public void testProcessWithDefaultMappings() { assertEquals(chunks, predictedChunks); } - + @Test public void testPunctuation() throws Exception { JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types"); AnalysisEngine chunker = AnalysisEngineFactory.createEngine("ChunkAnnotatorTest"); diff --git a/jcore-opennlp-parser-ae/pom.xml b/jcore-opennlp-parser-ae/pom.xml index 5a2ef3229..07da362a9 100644 --- a/jcore-opennlp-parser-ae/pom.xml +++ b/jcore-opennlp-parser-ae/pom.xml @@ -102,8 +102,8 @@ 1.6.0 - 
junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-opennlp-parser-ae/src/test/java/de/julielab/jcore/ae/opennlpparser/main/ParseAnnotatorTest.java b/jcore-opennlp-parser-ae/src/test/java/de/julielab/jcore/ae/opennlpparser/main/ParseAnnotatorTest.java index 0f0cd1315..6955ce7c3 100644 --- a/jcore-opennlp-parser-ae/src/test/java/de/julielab/jcore/ae/opennlpparser/main/ParseAnnotatorTest.java +++ b/jcore-opennlp-parser-ae/src/test/java/de/julielab/jcore/ae/opennlpparser/main/ParseAnnotatorTest.java @@ -18,7 +18,6 @@ package de.julielab.jcore.ae.opennlpparser.main; import de.julielab.jcore.types.*; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; @@ -26,23 +25,19 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Iterator; -public class ParseAnnotatorTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class ParseAnnotatorTest { private static final Logger LOGGER = LoggerFactory.getLogger(ParseAnnotatorTest.class); private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties"; - @Override - protected void setUp() throws Exception { - super.setUp(); - // set log4j properties file - // PropertyConfigurator.configure(LOGGER_PROPERTIES); - } - String text = "A study on the Prethcamide hydroxylation system in rat hepatic microsomes ."; String wantedCons = "NP NP PP NP NP PP NP "; @@ -68,6 +63,7 @@ public void initCas(JCas jcas) { } } + @Test public void testProcess() { boolean annotationsOK = true; diff --git a/jcore-opennlp-postag-ae/pom.xml b/jcore-opennlp-postag-ae/pom.xml index cadd08079..77ca254eb 100644 --- a/jcore-opennlp-postag-ae/pom.xml +++ 
b/jcore-opennlp-postag-ae/pom.xml @@ -76,8 +76,8 @@ provided - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe OpenNLP POS Tagger diff --git a/jcore-opennlp-postag-ae/src/test/java/de/julielab/jcore/ae/opennlp/postag/PosTagAnnotatorTest.java b/jcore-opennlp-postag-ae/src/test/java/de/julielab/jcore/ae/opennlp/postag/PosTagAnnotatorTest.java index d2db4293f..ebdeb2c5b 100644 --- a/jcore-opennlp-postag-ae/src/test/java/de/julielab/jcore/ae/opennlp/postag/PosTagAnnotatorTest.java +++ b/jcore-opennlp-postag-ae/src/test/java/de/julielab/jcore/ae/opennlp/postag/PosTagAnnotatorTest.java @@ -33,15 +33,15 @@ import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Collection; import java.util.Iterator; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class PosTagAnnotatorTest { diff --git a/jcore-opennlp-postag-ae/src/test/java/de/julielab/jcore/ae/opennlp/postag/PosTagDictCreatorTest.java b/jcore-opennlp-postag-ae/src/test/java/de/julielab/jcore/ae/opennlp/postag/PosTagDictCreatorTest.java index 22dd88ad2..d7b8f6742 100644 --- a/jcore-opennlp-postag-ae/src/test/java/de/julielab/jcore/ae/opennlp/postag/PosTagDictCreatorTest.java +++ b/jcore-opennlp-postag-ae/src/test/java/de/julielab/jcore/ae/opennlp/postag/PosTagDictCreatorTest.java @@ -13,7 +13,7 @@ import opennlp.tools.postag.POSDictionary; import opennlp.tools.postag.POSSample; import org.apache.commons.io.FileUtils; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; @@ -23,8 +23,8 @@ import java.util.List; import java.util.Set; -import static 
org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class PosTagDictCreatorTest { @Test diff --git a/jcore-opennlp-sentence-ae/pom.xml b/jcore-opennlp-sentence-ae/pom.xml index c1c0c2b03..d1c2cd2c3 100644 --- a/jcore-opennlp-sentence-ae/pom.xml +++ b/jcore-opennlp-sentence-ae/pom.xml @@ -41,8 +41,8 @@ slf4j-api - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe OpenNLP Sentence Splitter diff --git a/jcore-opennlp-sentence-ae/src/test/java/de/julielab/jcore/ae/jsentsplit/SentenceAnnotatorTest.java b/jcore-opennlp-sentence-ae/src/test/java/de/julielab/jcore/ae/jsentsplit/SentenceAnnotatorTest.java index 3dcbbef41..6aacdf297 100644 --- a/jcore-opennlp-sentence-ae/src/test/java/de/julielab/jcore/ae/jsentsplit/SentenceAnnotatorTest.java +++ b/jcore-opennlp-sentence-ae/src/test/java/de/julielab/jcore/ae/jsentsplit/SentenceAnnotatorTest.java @@ -18,7 +18,6 @@ package de.julielab.jcore.ae.jsentsplit; import de.julielab.jcore.types.Sentence; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; @@ -26,12 +25,15 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Iterator; -public class SentenceAnnotatorTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class SentenceAnnotatorTest { /** * Logger for this class @@ -43,10 +45,7 @@ public class SentenceAnnotatorTest extends TestCase { String offsets = "0-15;16-32;"; - protected void setUp() throws Exception { - super.setUp(); - } - + @Test public void testProcess() { XMLInputSource sentenceXML = null; 
diff --git a/jcore-opennlp-token-ae/pom.xml b/jcore-opennlp-token-ae/pom.xml index 306972531..419b52446 100644 --- a/jcore-opennlp-token-ae/pom.xml +++ b/jcore-opennlp-token-ae/pom.xml @@ -40,8 +40,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe OpenNLP Tokenizer diff --git a/jcore-opennlp-token-ae/src/test/java/de/julielab/jcore/ae/opennlp/token/TokenAnnotatorTest.java b/jcore-opennlp-token-ae/src/test/java/de/julielab/jcore/ae/opennlp/token/TokenAnnotatorTest.java index 04ab72c43..f42582429 100644 --- a/jcore-opennlp-token-ae/src/test/java/de/julielab/jcore/ae/opennlp/token/TokenAnnotatorTest.java +++ b/jcore-opennlp-token-ae/src/test/java/de/julielab/jcore/ae/opennlp/token/TokenAnnotatorTest.java @@ -19,7 +19,6 @@ import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; @@ -32,7 +31,9 @@ import java.util.Iterator; -public class TokenAnnotatorTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TokenAnnotatorTest { private static final Logger LOGGER = LoggerFactory .getLogger(TokenAnnotatorTest.class); diff --git a/jcore-pmc-reader/pom.xml b/jcore-pmc-reader/pom.xml index 976a1b456..b0eee7fe3 100644 --- a/jcore-pmc-reader/pom.xml +++ b/jcore-pmc-reader/pom.xml @@ -154,8 +154,8 @@ jcore-descriptor-creator - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/NXMLURIIteratorTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/NXMLURIIteratorTest.java index 8c328c2ac..f1e6bd11c 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/NXMLURIIteratorTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/NXMLURIIteratorTest.java @@ -1,6 +1,6 @@ package de.julielab.jcore.reader.pmc; 
-import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileNotFoundException; @@ -15,7 +15,7 @@ import static java.nio.charset.StandardCharsets.UTF_8; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatCode; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertTrue; public class NXMLURIIteratorTest { @Test diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCMultiplierTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCMultiplierTest.java index 8a8527930..27339365b 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCMultiplierTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCMultiplierTest.java @@ -11,7 +11,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.HashSet; diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCReaderTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCReaderTest.java index 9d5d91007..7d5547754 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCReaderTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCReaderTest.java @@ -19,7 +19,7 @@ import org.apache.uima.fit.util.CasUtil; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.List; import java.util.*; @@ -28,7 +28,7 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatCode; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class PMCReaderTest { @Test @@ -288,8 +288,8 @@ 
public void testKeywords() throws Exception { Set expectedKeywords = new HashSet<>(Arrays.asList("Baltic Sea Action Plan (BSAP)", "Costs", "Review", "Eutrophication", "Hazardous substances")); IntStream.range(0, md.getKeywordList().size()) - .forEach(i -> assertTrue("The keyword \"" + md.getKeywordList(i).getName() + "\" was not expected", - expectedKeywords.remove(md.getKeywordList(i).getName()))); + .forEach(i -> assertTrue(expectedKeywords.remove(md.getKeywordList(i).getName()), + "The keyword \"" + md.getKeywordList(i).getName() + "\" was not expected")); assertTrue(expectedKeywords.isEmpty()); } diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/ContribGroupParserTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/ContribGroupParserTest.java index 911500480..324a653dc 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/ContribGroupParserTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/ContribGroupParserTest.java @@ -12,12 +12,12 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class ContribGroupParserTest { @Test diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/ContribParserTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/ContribParserTest.java index 667e85812..fc3f81489 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/ContribParserTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/ContribParserTest.java @@ -13,11 +13,11 @@ import de.julielab.jcore.types.AuthorInfo; import 
org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class ContribParserTest { @Test diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/FrontParserTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/FrontParserTest.java index c5ac41078..a3ba75ae7 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/FrontParserTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/FrontParserTest.java @@ -16,11 +16,11 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class FrontParserTest { @Test diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/NxmlElementParserTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/NxmlElementParserTest.java index 256ac33a0..de3fca292 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/NxmlElementParserTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/NxmlElementParserTest.java @@ -12,11 +12,11 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class NxmlElementParserTest { @Test diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/SectionParserTest.java 
b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/SectionParserTest.java index 8d2baf7fb..136420616 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/SectionParserTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/SectionParserTest.java @@ -15,7 +15,7 @@ import org.apache.commons.io.IOUtils; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -23,7 +23,7 @@ import java.io.FileInputStream; import java.util.zip.GZIPInputStream; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class SectionParserTest { diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/XRefParserTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/XRefParserTest.java index 46c79e5fb..72d94b03b 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/XRefParserTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/parser/XRefParserTest.java @@ -6,12 +6,12 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class XRefParserTest { @Test diff --git a/jcore-ppd-writer/pom.xml b/jcore-ppd-writer/pom.xml index 8e409735b..c5dc78e43 100644 --- a/jcore-ppd-writer/pom.xml +++ b/jcore-ppd-writer/pom.xml @@ -42,8 +42,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe Piped Format Writer diff --git 
a/jcore-ppd-writer/src/test/java/de/julielab/jcore/consumer/ppd/PPDWriterTest.java b/jcore-ppd-writer/src/test/java/de/julielab/jcore/consumer/ppd/PPDWriterTest.java index 0327f1b26..0603851fc 100644 --- a/jcore-ppd-writer/src/test/java/de/julielab/jcore/consumer/ppd/PPDWriterTest.java +++ b/jcore-ppd-writer/src/test/java/de/julielab/jcore/consumer/ppd/PPDWriterTest.java @@ -7,13 +7,13 @@ import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileInputStream; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; /** diff --git a/jcore-pubtator-reader/pom.xml b/jcore-pubtator-reader/pom.xml index 84661f424..3440bb1fc 100644 --- a/jcore-pubtator-reader/pom.xml +++ b/jcore-pubtator-reader/pom.xml @@ -41,8 +41,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-pubtator-reader/src/test/java/de/julielab/jcore/reader/pubtator/PubtatorReaderTest.java b/jcore-pubtator-reader/src/test/java/de/julielab/jcore/reader/pubtator/PubtatorReaderTest.java index afece0a59..9ad0c4efc 100644 --- a/jcore-pubtator-reader/src/test/java/de/julielab/jcore/reader/pubtator/PubtatorReaderTest.java +++ b/jcore-pubtator-reader/src/test/java/de/julielab/jcore/reader/pubtator/PubtatorReaderTest.java @@ -16,14 +16,14 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.Set; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class PubtatorReaderTest { @Test @@ 
-73,6 +73,6 @@ public void testDocumentDirectory() throws Exception { } jcas.reset(); } - assertTrue("The following IDs have not been read: " + expectedDocIds, expectedDocIds.isEmpty()); + assertTrue(expectedDocIds.isEmpty(), "The following IDs have not been read: " + expectedDocIds); } } diff --git a/jcore-stanford-lemmatizer-ae/pom.xml b/jcore-stanford-lemmatizer-ae/pom.xml index 39eda0c8b..33da8f8c2 100644 --- a/jcore-stanford-lemmatizer-ae/pom.xml +++ b/jcore-stanford-lemmatizer-ae/pom.xml @@ -37,8 +37,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-stanford-lemmatizer-ae/src/test/java/de/julielab/jcore/ae/stanford/lemma/StanfordLemmatizerTest.java b/jcore-stanford-lemmatizer-ae/src/test/java/de/julielab/jcore/ae/stanford/lemma/StanfordLemmatizerTest.java index 3e8b94fc2..ca0e0138b 100644 --- a/jcore-stanford-lemmatizer-ae/src/test/java/de/julielab/jcore/ae/stanford/lemma/StanfordLemmatizerTest.java +++ b/jcore-stanford-lemmatizer-ae/src/test/java/de/julielab/jcore/ae/stanford/lemma/StanfordLemmatizerTest.java @@ -13,7 +13,6 @@ import de.julielab.jcore.types.PennBioIEPOSTag; import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; @@ -22,12 +21,15 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Iterator; -public class StanfordLemmatizerTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class StanfordLemmatizerTest { private static final Logger LOGGER = LoggerFactory .getLogger(StanfordLemmatizerTest.class); @@ -66,6 +68,7 @@ public void initCas(JCas aJCas) { } 
@SuppressWarnings("rawtypes") + @Test public void testProcess() { XMLInputSource lemmaXML = null; diff --git a/jcore-topic-indexing-ae/pom.xml b/jcore-topic-indexing-ae/pom.xml index b378f818d..be99da6f5 100644 --- a/jcore-topic-indexing-ae/pom.xml +++ b/jcore-topic-indexing-ae/pom.xml @@ -96,8 +96,8 @@ - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe-Topic-Labeling-AE diff --git a/jcore-topic-indexing-ae/src/test/java/de/julielab/jcore/ae/TopicIndexingTest.java b/jcore-topic-indexing-ae/src/test/java/de/julielab/jcore/ae/TopicIndexingTest.java index f42a8368b..756bad437 100644 --- a/jcore-topic-indexing-ae/src/test/java/de/julielab/jcore/ae/TopicIndexingTest.java +++ b/jcore-topic-indexing-ae/src/test/java/de/julielab/jcore/ae/TopicIndexingTest.java @@ -21,7 +21,7 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.StringArray; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; @@ -30,8 +30,8 @@ import java.util.List; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * Unit tests for jcore-topic-labeling-ae. 
diff --git a/jcore-topics-writer/pom.xml b/jcore-topics-writer/pom.xml index 19752ec2e..644c48f2b 100644 --- a/jcore-topics-writer/pom.xml +++ b/jcore-topics-writer/pom.xml @@ -42,8 +42,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine JCoRe Topics Writer diff --git a/jcore-topics-writer/src/test/java/de/julielab/jcore/consumer/topics/TopicsWriterTest.java b/jcore-topics-writer/src/test/java/de/julielab/jcore/consumer/topics/TopicsWriterTest.java index 4db31fa61..693ce2c58 100644 --- a/jcore-topics-writer/src/test/java/de/julielab/jcore/consumer/topics/TopicsWriterTest.java +++ b/jcore-topics-writer/src/test/java/de/julielab/jcore/consumer/topics/TopicsWriterTest.java @@ -9,9 +9,9 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.DoubleArray; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; @@ -26,8 +26,8 @@ */ public class TopicsWriterTest { - @BeforeClass - @AfterClass + @BeforeAll + @AfterAll public static void setup() { FileUtils.deleteQuietly(new File("src/test/resources/output")); } diff --git a/jcore-txt-consumer/pom.xml b/jcore-txt-consumer/pom.xml index 07b878cab..3c4fdb273 100644 --- a/jcore-txt-consumer/pom.xml +++ b/jcore-txt-consumer/pom.xml @@ -44,8 +44,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-txt-consumer/src/test/java/de/julielab/jcore/consumer/txt/SentenceTokenConsumerTest.java b/jcore-txt-consumer/src/test/java/de/julielab/jcore/consumer/txt/SentenceTokenConsumerTest.java index 82b76eef9..29197eac6 100644 --- a/jcore-txt-consumer/src/test/java/de/julielab/jcore/consumer/txt/SentenceTokenConsumerTest.java +++ b/jcore-txt-consumer/src/test/java/de/julielab/jcore/consumer/txt/SentenceTokenConsumerTest.java @@ -20,7 +20,7 
@@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.*; import java.nio.charset.StandardCharsets; @@ -32,7 +32,7 @@ import static de.julielab.jcore.consumer.txt.SentenceTokenConsumer.*; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class SentenceTokenConsumerTest { /** * just tests if there is an error with an empty CAS diff --git a/jcore-utilities/pom.xml b/jcore-utilities/pom.xml index aafbe74fe..5a6ad681f 100644 --- a/jcore-utilities/pom.xml +++ b/jcore-utilities/pom.xml @@ -19,8 +19,8 @@ slf4j-api - junit - junit + org.junit.jupiter + junit-jupiter-engine org.apache.commons diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReAnnotationToolsTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReAnnotationToolsTest.java index 42a374b7d..9a101452b 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReAnnotationToolsTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReAnnotationToolsTest.java @@ -18,7 +18,6 @@ package de.julielab.jcore.utility; import de.julielab.jcore.types.*; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.cas.CAS; import org.apache.uima.cas.FSIterator; @@ -28,27 +27,30 @@ import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.XMLInputSource; import org.apache.uima.util.XmlCasDeserializer; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.FileInputStream; -import java.lang.reflect.InvocationTargetException; import java.util.List; +import static org.junit.jupiter.api.Assertions.*; + // import de.julielab.jcore.types.Annotation; 
-public class JCoReAnnotationToolsTest extends TestCase { +public class JCoReAnnotationToolsTest { /** * Logger for this class */ private static final Logger LOG = LoggerFactory.getLogger(JCoReAnnotationToolsTest.class); - JCas jcas; - public final String DESC_TEST_ANALYSIS_ENGINE = "src/test/resources/AETestDescriptor.xml"; + static JCas jcas; + public final static String DESC_TEST_ANALYSIS_ENGINE = "src/test/resources/AETestDescriptor.xml"; - protected void setUp() throws Exception { + @BeforeAll + protected static void setUp() throws Exception { // get a CAS/JCas CAS cas = CasCreationUtils.createCas(UIMAFramework.getXMLParser().parseAnalysisEngineDescription( @@ -78,9 +80,8 @@ protected void setUp() throws Exception { e4.addToIndexes(); } - // TODO only Exception werfen - public void testGetAnnotationAtOffset() throws SecurityException, IllegalArgumentException, ClassNotFoundException, - NoSuchMethodException, InstantiationException, IllegalAccessException, InvocationTargetException { + @Test + public void testGetAnnotationAtOffset() throws Exception { LOG.debug("testGetAnnotationAtOffset() - testing getAnnotationAtOffset(..)"); Annotation entity = new Annotation(jcas); @@ -94,10 +95,8 @@ public void testGetAnnotationAtOffset() throws SecurityException, IllegalArgumen assertTrue(anno == null); } - // TODO only Exception werfen - public void testGetOverlappingAnnotation() throws SecurityException, IllegalArgumentException, - ClassNotFoundException, NoSuchMethodException, InstantiationException, IllegalAccessException, - InvocationTargetException { + @Test + public void testGetOverlappingAnnotation() throws Exception { LOG.debug("testGetOverlappingAnnotation() - testing getOverlappingAnnotation(..)"); Annotation entity = new Annotation(jcas); @@ -119,10 +118,8 @@ public void testGetOverlappingAnnotation() throws SecurityException, IllegalArgu assertTrue((anno != null) && (anno instanceof Annotation)); } - // TODO only Exception werfen - public void 
testGetAnnotationByClassName() throws SecurityException, IllegalArgumentException, - ClassNotFoundException, NoSuchMethodException, InstantiationException, IllegalAccessException, - InvocationTargetException { + @Test + public void testGetAnnotationByClassName() throws Exception { LOG.debug("testGetAnnotationByClassName() - testing getAnnotationObject(..)"); Annotation entity = new Annotation(jcas); @@ -130,6 +127,7 @@ public void testGetAnnotationByClassName() throws SecurityException, IllegalArgu assertTrue(anno instanceof Annotation); } + @Test public void testGetPartiallyOverlappingAnnotationOtherType() throws Exception { JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-all-types"); jcas.setDocumentText("wort"); @@ -242,7 +240,7 @@ public void testIncludedAnnotations() throws Exception { List includedAnnotations = JCoReAnnotationTools.getIncludedAnnotations(jcas, em, Token.class); - assertEquals("Wrong amount of included tokens returned", 4, includedAnnotations.size()); + assertEquals(4, includedAnnotations.size(), "Wrong amount of included tokens returned"); for (int i = 0; i < includedAnnotations.size(); i++) { Token includedToken = includedAnnotations.get(i); diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java index 445b234e3..12672e122 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java @@ -3,12 +3,12 @@ import de.julielab.jcore.types.InternalReference; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Arrays; import java.util.HashSet; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; 
public class JCoReCondensedDocumentTextTest { @Test diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReFSListIteratorTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReFSListIteratorTest.java index bb11d9beb..1ebca1c68 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReFSListIteratorTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReFSListIteratorTest.java @@ -13,14 +13,14 @@ import de.julielab.jcore.types.Token; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.NoSuchElementException; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class JCoReFSListIteratorTest { diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReFeaturePathTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReFeaturePathTest.java index 8983aa858..bc01ec660 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReFeaturePathTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReFeaturePathTest.java @@ -19,14 +19,14 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.cas.StringArray; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class JCoReFeaturePathTest { @Test @@ -275,12 +275,12 @@ public void testReplacePrimitiveValue() throws Exception { assertEquals("originalValue", cm.getTextualRepresentation()); assertEquals("originalValue", fp.getValueAsString(cm)); - assertEquals("replacementValue", fp.getValueAsString(cm, true)); + assertEquals( 
fp.getValueAsString(cm, true), "replacementValue"); assertEquals("replacementValue", fp.getValueAsString(cm)); assertEquals("replacementValue", cm.getTextualRepresentation()); // doing a replacement again should have no effect - assertEquals("replacementValue", fp.getValueAsString(cm, true)); + assertEquals( fp.getValueAsString(cm, true), "replacementValue"); assertEquals("replacementValue", fp.getValueAsString(cm)); } @@ -303,11 +303,11 @@ public void testReplaceNotMappedPrimitiveValue() throws Exception { assertEquals("originalValue", cm.getTextualRepresentation()); assertEquals("originalValue", fp.getValueAsString(cm)); - assertEquals("replacementValue", fp.getValueAsString(cm, true)); + assertEquals( fp.getValueAsString(cm, true), "replacementValue"); assertEquals("unknownValue", cm2.getTextualRepresentation()); assertEquals("unknownValue", fp.getValueAsString(cm2)); - assertEquals("not-mapped", fp.getValueAsString(cm2, true)); + assertEquals( fp.getValueAsString(cm2, true), "not-mapped"); } @Test @@ -328,7 +328,7 @@ public void testReplaceNotMappedPrimitiveValueWithNull() throws Exception { assertEquals("unknownValue", cm.getTextualRepresentation()); assertEquals("unknownValue", fp.getValueAsString(cm)); assertEquals(null, fp.getValueAsString(cm, true)); - assertNotSame("null", fp.getValueAsString(cm, true)); + assertNotSame( fp.getValueAsString(cm, true), "null"); } @Test @@ -350,7 +350,7 @@ public void testReplaceAllArrayElements() throws Exception { fp.initialize("/semanticTypes", replacements); assertEquals("entry1, entry2, entry3", fp.getValueAsString(ocm)); - assertEquals("replacement1, replacement2, replacement3", fp.getValueAsString(ocm, true)); + assertEquals( fp.getValueAsString(ocm, true), "replacement1, replacement2, replacement3"); } @Test @@ -368,7 +368,7 @@ public void testReplaceAllArrayElementsFromFile() throws Exception { fp.initialize("/semanticTypes"); assertEquals("entry1, entry2, entry3", fp.getValueAsString(ocm)); - 
assertEquals("replacement1, replacement2, replacement3", fp.getValueAsString(ocm, true)); + assertEquals( fp.getValueAsString(ocm, true), "replacement1, replacement2, replacement3"); } @@ -391,7 +391,7 @@ public void testReplaceSingleArrayElement() throws Exception { JCoReFeaturePath fp = new JCoReFeaturePath(); fp.initialize("/semanticTypes[1]", replacements); - assertEquals("replacement2", fp.getValueAsString(ocm, true)); + assertEquals( fp.getValueAsString(ocm, true), "replacement2"); fp.initialize("/semanticTypes"); assertEquals("entry1, replacement2, entry3", fp.getValueAsString(ocm)); @@ -423,7 +423,7 @@ public void testReplaceValueOnDeepFeatureStructure() throws Exception { JCoReFeaturePath fp = new JCoReFeaturePath(); fp.initialize("/resourceEntryList/entryId", replacements); - assertEquals("tid1, tid2", fp.getValueAsString(gene, true)); + assertEquals( fp.getValueAsString(gene, true), "tid1, tid2"); } diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReToolsTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReToolsTest.java index 3212f4c77..74619777f 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReToolsTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReToolsTest.java @@ -16,13 +16,13 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Arrays; import java.util.Collection; import java.util.List; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class JCoReToolsTest { @Test @@ -36,7 +36,7 @@ public void testAddCollectionToFSArray1() throws Exception { Annotation newElement = new Annotation(jCas); Collection newElements = Lists.newArrayList(newElement); FSArray joinedArray = JCoReTools.addToFSArray(fsArray, newElements); - assertEquals("A new FSArray was instantiated although the 
old one should have been kept", fsArray, joinedArray); + assertEquals( fsArray, joinedArray, "A new FSArray was instantiated although the old one should have been kept"); assertEquals(newElement, joinedArray.get(1)); } @@ -54,7 +54,7 @@ public void testAddCollectionToFSArray2() throws Exception { Annotation newElement4 = new Annotation(jCas, 4, 4); Collection newElements = Lists.newArrayList(newElement1, newElement2, newElement3, newElement4); FSArray joinedArray = JCoReTools.addToFSArray(fsArray, newElements); - assertNotSame("The old FSArray was returned although a new one should have been created", fsArray, joinedArray); + assertNotSame( fsArray, joinedArray, "The old FSArray was returned although a new one should have been created"); assertEquals(newElement1, joinedArray.get(1)); assertEquals(newElement2, joinedArray.get(2)); assertEquals(newElement3, joinedArray.get(3)); @@ -68,7 +68,7 @@ public void testAddCollectionToFSArray3() throws Exception { Annotation newElement = new Annotation(jCas); Collection newElements = Lists.newArrayList(newElement); FSArray joinedArray = JCoReTools.addToFSArray(fsArray, newElements); - assertNotSame("The old FSArray was returned although a new one should have been created", fsArray, joinedArray); + assertNotSame( fsArray, joinedArray, "The old FSArray was returned although a new one should have been created"); assertEquals(newElement, joinedArray.get(0)); } @@ -79,7 +79,7 @@ public void testAddCollectionToFSArray4() throws Exception { Annotation newElement = new Annotation(jCas); Collection newElements = Lists.newArrayList(newElement); FSArray joinedArray = JCoReTools.addToFSArray(fsArray, newElements); - assertEquals("A new FSArray was instantiated although the old one should have been kept", fsArray, joinedArray); + assertEquals( fsArray, joinedArray, "A new FSArray was instantiated although the old one should have been kept"); assertEquals(newElement, joinedArray.get(0)); } @@ -93,7 +93,7 @@ public void 
testAddElementToFSArray1() throws Exception { assertNull(fsArray.get(1)); Annotation newElement = new Annotation(jCas); FSArray joinedArray = JCoReTools.addToFSArray(fsArray, newElement); - assertEquals("A new FSArray was instantiated although the old one should have been kept", fsArray, joinedArray); + assertEquals( fsArray, joinedArray, "A new FSArray was instantiated although the old one should have been kept"); assertEquals(newElement, joinedArray.get(1)); } @@ -112,23 +112,23 @@ public void testAddElementToFSArray2() throws Exception { List newElements = Lists.newArrayList(newElement1, newElement2, newElement3, newElement4); FSArray joinedArray = JCoReTools.addToFSArray(fsArray, newElements.get(0)); - assertEquals("A new FSArray was instantiated although the old one should have been kept", fsArray, joinedArray); + assertEquals( fsArray, joinedArray, "A new FSArray was instantiated although the old one should have been kept"); assertEquals(2, joinedArray.size()); assertEquals(newElement1, joinedArray.get(1)); fsArray = joinedArray; joinedArray = JCoReTools.addToFSArray(fsArray, newElements.get(1)); - assertNotSame("The old FSArray was returned although a new one should have been created", fsArray, joinedArray); + assertNotSame( fsArray, joinedArray, "The old FSArray was returned although a new one should have been created"); assertEquals(newElement2, joinedArray.get(2)); fsArray = joinedArray; joinedArray = JCoReTools.addToFSArray(fsArray, newElements.get(2)); - assertEquals("A new FSArray was instantiated although the old one should have been kept", fsArray, joinedArray); + assertEquals( fsArray, joinedArray, "A new FSArray was instantiated although the old one should have been kept"); assertEquals(newElement3, joinedArray.get(3)); fsArray = joinedArray; joinedArray = JCoReTools.addToFSArray(fsArray, newElements.get(3)); - assertEquals("A new FSArray was instantiated although the old one should have been kept", fsArray, joinedArray); + assertEquals( fsArray, 
joinedArray, "A new FSArray was instantiated although the old one should have been kept"); assertEquals(newElement4, joinedArray.get(4)); } diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/ComparatorsTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/ComparatorsTest.java index f5720c7c1..5b87d968a 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/ComparatorsTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/ComparatorsTest.java @@ -13,10 +13,10 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class ComparatorsTest { @Test diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReCoverAnnotationIndexTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReCoverAnnotationIndexTest.java index e81f2cd08..5e2b1d105 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReCoverAnnotationIndexTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReCoverAnnotationIndexTest.java @@ -14,13 +14,13 @@ import de.julielab.jcore.types.Token; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Set; import java.util.stream.Collectors; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class JCoReCoverAnnotationIndexTest { @Test diff --git 
a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReMapAnnotationIndexTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReMapAnnotationIndexTest.java index 4cd521007..088917946 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReMapAnnotationIndexTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReMapAnnotationIndexTest.java @@ -14,7 +14,7 @@ import de.julielab.jcore.types.Token; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.HashMap; import java.util.Set; @@ -22,7 +22,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class JCoReMapAnnotationIndexTest { @Test diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndexTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndexTest.java index e2f7a39b2..942f32785 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndexTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReOverlapAnnotationIndexTest.java @@ -13,12 +13,12 @@ import de.julielab.jcore.types.Token; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.List; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class JCoReOverlapAnnotationIndexTest { @Test diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReSetAnnotationIndexTest.java 
b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReSetAnnotationIndexTest.java index cfb4f2374..111861268 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReSetAnnotationIndexTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReSetAnnotationIndexTest.java @@ -15,13 +15,13 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Set; import java.util.stream.Collectors; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class JCoReSetAnnotationIndexTest { @Test diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReTreeMapAnnotationIndexTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReTreeMapAnnotationIndexTest.java index 1294407f2..208e8abb4 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReTreeMapAnnotationIndexTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/JCoReTreeMapAnnotationIndexTest.java @@ -15,12 +15,12 @@ import de.julielab.jcore.types.Token; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.Set; import java.util.stream.Collectors; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class JCoReTreeMapAnnotationIndexTest { @Test diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/TermGeneratorsTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/TermGeneratorsTest.java index 8595e5840..e3d269f83 100644 --- 
a/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/TermGeneratorsTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/index/TermGeneratorsTest.java @@ -13,12 +13,12 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.function.BinaryOperator; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; public class TermGeneratorsTest { diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java index 81dff7797..fd631e58f 100644 --- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java +++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java @@ -14,7 +14,6 @@ import de.julielab.xml.binary.BinaryJeDISNodeDecoder; import de.julielab.xml.binary.BinaryXmiBuilder; import org.apache.commons.lang.StringUtils; -import org.apache.uima.cas.CASException; import org.apache.uima.cas.FSIterator; import org.apache.uima.cas.impl.XmiCasDeserializer; import org.apache.uima.collection.CollectionException; diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierDifferentNsSchemaTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierDifferentNsSchemaTest.java index ff60e41a0..f7fa5f19a 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierDifferentNsSchemaTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierDifferentNsSchemaTest.java @@ -16,9 +16,9 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import 
org.apache.uima.jcas.tcas.Annotation; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import org.testcontainers.containers.PostgreSQLContainer; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.File; @@ -35,7 +35,7 @@ public class XmiDBMultiplierDifferentNsSchemaTest { private static String costosysConfig; private static int subsetCounter; - @BeforeClass + @BeforeAll public static void setup() throws UIMAException, IOException, ConfigurationException { postgres.start(); XmiDBSetupHelper.createDbcConfig(postgres); @@ -49,7 +49,7 @@ public static void setup() throws UIMAException, IOException, ConfigurationExcep subsetCounter = 0; } - @AfterClass + @AfterAll public static void shutdown() { postgres.close(); } diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java index 2af097f43..c2398d503 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java @@ -16,9 +16,9 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import org.testcontainers.containers.PostgreSQLContainer; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.File; @@ -35,7 +35,7 @@ public class XmiDBMultiplierTest { private static String costosysConfig; private static int subsetCounter; - @BeforeClass + @BeforeAll public static void setup() throws UIMAException, IOException, ConfigurationException { postgres.start(); XmiDBSetupHelper.createDbcConfig(postgres); @@ -49,7 +49,7 @@ public 
static void setup() throws UIMAException, IOException, ConfigurationExcep subsetCounter = 0; } - @AfterClass + @AfterAll public static void shutdown() { postgres.close(); } diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java index 309ab09a4..51d66d493 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java @@ -2,7 +2,9 @@ import de.julielab.costosys.dbconnection.DataBaseConnector; import de.julielab.jcore.db.test.DBTestUtils; -import de.julielab.jcore.types.*; +import de.julielab.jcore.types.Header; +import de.julielab.jcore.types.Sentence; +import de.julielab.jcore.types.Token; import org.apache.commons.configuration2.ex.ConfigurationException; import org.apache.uima.UIMAException; import org.apache.uima.collection.CollectionReader; @@ -10,9 +12,9 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import org.testcontainers.containers.PostgreSQLContainer; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.IOException; @@ -20,15 +22,15 @@ import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; public class XmiDBReaderBinaryFormatTest { public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); private static String costosysConfig; private static String xmisubset; - @BeforeClass + @BeforeAll 
public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { postgres.start(); XmiDBSetupHelper.createDbcConfig(postgres); @@ -46,7 +48,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf } - @AfterClass + @AfterAll public static void shutdown() { postgres.close(); } diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderDifferentNsSchemaTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderDifferentNsSchemaTest.java index a8a15b58d..39b2639f0 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderDifferentNsSchemaTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderDifferentNsSchemaTest.java @@ -1,22 +1,20 @@ package de.julielab.jcore.reader.xmi; import de.julielab.costosys.dbconnection.DataBaseConnector; -import de.julielab.jcore.consumer.xmi.XMIDBWriter; import de.julielab.jcore.db.test.DBTestUtils; -import de.julielab.jcore.reader.db.TableReaderConstants; -import de.julielab.jcore.types.*; +import de.julielab.jcore.types.Header; +import de.julielab.jcore.types.Sentence; +import de.julielab.jcore.types.Token; import org.apache.commons.configuration2.ex.ConfigurationException; import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.collection.CollectionReader; -import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.CollectionReaderFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import org.testcontainers.containers.PostgreSQLContainer; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.IOException; @@ -24,15 +22,15 @@ 
import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; public class XmiDBReaderDifferentNsSchemaTest { public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); private static String costosysConfig; private static String xmisubset; - @BeforeClass + @BeforeAll public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { postgres.start(); XmiDBSetupHelper.createDbcConfig(postgres); @@ -50,7 +48,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf } - @AfterClass + @AfterAll public static void shutdown() { postgres.close(); } diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderGzippedDataTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderGzippedDataTest.java index 9a7fea0b3..018170026 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderGzippedDataTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderGzippedDataTest.java @@ -12,9 +12,9 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import org.testcontainers.containers.PostgreSQLContainer; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.File; @@ -35,7 +35,7 @@ public class XmiDBReaderGzippedDataTest { private static String costosysConfig; private static String xmisubset; - @BeforeClass + @BeforeAll public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { postgres.start(); 
XmiDBSetupHelper.createDbcConfig(postgres); @@ -53,7 +53,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf dbc.close(); } - @AfterClass + @AfterAll public static void shutdown() { postgres.close(); } diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderMonolithicDocumentsTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderMonolithicDocumentsTest.java index e0ae7f3ed..decd4e840 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderMonolithicDocumentsTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderMonolithicDocumentsTest.java @@ -12,9 +12,9 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import org.testcontainers.containers.PostgreSQLContainer; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.File; @@ -32,7 +32,7 @@ public class XmiDBReaderMonolithicDocumentsTest { private static String costosysConfig; private static String xmisubset; - @BeforeClass + @BeforeAll public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { postgres.start(); XmiDBSetupHelper.createDbcConfig(postgres); @@ -50,7 +50,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf dbc.close(); } - @AfterClass + @AfterAll public static void shutdown() { postgres.close(); } diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderTest.java index 72bea54a6..1f8150274 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderTest.java +++ 
b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderTest.java @@ -14,9 +14,9 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import org.testcontainers.containers.PostgreSQLContainer; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.IOException; @@ -24,15 +24,15 @@ import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; public class XmiDBReaderTest { public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); private static String costosysConfig; private static String xmisubset; - @BeforeClass + @BeforeAll public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { postgres.start(); XmiDBSetupHelper.createDbcConfig(postgres); @@ -50,7 +50,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf } - @AfterClass + @AfterAll public static void shutdown() { postgres.close(); } diff --git a/jcore-xmi-db-writer/pom.xml b/jcore-xmi-db-writer/pom.xml index 657e06c16..2b4a326f4 100644 --- a/jcore-xmi-db-writer/pom.xml +++ b/jcore-xmi-db-writer/pom.xml @@ -159,6 +159,12 @@ logback-classic test + + org.jetbrains + annotations + RELEASE + compile + https://github.com/JULIELab/jcore-base/tree/master/jcore-xmi-db-writer diff --git a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterBinaryFormatTest.java b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterBinaryFormatTest.java index f89ce94e5..4df9efaaa 100644 --- 
a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterBinaryFormatTest.java +++ b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterBinaryFormatTest.java @@ -4,9 +4,7 @@ import de.julielab.costosys.dbconnection.DataBaseConnector; import de.julielab.jcore.db.test.DBTestUtils; import de.julielab.jcore.types.*; -import de.julielab.jcore.types.ext.DBProcessingMetaData; import de.julielab.xml.XmiSplitConstants; -import de.julielab.xml.XmiSplitter; import de.julielab.xml.binary.BinaryDecodingResult; import de.julielab.xml.binary.BinaryJeDISNodeDecoder; import org.apache.commons.configuration2.ex.ConfigurationException; @@ -17,7 +15,11 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.StringArray; -import org.junit.*; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; import org.testcontainers.containers.PostgreSQLContainer; import java.io.ByteArrayInputStream; @@ -26,15 +28,15 @@ import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; -import java.util.*; import java.util.List; +import java.util.*; import java.util.stream.Collectors; import static java.nio.charset.StandardCharsets.UTF_8; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatCode; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; public class XmiDBWriterBinaryFormatTest { @ClassRule @@ -43,7 +45,7 @@ public class XmiDBWriterBinaryFormatTest { private static String xmlSubsetTable; private static DataBaseConnector dbc; - @BeforeClass + @BeforeAll public static void setup() throws SQLException, UIMAException, IOException, 
ConfigurationException { dbc = DBTestUtils.getDataBaseConnector(postgres); dbc.reserveConnection(); @@ -52,7 +54,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf dbc.releaseConnections(); } - @AfterClass + @AfterAll public static void shutDown() { dbc.close(); } @@ -65,7 +67,7 @@ public static JCas getJCasWithRequiredTypes() throws UIMAException { "de.julielab.jcore.types.jcore-xmi-splitter-types"); } - @Before + @BeforeEach public void cleanForTest() throws SQLException { String binaryMappingTable = "public." + MetaTableManager.BINARY_MAPPING_TABLE; String binaryFeaturesToMapTable = "public." + MetaTableManager.BINARY_FEATURES_TO_MAP_TABLE; diff --git a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterMonolithicDocumentTest.java b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterMonolithicDocumentTest.java index 10684230b..84e35a027 100644 --- a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterMonolithicDocumentTest.java +++ b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterMonolithicDocumentTest.java @@ -3,7 +3,9 @@ import de.julielab.costosys.dbconnection.CoStoSysConnection; import de.julielab.costosys.dbconnection.DataBaseConnector; import de.julielab.jcore.db.test.DBTestUtils; -import de.julielab.jcore.types.*; +import de.julielab.jcore.types.Header; +import de.julielab.jcore.types.Sentence; +import de.julielab.jcore.types.Token; import org.apache.commons.configuration2.ex.ConfigurationException; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngine; @@ -12,24 +14,18 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.AfterClass; -import org.junit.BeforeClass; import org.junit.ClassRule; -import org.junit.Test; +import org.junit.jupiter.api.AfterAll; +import 
org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import org.testcontainers.containers.PostgreSQLContainer; import java.io.ByteArrayInputStream; -import java.io.IOException; import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatCode; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertTrue; public class XmiDBWriterMonolithicDocumentTest { @ClassRule @@ -37,14 +33,14 @@ public class XmiDBWriterMonolithicDocumentTest { private static String costosysConfig; private static DataBaseConnector dbc; - @BeforeClass + @BeforeAll public static void setup() throws ConfigurationException { dbc = DBTestUtils.getDataBaseConnector(postgres); costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2017", 1, postgres); DBTestUtils.createAndSetHiddenConfig("src/test/resources/hiddenConfig.txt", postgres); } - @AfterClass + @AfterAll public static void shutDown() { dbc.close(); } diff --git a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java index 866d0ddf8..00230cda6 100644 --- a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java +++ b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java @@ -11,10 +11,10 @@ import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.AfterClass; -import org.junit.BeforeClass; import org.junit.ClassRule; -import org.junit.Test; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import 
org.testcontainers.containers.PostgreSQLContainer; import java.io.IOException; @@ -34,7 +34,7 @@ public class XmiDBWriterTest { private static String xmlSubsetTable; private static DataBaseConnector dbc; - @BeforeClass + @BeforeAll public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { dbc = DBTestUtils.getDataBaseConnector(postgres); dbc.reserveConnection(); @@ -43,7 +43,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf dbc.releaseConnections(); } - @AfterClass + @AfterAll public static void shutDown() { dbc.close(); } diff --git a/jcore-xmi-reader/pom.xml b/jcore-xmi-reader/pom.xml index 9e3df5b4c..e7630643a 100644 --- a/jcore-xmi-reader/pom.xml +++ b/jcore-xmi-reader/pom.xml @@ -24,8 +24,8 @@ ${jcore-types-version} - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-xmi-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiCollectionReaderTest.java b/jcore-xmi-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiCollectionReaderTest.java index 17fda0be8..2d360f427 100644 --- a/jcore-xmi-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiCollectionReaderTest.java +++ b/jcore-xmi-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiCollectionReaderTest.java @@ -16,9 +16,9 @@ import org.apache.uima.fit.factory.CollectionReaderFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertTrue; public class XmiCollectionReaderTest { @Test diff --git a/jcore-xmi-writer/pom.xml b/jcore-xmi-writer/pom.xml index 65dd58b07..586126e26 100644 --- a/jcore-xmi-writer/pom.xml +++ b/jcore-xmi-writer/pom.xml @@ -29,8 +29,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-xmi-writer/src/test/java/de/julielab/jcore/consumer/xmi/CasToXmiConsumerTest.java 
b/jcore-xmi-writer/src/test/java/de/julielab/jcore/consumer/xmi/CasToXmiConsumerTest.java index 1242372d6..e6b7006e2 100644 --- a/jcore-xmi-writer/src/test/java/de/julielab/jcore/consumer/xmi/CasToXmiConsumerTest.java +++ b/jcore-xmi-writer/src/test/java/de/julielab/jcore/consumer/xmi/CasToXmiConsumerTest.java @@ -24,15 +24,15 @@ import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FilenameFilter; import java.io.IOException; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * Test for class {@link CasToXmiConsumer} @@ -91,7 +91,7 @@ public boolean accept(File file, String name) { * Delete all files ending with "xmi" or "xmi.gzip" in the output directory, * and do the same for all subdirectories of outputDir, recursively */ - @Before + @BeforeEach public void clearDirectory() { File outputDir = new File(OUTPUT_FOLDER_XMI); removeXmiGzipAndZipFiles(outputDir); @@ -121,7 +121,7 @@ private void removeXmiGzipAndZipFiles(File dir) { /** * Create the CasConsumer under test */ - @Before + @BeforeEach public void createConsumer() { // XMLInputSource source; try { diff --git a/jcore-xml-db-reader/pom.xml b/jcore-xml-db-reader/pom.xml index 8447584a9..3342d08b7 100644 --- a/jcore-xml-db-reader/pom.xml +++ b/jcore-xml-db-reader/pom.xml @@ -66,7 +66,6 @@ org.junit.jupiter junit-jupiter-engine - test de.julielab diff --git a/jcore-xml-mapper/pom.xml b/jcore-xml-mapper/pom.xml index 7264e7079..c2fa73802 100644 --- a/jcore-xml-mapper/pom.xml +++ b/jcore-xml-mapper/pom.xml @@ -34,8 +34,8 @@ test - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git 
a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/genericTypes/TypeTemplate.java b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/genericTypes/TypeTemplate.java index 383dc3215..466350e8c 100644 --- a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/genericTypes/TypeTemplate.java +++ b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/genericTypes/TypeTemplate.java @@ -24,7 +24,6 @@ import java.util.*; -import static org.fest.reflect.core.Reflection.constructor; /** * Represents a Template for a type which Contains a List of Feature Templates @@ -119,14 +118,17 @@ public void setFullClassName(String fullClassName) { public void setParser(String trim) throws CollectionException { if (trim != null) { externalParser = true; - Class externalParserClass; + Class externalParserClass = null; try { externalParserClass = Class.forName(trim); + this.parser = (TypeParser) externalParserClass.getConstructor().newInstance(); } catch (ClassNotFoundException e) { LOGGER.error("ExternalParser " + trim + " for type or feature " + fullClassName + " returns a ClassNotFoundException", e); throw new CollectionException(e); + } catch (Exception e) { + LOGGER.error("Could not create instance of class {}: ", externalParserClass, e); + throw new CollectionException(e); } - this.parser = (TypeParser) constructor().in(externalParserClass).newInstance(); }else{ this.parser = null; } diff --git a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/DocumentTextHandler.java b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/DocumentTextHandler.java index 02218ee8b..4ef868e6f 100644 --- a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/DocumentTextHandler.java +++ b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/DocumentTextHandler.java @@ -25,9 +25,6 @@ import java.util.List; import java.util.Map; import 
java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.atomic.AtomicInteger; - -import static org.fest.reflect.core.Reflection.constructor; /** * Handels to parse the DocumentText @@ -128,16 +125,19 @@ public void setXPathForPartOfDocumentText(int id, String xpath) { public void setExternalParserForPartOfDocument(int id, String externalParserClassName) throws CollectionException { if (externalParserClassName != null) { - Class externalParserClass; + Class externalParserClass = null; + DocumentTextPartParser parser; try { externalParserClass = Class.forName(externalParserClassName.trim()); + parser = (DocumentTextPartParser) externalParserClass.getConstructor().newInstance(); } catch (ClassNotFoundException e) { LOGGER.error("ExternalParser " + externalParserClassName + " for document text part " + id + " returns a ClassNotFoundException", e); throw new CollectionException(e); + } catch (Exception e) { + LOGGER.error("Could not create instance of {}: ", externalParserClass, e); + throw new CollectionException(e); } - DocumentTextPartParser parser = (DocumentTextPartParser) constructor().in(externalParserClass).newInstance(); this.docTextData.get(id).setParser(parser); } } - } diff --git a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/typeBuilder/StandardTypeBuilder.java b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/typeBuilder/StandardTypeBuilder.java index 9fa1c46b2..a010092c1 100644 --- a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/typeBuilder/StandardTypeBuilder.java +++ b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/typeBuilder/StandardTypeBuilder.java @@ -27,8 +27,6 @@ import java.util.HashMap; -import static org.fest.reflect.core.Reflection.method; - /** * In this class, the actual UIMA types are built from the templates which have * been filled with values by the type parsers before. 
The standard type builder @@ -150,8 +148,7 @@ private Annotation buildSingleInstance(ConcreteType concreteType, JCas jcas) thr // itself. if (standardJavaTypesMap.get(concreteFeature.getFullClassName()) != null) { featureClass = standardJavaTypesMap.get(concreteFeature.getFullClassName()); - method(methodName).withParameterTypes(featureClass).in(type) - .invoke(parseValueStringToValueType(concreteFeature.getValue(), concreteFeature.getFullClassName())); + type.getClass().getMethod(methodName, featureClass).invoke(type, parseValueStringToValueType(concreteFeature.getValue(), concreteFeature.getFullClassName())); } else if (concreteFeature.getFullClassName().equals("String") || concreteFeature.getFullClassName().equals("java.lang.String")) { featureClass = Class.forName(concreteFeature.getFullClassName()); typeClass.getMethod(methodName, featureClass).invoke(type, concreteFeature.getValue()); @@ -163,7 +160,7 @@ private Annotation buildSingleInstance(ConcreteType concreteType, JCas jcas) thr + "\" the feature value class (e.g. String, Integer, another type...) 
was not defined in the mapping file."); featureClass = Class.forName(featureClassName); TOP top = concreteFeature.getTypeTemplate().getParser().getTypeBuilder().buildType(concreteFeature, jcas); - method(methodName).withParameterTypes(featureClass).in(type).invoke(top); + type.getClass().getMethod(methodName, featureClass).invoke(type, top); } } catch (Throwable e) { LOGGER.error("Wrong Feature Type: " + concreteFeature.getFullClassName(), e); diff --git a/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/EncodingTest.java b/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/EncodingTest.java index 23a256259..9d61cd532 100644 --- a/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/EncodingTest.java +++ b/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/EncodingTest.java @@ -13,9 +13,9 @@ import com.ximpleware.AutoPilot; import com.ximpleware.VTDGen; import com.ximpleware.VTDNav; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; public class EncodingTest { @Test diff --git a/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLMapperTest.java b/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLMapperTest.java index 8b3efcb59..a3e682208 100644 --- a/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLMapperTest.java +++ b/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLMapperTest.java @@ -33,13 +33,13 @@ import org.apache.uima.resource.metadata.ExternalResourceBinding; import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.HashMap; import java.util.Map; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; /** * TODO insert description diff --git 
a/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLReaderTest.java b/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLReaderTest.java index 1ecb95ccd..ef926761a 100644 --- a/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLReaderTest.java +++ b/jcore-xml-mapper/src/test/java/de/julielab/jcore/reader/xmlmapper/XMLReaderTest.java @@ -6,8 +6,8 @@ package de.julielab.jcore.reader.xmlmapper; -import de.julielab.jcore.types.*; import de.julielab.jcore.types.Date; +import de.julielab.jcore.types.*; import de.julielab.jcore.types.pubmed.Header; import de.julielab.jcore.types.pubmed.ManualDescriptor; import org.apache.uima.UIMAException; @@ -29,7 +29,7 @@ import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; -import org.junit.Test; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; @@ -38,10 +38,10 @@ import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; -import java.util.*; import java.util.List; +import java.util.*; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; /** * Test for class MedlineReader @@ -183,7 +183,7 @@ public void testSingleEntityData() throws Throwable { if (DEBUG_MODE) { serializeCas(cas); } - assertTrue("test documenttext", cas.getDocumentText() != null && cas.getDocumentText().length() > 0); + assertTrue(cas.getDocumentText() != null && cas.getDocumentText().length() > 0); assertEquals( "Mitigation of graft-versus-host disease in rats treated with allogeneic and xenogeneic antilymphocytic sera.\nThis is a very short test abstract.", cas.getDocumentText()); @@ -191,7 +191,7 @@ public void testSingleEntityData() throws Throwable { int counter = 0; String[] types = new String[] { ":::diso:2,3", ":::spe", ":::pgn" }; String[] texts = new String[] { 
"graft-versus-host disease", "rats", "sera" }; - assertTrue("No entity mentions found in the CAS", iter.hasNext()); + assertTrue(iter.hasNext(), "No entity mentions found in the CAS"); while (iter.hasNext()) { EntityMention text = (EntityMention) iter.next(); String coveredText = text.getCoveredText(); @@ -398,8 +398,8 @@ public void testMissingInputDirectory() { medlineReader = getCollectionReader(DESC_XML_READER_MISSING_INPUT_DIR); fail("Expected exception was not thrown"); } catch (Exception e) { - assertTrue("Exception should be an instance of ResourceInitializationException , but was " - + e.getClass().getName(), e instanceof ResourceInitializationException); + assertTrue(e instanceof ResourceInitializationException, "Exception should be an instance of ResourceInitializationException , but was " + + e.getClass().getName()); } } @@ -497,71 +497,71 @@ private void checkElements() { String pmid = getPMID(cas); if (pmid.equals("11119751")) { checkCount++; - assertTrue("Invalid keyWordList", checkKeywords(cas, EXPECTED_KEYWORDS)); - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Invalid DBInfoList", ckeckDBInfos(cas, EXPECTED_DB_INFO)); - assertTrue("Invalid MeshHeading", checkMeshHeadings(cas, EXPECTED_MESH_HEADINGS)); - assertTrue("Invalid GeneSymbol", checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS)); - assertTrue("Invalid Chemical", checkChemicals(cas, EXPECTED_CHEMICALS)); - assertTrue("Invalid Header", checkHeader(cas, EXPECTED_HEADER)); - assertTrue("Invalid ManualDescriptor", checkManualDescriptor(cas)); - assertTrue("Invalid Journal", ckeckJournal(cas, EXPECTED_JOURNAL)); - assertTrue("Invalid DocumentText", checkDocumentText(cas, EXPECTED_DOCUMENT_TEXT)); - assertTrue("Invalid AbstractText", checkAbstractText(cas, EXPECTED_ABSTRACT_TEXT)); - assertTrue("Invalid Title", checkTitle(cas, EXPECTED_TITLE)); - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(checkKeywords(cas, EXPECTED_KEYWORDS), "Invalid 
keyWordList"); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(ckeckDBInfos(cas, EXPECTED_DB_INFO), "Invalid DBInfoList"); + assertTrue(checkMeshHeadings(cas, EXPECTED_MESH_HEADINGS), "Invalid MeshHeading"); + assertTrue(checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS), "Invalid GeneSymbol"); + assertTrue(checkChemicals(cas, EXPECTED_CHEMICALS), "Invalid Chemical"); + assertTrue(checkHeader(cas, EXPECTED_HEADER), "Invalid Header"); + assertTrue(checkManualDescriptor(cas), "Invalid ManualDescriptor"); + assertTrue(ckeckJournal(cas, EXPECTED_JOURNAL), "Invalid Journal"); + assertTrue(checkDocumentText(cas, EXPECTED_DOCUMENT_TEXT), "Invalid DocumentText"); + assertTrue(checkAbstractText(cas, EXPECTED_ABSTRACT_TEXT), "Invalid AbstractText"); + assertTrue(checkTitle(cas, EXPECTED_TITLE), "Invalid Title"); + assertTrue(!checkSentences(cas), "Sentences Found"); } // check medline XML without most lists (gene, keywords,...) if (pmid.equals("11119751-a")) { checkCount++; - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(!checkSentences(cas), "Sentences Found"); } // check medline XML with pub date: 2000 // Spring-Summer if (pmid.equals("11119751-b")) { checkCount++; - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Invalid GeneSymbol", checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS)); - assertTrue("Invalid Journal", ckeckJournal(cas, EXPECTED_JOURNAL)); - assertTrue("Invalid PubDate", checkPubDate(cas, EXPECTED_DATE_1)); - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS), "Invalid GeneSymbol"); + assertTrue(ckeckJournal(cas, EXPECTED_JOURNAL), "Invalid Journal"); + assertTrue(checkPubDate(cas, EXPECTED_DATE_1), "Invalid 
PubDate"); + assertTrue(!checkSentences(cas), "Sentences Found"); } // check medline XML with pub date: 2000 Dec // 23-30 if (pmid.equals("11119751-c")) { checkCount++; - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Invalid GeneSymbol", checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS)); - assertTrue("Invalid Journal", ckeckJournal(cas, EXPECTED_JOURNAL)); - assertTrue("Invalid PubDate", checkPubDate(cas, EXPECTED_DATE_2)); - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS), "Invalid GeneSymbol"); + assertTrue(ckeckJournal(cas, EXPECTED_JOURNAL), "Invalid Journal"); + assertTrue(checkPubDate(cas, EXPECTED_DATE_2), "Invalid PubDate"); + assertTrue(!checkSentences(cas), "Sentences Found"); } // check medline XML pub date: 2000 Oct-2001 // Mar if (pmid.equals("11119751-d")) { - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(!checkSentences(cas), "Sentences Found"); checkCount++; - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Invalid GeneSymbol", checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS)); - assertTrue("Invalid Journal", ckeckJournal(cas, EXPECTED_JOURNAL)); - assertTrue("Invalid PubDate", checkPubDate(cas, EXPECTED_DATE_3)); - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS), "Invalid GeneSymbol"); + assertTrue(ckeckJournal(cas, EXPECTED_JOURNAL), "Invalid Journal"); + assertTrue(checkPubDate(cas, EXPECTED_DATE_3), "Invalid PubDate"); + assertTrue(!checkSentences(cas), "Sentences Found"); } if (pmid.equals("8045680")) { checkCount++; - assertTrue("No Sentences Found", checkSentences(cas)); + assertTrue(checkSentences(cas), "No Sentences Found"); // assertTrue("Invalid Header", checkHeader(cas, // 
EXPECTED_HEADER_OTHER_LANGUAGE)); } if (pmid.equals("12626969")) { checkCount++; - assertTrue("No Sentences Found", checkSentences(cas)); + assertTrue(checkSentences(cas), "No Sentences Found"); // assertTrue("Invalid Header", checkHeader(cas, // EXPECTED_HEADER_OTHER_LANGUAGE)); } if (pmid.equals("11119751-e")) { - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(!checkSentences(cas), "Sentences Found"); checkCount++; // assertTrue("Invalid Header", checkHeader(cas, // EXPECTED_HEADER_OTHER_LANGUAGE)); @@ -569,25 +569,25 @@ private void checkElements() { // test the case that only a title is found and no abstractText // (documentText should be equal to title in this case) if (pmid.equals("17276851")) { - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(!checkSentences(cas), "Sentences Found"); checkCount++; - assertTrue("Invalid Document Title", checkTitle(cas, EXPECTED_TITLE_2)); - assertTrue("Invalid Document Text", checkDocumentText(cas, EXPECTED_TITLE_2)); + assertTrue(checkTitle(cas, EXPECTED_TITLE_2), "Invalid Document Title"); + assertTrue(checkDocumentText(cas, EXPECTED_TITLE_2), "Invalid Document Text"); } // PubMed has changed the XML element ForeName to FirstName, but // foreName should still // be supported if (pmid.equals("18439884")) { - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(!checkSentences(cas), "Sentences Found"); checkCount++; - assertTrue("Invalid foreName", checkForeNames(cas, EXPECTED_FORE_NAMES)); + assertTrue(checkForeNames(cas, EXPECTED_FORE_NAMES), "Invalid foreName"); checkJournalTitle(cas, EXPECTED_JOURNAL_TITLE); } if (pmid.equals("17306504")) { - assertTrue("Sentences Found", !checkSentences(cas)); + assertTrue(!checkSentences(cas), "Sentences Found"); checkCount++; - assertTrue("Invalid pubTypeList", checkPubTypeList(cas, EXPECTED_PUBTYPES)); - assertTrue("Invalid DOI", checkDoi(cas, EXPECTED_DOI)); + assertTrue(checkPubTypeList(cas, EXPECTED_PUBTYPES), "Invalid 
pubTypeList"); + assertTrue(checkDoi(cas, EXPECTED_DOI), "Invalid DOI"); } } assertEquals(11, checkCount); @@ -668,7 +668,7 @@ private boolean checkAbstractText(CAS cas, String abstractTextString2) { * * @param cas * The CAS - * @param title + * @param expectedTitle * The correct title * @return true if the correct title is contained in the CAS */ @@ -1006,9 +1006,9 @@ private boolean checkSentences(CAS cas) { int count = 0; while (iter.hasNext()) { Sentence s = (Sentence) iter.next(); - assertTrue("Sentence has an ID", s.getId() != null); - assertTrue("Sentence has an Begin", s.getBegin() >= 0); - assertTrue("Sentence has an End", s.getEnd() >= 0); + assertTrue(s.getId() != null, "Sentence has an ID"); + assertTrue(s.getBegin() >= 0, "Sentence has an Begin"); + assertTrue(s.getEnd() >= 0, "Sentence has an End"); count++; } if (count == 0) @@ -1052,7 +1052,7 @@ private boolean checkAuthors(CAS cas, String[][] authors) { * foreName, but both should be supported) * * @param cas - * @param foreName + * @param foreNames * @return */ private boolean checkForeNames(CAS cas, String[] foreNames) { diff --git a/jcore-xml-reader/pom.xml b/jcore-xml-reader/pom.xml index 1719c5c73..7d2ec2b1f 100644 --- a/jcore-xml-reader/pom.xml +++ b/jcore-xml-reader/pom.xml @@ -137,8 +137,8 @@ assertj-core - junit - junit + org.junit.jupiter + junit-jupiter-engine diff --git a/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLMultiplierReaderTest.java b/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLMultiplierReaderTest.java index 67faae92f..875be49ce 100644 --- a/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLMultiplierReaderTest.java +++ b/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLMultiplierReaderTest.java @@ -21,7 +21,6 @@ import de.julielab.jcore.types.Journal; import de.julielab.jcore.types.casmultiplier.JCoReURI; import de.julielab.jcore.types.pubmed.Header; -import junit.framework.TestCase; import org.apache.uima.UIMAException; import 
org.apache.uima.UIMAFramework; import org.apache.uima.cas.CAS; @@ -37,6 +36,7 @@ import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -45,10 +45,13 @@ import java.util.*; import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + /** * Test for class XML Reader */ -public class XMLMultiplierReaderTest extends TestCase { +public class XMLMultiplierReaderTest { private static final Logger LOGGER = LoggerFactory.getLogger(XMLMultiplierReaderTest.class); @@ -80,6 +83,7 @@ public XMLMultiplierReaderTest() { } } + @Test public void testZipInput() throws UIMAException, IOException { JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.casmultiplier.jcore-uri-multiplier-types", "org.apache.uima.ducc.FlowControllerTS"); @@ -97,10 +101,10 @@ public void testZipInput() throws UIMAException, IOException { String fileName = it.next(); if (jCoReURI.getUri().endsWith(fileName)) { found = true; - assertTrue("File name " + fileName + " was already found", foundFileNames.add(fileName)); + assertTrue(foundFileNames.add(fileName), "File name " + fileName + " was already found"); } } - assertTrue("The URI " + jCoReURI.getUri()+ " was not matched by any expected file names", found); + assertTrue(found, "The URI " + jCoReURI.getUri()+ " was not matched by any expected file names"); jCas.reset(); } assertThat(expectedFileNames).isEqualTo(foundFileNames); @@ -111,6 +115,7 @@ public void testZipInput() throws UIMAException, IOException { * * @throws ResourceInitializationException */ + @Test public void testGetNextCas_singleFile() throws Exception { xmlMultiplierReader = CollectionReaderFactory.createReader(DESC_XML_MULTIPLIER_READER_DIR, XMLMultiplierReader.PARAM_INPUT_FILE, 
"src/test/resources/pubmedXML/pubmedsample18n0001.xml.gz"); @@ -125,6 +130,7 @@ public void testGetNextCas_singleFile() throws Exception { } + @Test public void testGetNextCas_directory() throws Exception { xmlMultiplierReader = CollectionReaderFactory.createReader(DESC_XML_MULTIPLIER_READER_DIR, XMLMultiplierReader.PARAM_INPUT_DIR, "src/test/resources/pubmedXML/"); diff --git a/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLMultiplierTest.java b/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLMultiplierTest.java index c3913c702..c757166ba 100644 --- a/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLMultiplierTest.java +++ b/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLMultiplierTest.java @@ -10,12 +10,12 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; public class XMLMultiplierTest { diff --git a/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLReaderTest.java b/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLReaderTest.java index d1b67539f..cf54882b1 100644 --- a/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLReaderTest.java +++ b/jcore-xml-reader/src/test/java/de/julielab/jcore/reader/XMLReaderTest.java @@ -18,11 +18,10 @@ package de.julielab.jcore.reader; import de.julielab.jcore.reader.xml.XMLReader; -import de.julielab.jcore.types.*; import de.julielab.jcore.types.Date; +import de.julielab.jcore.types.*; import de.julielab.jcore.types.pubmed.Header; import de.julielab.jcore.types.pubmed.ManualDescriptor; -import junit.framework.TestCase; import org.apache.commons.lang3.ArrayUtils; import 
org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.metadata.AnalysisEngineMetaData; @@ -49,13 +48,16 @@ import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; -import java.util.*; import java.util.List; +import java.util.*; + +import static org.assertj.core.api.Fail.fail; +import static org.junit.jupiter.api.Assertions.*; /** * Test for class XML Reader */ -public class XMLReaderTest extends TestCase { +public class XMLReaderTest { private static final Logger LOGGER = LoggerFactory.getLogger(XMLReaderTest.class); @@ -228,7 +230,7 @@ public void testGetNextCas_singleFile() throws ResourceInitializationException { LOGGER.error(e.getMessage(), e); e.printStackTrace(); } - assertEquals("reading single file", EXPECTED_DOCUMENT_TEXT, cas.getDocumentText()); + assertEquals( EXPECTED_DOCUMENT_TEXT, cas.getDocumentText(), "reading single file"); } /** @@ -239,8 +241,8 @@ public void testMissingInputDirectory() { medlineReader = getCollectionReader(DESC_MEDLINE_READER_MISSING_INPUT_DIR); fail("Expected exception was not thrown"); } catch (Exception e) { - assertTrue("Exception should be an instance of ResourceInitializationException , but was " - + e.getClass().getName(), e instanceof ResourceInitializationException); + assertTrue(e instanceof ResourceInitializationException, "Exception should be an instance of ResourceInitializationException , but was " + + e.getClass().getName()); } } @@ -332,25 +334,25 @@ private void checkElements() { // check medline XML with all items if (getPMID(cas).equals("11119751")) { checkCount++; - assertTrue("Invalid keyWordList", checkKeywords(cas, EXPECTED_KEYWORDS)); - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Invalid DBInfoList", ckeckDBInfos(cas, EXPECTED_DB_INFO)); - assertTrue("Invalid MeshHeading", checkMeshHeadings(cas, EXPECTED_MESH_HEADINGS)); - assertTrue("Invalid GeneSymbol", checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS)); - 
assertTrue("Invalid Chemical", checkChemicals(cas, EXPECTED_CHEMICALS)); - assertTrue("Invalid Header in document " + getPMID(cas), checkHeader(cas, EXPECTED_HEADER)); - assertTrue("Invalid ManualDescriptor", checkManualDescriptor(cas)); - assertTrue("Invalid Journal", ckeckJournal(cas, EXPECTED_JOURNAL)); - assertTrue("Invalid DocumentText in document " + getPMID(cas), - checkDocumentText(cas, EXPECTED_DOCUMENT_TEXT)); - assertTrue("Invalid AbstractText", checkAbstractText(cas, EXPECTED_ABSTRACT_TEXT)); - assertTrue("Invalid Title", checkTitle(cas, EXPECTED_TITLE)); + assertTrue(checkKeywords(cas, EXPECTED_KEYWORDS), "Invalid keyWordList"); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(ckeckDBInfos(cas, EXPECTED_DB_INFO), "Invalid DBInfoList"); + assertTrue(checkMeshHeadings(cas, EXPECTED_MESH_HEADINGS), "Invalid MeshHeading"); + assertTrue(checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS), "Invalid GeneSymbol"); + assertTrue(checkChemicals(cas, EXPECTED_CHEMICALS), "Invalid Chemical"); + assertTrue(checkHeader(cas, EXPECTED_HEADER), "Invalid Header in document " + getPMID(cas)); + assertTrue(checkManualDescriptor(cas), "Invalid ManualDescriptor"); + assertTrue(ckeckJournal(cas, EXPECTED_JOURNAL), "Invalid Journal"); + assertTrue(checkDocumentText(cas, EXPECTED_DOCUMENT_TEXT), + "Invalid DocumentText in document " + getPMID(cas)); + assertTrue(checkAbstractText(cas, EXPECTED_ABSTRACT_TEXT), "Invalid AbstractText"); + assertTrue(checkTitle(cas, EXPECTED_TITLE), "Invalid Title"); } // check medline XML without most lists (gene, keywords,...) 
if (getPMID(cas).equals("11119751-a")) { checkCount++; - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); } @@ -358,30 +360,30 @@ private void checkElements() { // Spring-Summer if (getPMID(cas).equals("11119751-b")) { checkCount++; - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Invalid GeneSymbol", checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS)); - assertTrue("Invalid Journal in document " + getPMID(cas), ckeckJournal(cas, EXPECTED_JOURNAL)); - assertTrue("Invalid PubDate", checkPubDate(cas, EXPECTED_DATE_1)); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS), "Invalid GeneSymbol"); + assertTrue(ckeckJournal(cas, EXPECTED_JOURNAL), "Invalid Journal in document " + getPMID(cas)); + assertTrue(checkPubDate(cas, EXPECTED_DATE_1), "Invalid PubDate"); } // check medline XML with pub date: 2000 Dec // 23-30 if (getPMID(cas).equals("11119751-c")) { checkCount++; - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Invalid GeneSymbol", checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS)); - assertTrue("Invalid Journal", ckeckJournal(cas, EXPECTED_JOURNAL)); - assertTrue("Invalid PubDate", checkPubDate(cas, EXPECTED_DATE_2)); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS), "Invalid GeneSymbol"); + assertTrue(ckeckJournal(cas, EXPECTED_JOURNAL), "Invalid Journal"); + assertTrue(checkPubDate(cas, EXPECTED_DATE_2), "Invalid PubDate"); } // check medline XML pub date: 2000 Oct-2001 // Mar if (getPMID(cas).equals("11119751-d")) { checkCount++; - assertTrue("Invalid Authors", checkAuthors(cas, EXPECTED_AUTHORS)); - assertTrue("Invalid GeneSymbol", checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS)); - assertTrue("Invalid Journal", ckeckJournal(cas, EXPECTED_JOURNAL)); - 
assertTrue("Invalid PubDate", checkPubDate(cas, EXPECTED_DATE_3)); + assertTrue(checkAuthors(cas, EXPECTED_AUTHORS), "Invalid Authors"); + assertTrue(checkGeneSymbols(cas, EXPECTED_GENE_SYMBOLS), "Invalid GeneSymbol"); + assertTrue(ckeckJournal(cas, EXPECTED_JOURNAL), "Invalid Journal"); + assertTrue(checkPubDate(cas, EXPECTED_DATE_3), "Invalid PubDate"); } if (getPMID(cas).equals("11119751-e")) { @@ -394,22 +396,22 @@ private void checkElements() { // (documentText should be equal to title in this case) if (getPMID(cas).equals("17276851")) { checkCount++; - assertTrue("Invalid Document Title", checkTitle(cas, EXPECTED_TITLE_2)); - assertTrue("Invalid Document Text", checkDocumentText(cas, EXPECTED_TITLE_2)); + assertTrue(checkTitle(cas, EXPECTED_TITLE_2), "Invalid Document Title"); + assertTrue(checkDocumentText(cas, EXPECTED_TITLE_2), "Invalid Document Text"); } // PubMed has changed the XML element ForeName to FirstName, but // foreName should still be supported if (getPMID(cas).equals("18439884")) { checkCount++; - assertTrue("Invalid foreName", checkForeNames(cas, EXPECTED_FORE_NAMES)); + assertTrue(checkForeNames(cas, EXPECTED_FORE_NAMES), "Invalid foreName"); checkJournalTitle(cas, EXPECTED_JOURNAL_TITLE); } if (getPMID(cas).equals("17306504")) { checkCount++; - assertTrue("Invalid pubTypeList", checkPubTypeList(cas, EXPECTED_PUBTYPES)); - assertTrue("Invalid DOI in document " + getPMID(cas), checkDoi(cas, EXPECTED_DOI)); + assertTrue(checkPubTypeList(cas, EXPECTED_PUBTYPES), "Invalid pubTypeList"); + assertTrue(checkDoi(cas, EXPECTED_DOI), "Invalid DOI in document " + getPMID(cas)); } } assertEquals(9, checkCount); @@ -491,7 +493,7 @@ private boolean checkAbstractText(CAS cas, String abstractTextString2) { * * @param cas * The CAS - * @param title + * @param expectedTitle * The correct title * @return true if the correct title is contained in the CAS */ @@ -896,7 +898,7 @@ private boolean checkAuthors(CAS cas, String[][] authors) { * Check if foreName was 
correctly parsed (PubMed changed firstName to foreName, but both should be supported) * * @param cas - * @param foreName + * @param foreNames * @return */ private boolean checkForeNames(CAS cas, String[] foreNames) { From b0ca7688df47a8d7328d7f20b3f4dd568c5d5395 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 8 Jun 2021 07:54:39 +0200 Subject: [PATCH 063/269] Fixed XMIDBReader tests which actually rely on TestNG for multiple, parallel execution of a single test. I don't know if and how this would work with Junit5. --- .../xmi/XmiDBMultiplierDifferentNsSchemaTest.java | 8 ++++---- .../julielab/jcore/reader/xmi/XmiDBMultiplierTest.java | 8 ++++---- .../jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java | 10 +++++----- .../reader/xmi/XmiDBReaderDifferentNsSchemaTest.java | 10 +++++----- .../jcore/reader/xmi/XmiDBReaderGzippedDataTest.java | 8 ++++---- .../reader/xmi/XmiDBReaderMonolithicDocumentsTest.java | 8 ++++---- .../de/julielab/jcore/reader/xmi/XmiDBReaderTest.java | 10 +++++----- .../consumer/xmi/XmiDBWriterBinaryFormatTest.java | 8 +++++--- .../xmi/XmiDBWriterMonolithicDocumentTest.java | 6 ++++-- .../julielab/jcore/consumer/xmi/XmiDBWriterTest.java | 6 ++++-- 10 files changed, 44 insertions(+), 38 deletions(-) diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierDifferentNsSchemaTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierDifferentNsSchemaTest.java index f7fa5f19a..ff60e41a0 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierDifferentNsSchemaTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierDifferentNsSchemaTest.java @@ -16,9 +16,9 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; import org.testcontainers.containers.PostgreSQLContainer; 
+import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.File; @@ -35,7 +35,7 @@ public class XmiDBMultiplierDifferentNsSchemaTest { private static String costosysConfig; private static int subsetCounter; - @BeforeAll + @BeforeClass public static void setup() throws UIMAException, IOException, ConfigurationException { postgres.start(); XmiDBSetupHelper.createDbcConfig(postgres); @@ -49,7 +49,7 @@ public static void setup() throws UIMAException, IOException, ConfigurationExcep subsetCounter = 0; } - @AfterAll + @AfterClass public static void shutdown() { postgres.close(); } diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java index c2398d503..2af097f43 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java @@ -16,9 +16,9 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; import org.testcontainers.containers.PostgreSQLContainer; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.File; @@ -35,7 +35,7 @@ public class XmiDBMultiplierTest { private static String costosysConfig; private static int subsetCounter; - @BeforeAll + @BeforeClass public static void setup() throws UIMAException, IOException, ConfigurationException { postgres.start(); XmiDBSetupHelper.createDbcConfig(postgres); @@ -49,7 +49,7 @@ public static void setup() throws UIMAException, IOException, ConfigurationExcep subsetCounter = 0; } - @AfterAll + @AfterClass public static void shutdown() { postgres.close(); } 
diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java index 51d66d493..d2fc88444 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java @@ -12,9 +12,9 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; import org.testcontainers.containers.PostgreSQLContainer; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.IOException; @@ -30,7 +30,7 @@ public class XmiDBReaderBinaryFormatTest { private static String costosysConfig; private static String xmisubset; - @BeforeAll + @BeforeClass public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { postgres.start(); XmiDBSetupHelper.createDbcConfig(postgres); @@ -38,7 +38,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); costosysConfig = DBTestUtils.createTestCostosysConfig("xmi_text", 2, postgres); XmiDBSetupHelper.processAndSplitData(costosysConfig, false, true,"public"); - assertTrue("The data document table exists", dbc.withConnectionQueryBoolean(c -> c.tableExists("_data.documents"))); + assertTrue(dbc.withConnectionQueryBoolean(c -> c.tableExists("_data.documents")), "The data document table exists"); xmisubset = "xmisubset"; dbc.setActiveTableSchema("xmi_text"); dbc.reserveConnection(); @@ -48,7 +48,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf } - @AfterAll + @AfterClass public static void 
shutdown() { postgres.close(); } diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderDifferentNsSchemaTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderDifferentNsSchemaTest.java index 39b2639f0..d592bec9e 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderDifferentNsSchemaTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderDifferentNsSchemaTest.java @@ -12,9 +12,9 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; import org.testcontainers.containers.PostgreSQLContainer; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.IOException; @@ -30,7 +30,7 @@ public class XmiDBReaderDifferentNsSchemaTest { private static String costosysConfig; private static String xmisubset; - @BeforeAll + @BeforeClass public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { postgres.start(); XmiDBSetupHelper.createDbcConfig(postgres); @@ -38,7 +38,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); costosysConfig = DBTestUtils.createTestCostosysConfig("xmi_text", 2, postgres); XmiDBSetupHelper.processAndSplitData(costosysConfig, false, false, "someotherschema"); - assertTrue("The data document table exists", dbc.withConnectionQueryBoolean(c -> c.tableExists("_data.documents"))); + assertTrue(dbc.withConnectionQueryBoolean(c -> c.tableExists("_data.documents")), "The data document table exists"); xmisubset = "xmisubset"; dbc.setActiveTableSchema("xmi_text"); dbc.reserveConnection(); @@ -48,7 +48,7 @@ public static void setup() throws SQLException, UIMAException, 
IOException, Conf } - @AfterAll + @AfterClass public static void shutdown() { postgres.close(); } diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderGzippedDataTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderGzippedDataTest.java index 018170026..9a7fea0b3 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderGzippedDataTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderGzippedDataTest.java @@ -12,9 +12,9 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; import org.testcontainers.containers.PostgreSQLContainer; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.File; @@ -35,7 +35,7 @@ public class XmiDBReaderGzippedDataTest { private static String costosysConfig; private static String xmisubset; - @BeforeAll + @BeforeClass public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { postgres.start(); XmiDBSetupHelper.createDbcConfig(postgres); @@ -53,7 +53,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf dbc.close(); } - @AfterAll + @AfterClass public static void shutdown() { postgres.close(); } diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderMonolithicDocumentsTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderMonolithicDocumentsTest.java index decd4e840..e0ae7f3ed 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderMonolithicDocumentsTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderMonolithicDocumentsTest.java @@ -12,9 +12,9 @@ import 
org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; import org.testcontainers.containers.PostgreSQLContainer; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.File; @@ -32,7 +32,7 @@ public class XmiDBReaderMonolithicDocumentsTest { private static String costosysConfig; private static String xmisubset; - @BeforeAll + @BeforeClass public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { postgres.start(); XmiDBSetupHelper.createDbcConfig(postgres); @@ -50,7 +50,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf dbc.close(); } - @AfterAll + @AfterClass public static void shutdown() { postgres.close(); } diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderTest.java index 1f8150274..cf1d089ef 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderTest.java @@ -14,9 +14,9 @@ import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; import org.testcontainers.containers.PostgreSQLContainer; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.IOException; @@ -32,7 +32,7 @@ public class XmiDBReaderTest { private static String costosysConfig; private static String xmisubset; - @BeforeAll + @BeforeClass public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { 
postgres.start(); XmiDBSetupHelper.createDbcConfig(postgres); @@ -40,7 +40,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); costosysConfig = DBTestUtils.createTestCostosysConfig("xmi_text", 2, postgres); XmiDBSetupHelper.processAndSplitData(costosysConfig, false, false,"public"); - assertTrue("The data document table exists", dbc.withConnectionQueryBoolean(c -> c.tableExists("_data.documents"))); + assertTrue(dbc.withConnectionQueryBoolean(c -> c.tableExists("_data.documents")), "The data document table exists"); xmisubset = "xmisubset"; dbc.setActiveTableSchema("xmi_text"); dbc.reserveConnection(); @@ -50,7 +50,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf } - @AfterAll + @AfterClass public static void shutdown() { postgres.close(); } diff --git a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterBinaryFormatTest.java b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterBinaryFormatTest.java index 4df9efaaa..15b5fc5c9 100644 --- a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterBinaryFormatTest.java +++ b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterBinaryFormatTest.java @@ -15,12 +15,13 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.StringArray; -import org.junit.ClassRule; -import org.junit.Test; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -38,8 +39,9 @@ import static 
org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; +@Testcontainers public class XmiDBWriterBinaryFormatTest { - @ClassRule + @Container public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); private static String costosysConfig; private static String xmlSubsetTable; diff --git a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterMonolithicDocumentTest.java b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterMonolithicDocumentTest.java index 84e35a027..6af2d578d 100644 --- a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterMonolithicDocumentTest.java +++ b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterMonolithicDocumentTest.java @@ -14,11 +14,12 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.ClassRule; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; import java.io.ByteArrayInputStream; import java.sql.ResultSet; @@ -27,8 +28,9 @@ import static org.assertj.core.api.Assertions.assertThatCode; import static org.junit.jupiter.api.Assertions.assertTrue; +@Testcontainers public class XmiDBWriterMonolithicDocumentTest { - @ClassRule + @Container public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); private static String costosysConfig; private static DataBaseConnector dbc; diff --git a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java index 00230cda6..5f3a979bb 100644 --- 
a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java +++ b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java @@ -11,11 +11,12 @@ import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.junit.ClassRule; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; import java.io.IOException; import java.sql.SQLException; @@ -27,8 +28,9 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatCode; +@Testcontainers public class XmiDBWriterTest { - @ClassRule + @Container public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); private static String costosysConfig; private static String xmlSubsetTable; From 930850c85973f9affad3a2a23fe36be0fc6f1283 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 8 Jun 2021 08:52:30 +0200 Subject: [PATCH 064/269] Added advanced logic to find the fitting Python executable for the flair components. 
--- .../jcore/ae/flairner/FlairNerAnnotator.java | 33 +++++++++++++- .../ae/fte/FlairTokenEmbeddingAnnotator.java | 45 +++++++++++++++---- .../fte/FlairTokenEmbeddingAnnotatorTest.java | 7 ++- 3 files changed, 71 insertions(+), 14 deletions(-) diff --git a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java index 4aea01797..8ce44a6f5 100644 --- a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java +++ b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java @@ -1,5 +1,6 @@ package de.julielab.jcore.ae.flairner; +import de.julielab.java.utilities.IOStreamUtilities; import de.julielab.jcore.ae.annotationadder.AnnotationAdderAnnotator; import de.julielab.jcore.ae.annotationadder.AnnotationAdderConfiguration; import de.julielab.jcore.ae.annotationadder.AnnotationAdderHelper; @@ -30,6 +31,8 @@ import java.io.IOException; import java.lang.reflect.InvocationTargetException; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.StreamSupport; @@ -95,9 +98,35 @@ public void initialize(final UimaContext aContext) throws ResourceInitialization pythonExecutable = pythonExecutableOpt.get(); log.info("Python executable: {} (from descriptor)", pythonExecutable); } + List pythonCommands = List.of("python3", "python3.6", "python36", "python3.7", "python37", "python"); + for (int i = 0; i < pythonCommands.size() && pythonExecutable == null; i++) { + String currentPythonExecutable = pythonCommands.get(i); + log.debug("Trying Python executable: {}", currentPythonExecutable); + try { + try { + Process exec = new ProcessBuilder(List.of(currentPythonExecutable, "--version")).redirectErrorStream(true).start(); + List pythonOutput = IOStreamUtilities.getLinesFromInputStream(exec.getInputStream()); + int exitCode = 
exec.waitFor(); + if (exitCode == 0 && !pythonOutput.isEmpty()) { + String versionLine = pythonOutput.get(0); + Matcher m = Pattern.compile("3\\..*$").matcher(versionLine); + if (m.find()) { + pythonExecutable = currentPythonExecutable; + log.info("Found Python {} with command {}.", m.group(), pythonExecutable); + } + } + } catch (IOException e) { + log.trace("Python command {} does not exist. Trying the next.", currentPythonExecutable); + } + } catch (InterruptedException e) { + log.error("Error why trying to call python.", e); + throw new ResourceInitializationException(e); + } + } if (pythonExecutable == null) { - pythonExecutable = "python"; - log.info("Python executable: {} (default)", pythonExecutable); + String msg = String.format("Could not find Python 3.x installation. The following commands were tried: %s. Please make Python 3.x available under one of those commands or specify the Python executable explicitly in the component descriptor.", String.join(", ", pythonCommands)); + log.error(msg); + throw new ResourceInitializationException(new IllegalArgumentException(msg)); } try { connector = new StdioPythonConnector(flairModel, pythonExecutable, storeEmbeddings, gpuNum); diff --git a/jcore-flair-token-embedding-ae/src/main/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotator.java b/jcore-flair-token-embedding-ae/src/main/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotator.java index d41381bc9..a268d48fd 100644 --- a/jcore-flair-token-embedding-ae/src/main/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotator.java +++ b/jcore-flair-token-embedding-ae/src/main/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotator.java @@ -4,6 +4,7 @@ import de.julielab.ipc.javabridge.Options; import de.julielab.ipc.javabridge.ResultDecoders; import de.julielab.ipc.javabridge.StdioBridge; +import de.julielab.java.utilities.IOStreamUtilities; import de.julielab.jcore.types.EmbeddingVector; import de.julielab.jcore.types.Sentence; import 
de.julielab.jcore.types.Token; @@ -30,6 +31,8 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; @ResourceMetaData(name = "JCoRe Flair Token Embedding Annotator", description = "Adds the Flair compatible embedding vectors to the token annotations.") @TypeCapability(inputs = {"de.julielab.jcore.types.Sentence", "de.julielab.jcore.types.Token"}, outputs = {"de.julielab.jcore.types.EmbeddingVector"}) @@ -37,7 +40,7 @@ public class FlairTokenEmbeddingAnnotator extends JCasAnnotator_ImplBase { public static final String PARAM_EMBEDDING_PATH = "EmbeddingPath"; public static final String PARAM_COMPUTATION_FILTER = "ComputationFilter"; - public static final String PARAM_EMBEDDING_SOURCE = "EmbeddingSource"; + public static final String PARAM_EMBEDDING_SOURCE = "EmbeddingSource"; public static final String PARAM_PYTHON_EXECUTABLE = "PythonExecutable"; private final static Logger log = LoggerFactory.getLogger(FlairTokenEmbeddingAnnotator.class); /** @@ -48,9 +51,9 @@ public class FlairTokenEmbeddingAnnotator extends JCasAnnotator_ImplBase { private String embeddingPath; @ConfigurationParameter(name = PARAM_COMPUTATION_FILTER, mandatory = false, description = "This parameter may be set to a fully qualified annotation type. If given, only for documents containing at least one annotation of this type embeddings will be retrieved from the computing flair python script. However, for contextualized embeddings, all embedding vectors are computed anyway and the the I/O cost is minor in comparison to the embedding computation. Thus, setting this parameter will most probably only result in small time savings.") private String computationFilter; - @ConfigurationParameter(name=PARAM_EMBEDDING_SOURCE, mandatory = false, description = "The value of this parameter will be set to the source feature of the EmbeddingVector annotation instance created on the tokens. 
If left blank, the value of the " + PARAM_EMBEDDING_PATH + " will be used.") + @ConfigurationParameter(name = PARAM_EMBEDDING_SOURCE, mandatory = false, description = "The value of this parameter will be set to the source feature of the EmbeddingVector annotation instance created on the tokens. If left blank, the value of the " + PARAM_EMBEDDING_PATH + " will be used.") private String embeddingSource; - @ConfigurationParameter(name=PARAM_PYTHON_EXECUTABLE, mandatory = false, description = "The path to the python executable. Required is a python verion >=3.6.") + @ConfigurationParameter(name = PARAM_PYTHON_EXECUTABLE, mandatory = false, description = "The path to the python executable. Required is a python version >=3.6.") private String pythonExecutable; private StdioBridge flairBridge; private Gson gson; @@ -68,9 +71,9 @@ public void initialize(final UimaContext aContext) throws ResourceInitialization computationFilter = (String) aContext.getConfigParameterValue(PARAM_COMPUTATION_FILTER); embeddingSource = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_EMBEDDING_SOURCE)).orElse(embeddingPath); - Optional pythonExecutableOpt = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_PYTHON_EXECUTABLE)); + Optional pythonExecutableOpt = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_PYTHON_EXECUTABLE)); if (!pythonExecutableOpt.isPresent()) { - log.debug("No python executable given in the component descriptor, trying to read PYTHON environment variable." 
); + log.debug("No Python executable given in the component descriptor, trying to read PYTHON environment variable."); final String pythonExecutableEnv = System.getenv("PYTHON"); if (pythonExecutableEnv != null) { pythonExecutable = pythonExecutableEnv; @@ -80,9 +83,35 @@ public void initialize(final UimaContext aContext) throws ResourceInitialization pythonExecutable = pythonExecutableOpt.get(); log.info("Python executable: {} (from descriptor)", pythonExecutable); } + List pythonCommands = List.of("python3", "python3.6", "python36", "python3.7", "python37", "python"); + for (int i = 0; i < pythonCommands.size() && pythonExecutable == null; i++) { + String currentPythonExecutable = pythonCommands.get(i); + log.debug("Trying Python executable: {}", currentPythonExecutable); + try { + try { + Process exec = new ProcessBuilder(List.of(currentPythonExecutable, "--version")).redirectErrorStream(true).start(); + List pythonOutput = IOStreamUtilities.getLinesFromInputStream(exec.getInputStream()); + int exitCode = exec.waitFor(); + if (exitCode == 0 && !pythonOutput.isEmpty()) { + String versionLine = pythonOutput.get(0); + Matcher m = Pattern.compile("3\\..*$").matcher(versionLine); + if (m.find()) { + pythonExecutable = currentPythonExecutable; + log.info("Found Python {} with command {}.", m.group(), pythonExecutable); + } + } + } catch (IOException e) { + log.trace("Python command {} does not exist. Trying the next.", currentPythonExecutable); + } + } catch (InterruptedException e) { + log.error("Error why trying to call python.", e); + throw new ResourceInitializationException(e); + } + } if (pythonExecutable == null) { - pythonExecutable = "python3.6"; - log.info("Python executable: {} (default)", pythonExecutable); + String msg = String.format("Could not find Python 3.x installation. The following commands were tried: %s. 
Please make Python 3.x available under one of those commands or specify the Python executable explicitly in the component descriptor.", String.join(", ", pythonCommands)); + log.error(msg); + throw new ResourceInitializationException(new IllegalArgumentException(msg)); } try { @@ -183,7 +212,7 @@ private String constructEmbeddingRequest(JCas aJCas, List tokenToAddEmbed } ++tokenIndex; } - sentenceTextSb.deleteCharAt(sentenceTextSb.length()-1); + sentenceTextSb.deleteCharAt(sentenceTextSb.length() - 1); Map sentenceAndIndices = new HashMap<>(); sentenceAndIndices.put("sentence", sentenceTextSb.toString()); sentenceAndIndices.put("tokenIndicesToReturn", tokenIndicesToSet); diff --git a/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotatorTest.java b/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotatorTest.java index d67615d3e..f6ef8acce 100644 --- a/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotatorTest.java +++ b/jcore-flair-token-embedding-ae/src/test/java/de/julielab/jcore/ae/fte/FlairTokenEmbeddingAnnotatorTest.java @@ -18,6 +18,7 @@ * Unit tests for jcore-flair-token-embedding-ae. 
*/ public class FlairTokenEmbeddingAnnotatorTest { + @Test public void testEmbeddingAnnotator() throws Exception { final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types"); @@ -30,8 +31,7 @@ public void testEmbeddingAnnotator() throws Exception { final String embeddingPath = "flair:src/test/resources/gene_small_best_lm.pt"; final AnalysisEngine engine = AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.fte.desc.jcore-flair-token-embedding-ae", - FlairTokenEmbeddingAnnotator.PARAM_EMBEDDING_PATH, embeddingPath, - FlairTokenEmbeddingAnnotator.PARAM_PYTHON_EXECUTABLE, "python"); + FlairTokenEmbeddingAnnotator.PARAM_EMBEDDING_PATH, embeddingPath); engine.process(jCas); @@ -62,8 +62,7 @@ public void testEmbeddingAnnotatorWithFilterAnnotation() throws Exception { final String embeddingPath = "flair:src/test/resources/gene_small_best_lm.pt"; final AnalysisEngine engine = AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.fte.desc.jcore-flair-token-embedding-ae", FlairTokenEmbeddingAnnotator.PARAM_EMBEDDING_PATH, embeddingPath, - FlairTokenEmbeddingAnnotator.PARAM_COMPUTATION_FILTER, "de.julielab.jcore.types.Gene", - FlairTokenEmbeddingAnnotator.PARAM_PYTHON_EXECUTABLE, "python"); + FlairTokenEmbeddingAnnotator.PARAM_COMPUTATION_FILTER, "de.julielab.jcore.types.Gene"); engine.process(jCas); From b14b0725a9a9a3c779a433b02105c1c65e718c86 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 8 Jun 2021 10:16:02 +0200 Subject: [PATCH 065/269] Documentation updates. Travis CI is working. Finishing the current line of work. Fixes #120,#119,#118. --- README.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9035ccbb1..79c1fbc99 100644 --- a/README.md +++ b/README.md @@ -12,24 +12,29 @@ In order to automate the builds of complex NLP pipelines and properly represent A description for each individual component can be found in their respective `README.md`. 
### Requirements & Dependencies -In order to use our components you need at least [JDK 11](https://www.oracle.com/technetwork/java/javase/downloads/jdk11-downloads-5066655.html) (Java SE Development Kit 11), [UIMA 2.10](https://uima.apache.org/index.html) & [Maven 3](https://maven.apache.org/). We develop with the [Eclipse IDE for Java Developers](http://www.eclipse.org/downloads/) and [IntelliJ IDEA](https://www.jetbrains.com/idea/) Java IDEs. If course you're free to try it with different versions or tools than those mentioned, but we can't make promises for a flawless functioning of our components in these cases. +In order to use our components you need at least [JDK 11](https://www.oracle.com/technetwork/java/javase/downloads/jdk11-downloads-5066655.html) (Java SE Development Kit 11), [UIMA 2.x](https://uima.apache.org/index.html) & [Maven 3](https://maven.apache.org/). We develop with the [Eclipse IDE for Java Developers](http://www.eclipse.org/downloads/) and [IntelliJ IDEA](https://www.jetbrains.com/idea/) Java IDEs. If course you're free to try it with different versions or tools than those mentioned, but we can't make promises for a flawless functioning of our components in these cases. ### UIMA's Collection Processing Engine (CPE) -UIMA features a relatively easy way to combine UIMA components together in order to analyze a collection of artifacts. If you're not firm or willing to deal with Java Code, the usage of a CPE might be the right choice. +UIMA offers a relatively easy way to combine UIMA components together in order to analyze a collection of artifacts. If you're not firm or willing to deal with Java Code, the usage of a CPE might be the right choice. For more detailed information see [UIMA's CPE Documentation](https://uima.apache.org/downloads/releaseDocs/2.1.0-incubating/docs/html/tutorials_and_users_guides/tutorials_and_users_guides.html#ugr.tug.cpe). 
-We're also working on a simple [Python script](https://github.com/JULIELab/jcore-misc/tree/master/jcore-cpe-builder) that builds rudimentary and preconfigured CPEs of your choice. It's working but still work in progress so please bear with us and post issues. +A newer alternative is [UIMA AS](https://uima.apache.org/doc-uimaas-what.html). It is today's officially recommended way to use and scale UIMA pipelines. Our existing CPE infrastructure serves us well, however, so we mostly stick to those for the time being. + +### JCoRe UIMA Pipeline Builder + +Most CPE configurations consisting of JCoRe components can be easily built using the [JCoRe UIMA Pipeline Builder](https://github.com/JULIELab/jcore-pipeline-modules). +This is a Java program that offers a simple command line interface for the creation of CPEs. There is also support for UIMA AS. ### Maven Artifacts If not stated otherwise, all the components found in this project are at least in their latest release version also available as Maven artifacts: ``` de.julielab - #COMPONENT-NAME + COMPONENT-NAME ${jcore-version} ``` -Where `#COMPONENT-NAME` is exactly the same as the name on GitHub. +Where `COMPONENT-NAME` is exactly the same as the name on GitHub. For instance, to get the Acronym Resolver, include this in your Maven dependencies: ``` From 2317cfa0011351f23ea6df834b068daf7fe95b93 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 8 Jun 2021 15:17:41 +0200 Subject: [PATCH 066/269] Adding trace-level logging to the `AnnotationDefinedFlow` in order to see the flow when actually running all components together. 
--- .../AnnotationDefinedFlow.java | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlow.java b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlow.java index 0243a7f36..e93616aab 100644 --- a/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlow.java +++ b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlow.java @@ -1,14 +1,18 @@ package de.julielab.jcore.flow.annotationdefined; import de.julielab.jcore.types.casflow.ToVisit; +import de.julielab.jcore.utility.JCoReTools; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.analysis_engine.metadata.FixedFlow; import org.apache.uima.analysis_engine.metadata.FlowConstraints; +import org.apache.uima.cas.CASException; import org.apache.uima.flow.FinalStep; import org.apache.uima.flow.JCasFlow_ImplBase; import org.apache.uima.flow.SimpleStep; import org.apache.uima.flow.Step; import org.jetbrains.annotations.Nullable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** *

Returns steps according to an existing {@link ToVisit} annotation of the CAS or, if not present, the default aggregate flow.

@@ -16,6 +20,7 @@ * the currently read document already exists in the database.

*/ public class AnnotationDefinedFlow extends JCasFlow_ImplBase { + private final static Logger log = LoggerFactory.getLogger(AnnotationDefinedFlow.class); private String[] toVisitKeys; private String[] fixedFlow; private int currentPos; @@ -24,7 +29,8 @@ public class AnnotationDefinedFlow extends JCasFlow_ImplBase { *

Creates a flow that follows the entries in {@link ToVisit#getDelegateKeys()} of toVisit or, if * toVisit is null, falls back to the default fixed flow.

*

If toVisit is not null but the delegateKeys are null or empty, no component in the aggregate using this flow will process the respective CAS.

- * @param toVisit An annotation containing the keys of the delegate AEs to visit. May be null which case the default fixed flow will be used. + * + * @param toVisit An annotation containing the keys of the delegate AEs to visit. May be null which case the default fixed flow will be used. * @param flowConstraints The default fixed flow of the aggregate analysis engine. * @throws AnalysisEngineProcessException If flowConstraints is not a fixed flow. */ @@ -36,7 +42,21 @@ public AnnotationDefinedFlow(@Nullable ToVisit toVisit, FlowConstraints flowCons // 1. There are given keys to visit, use them. // 2. There are no keys given but the ToVisit annotation is not null, skip all components. // 3. There is not ToVisit annotation at all, use the default fixed flow. - if(toVisit != null && toVisit.getDelegateKeys() != null) + if (log.isTraceEnabled()) { + try { + String docId = JCoReTools.getDocId(toVisit.getCAS().getJCas()); + if (toVisit != null) { + String[] delegateKeys = toVisit.getDelegateKeys() != null ? toVisit.getDelegateKeys().toArray() : null; + log.trace("Found ToVisit annotation for document {} with the following component keys: {}", docId, delegateKeys); + } else { + log.trace("Got no ToVisit annotation for document {}.", docId); + } + } catch (CASException e) { + log.error("Could not retrieve JCas from ToVisit annotation.", e); + throw new AnalysisEngineProcessException(e); + } + } + if (toVisit != null && toVisit.getDelegateKeys() != null) toVisitKeys = toVisit.getDelegateKeys().toArray(); else if (toVisit != null) toVisitKeys = new String[0]; From 362255c1c75e7d8c01d1036020a5aacc06c6ab6a Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 10 Jun 2021 17:48:20 +0200 Subject: [PATCH 067/269] Created the a project for an integration test with XML to XMI database writing with hash comparison and flow control. Fixed a few bugs on the way, now everything is working fine. 
--- .../jcore/ae/checkpoint/DBCheckpointAE.java | 1 + .../jcore/reader/db/DBMultiplier.java | 4 + .../jcore/reader/db/DBMultiplierReader.java | 5 + .../jcore/reader/db/DBReaderBase.java | 2 + .../AnnotationDefinedFlow.java | 37 +- .../AnnotationDefinedFlowController.java | 2 +- jcore-jedis-integration-tests/pom.xml | 64 +++ .../UpdateWithHashComparison.java | 257 ++++++++++ .../src/test/resources/logback-test.xml | 19 + .../src/test/resources/medlineMappingFile.xml | 457 ++++++++++++++++++ .../pubmed21n1016_excerpt_original.xml.gz | Bin 0 -> 3038 bytes ...ed21n1016_excerpt_partially_changed.xml.gz | Bin 0 -> 3075 bytes .../src/test/resources/pubmedMappingFile.xml | 436 +++++++++++++++++ .../jcore/consumer/xmi/XMIDBWriter.java | 17 +- .../jcore/reader/xml/XMLDBMultiplier.java | 8 +- .../jcore/reader/xml/XMLDBMultiplierTest.java | 2 +- pom.xml | 3 +- 17 files changed, 1285 insertions(+), 29 deletions(-) create mode 100644 jcore-jedis-integration-tests/pom.xml create mode 100644 jcore-jedis-integration-tests/src/test/java/de/julielab/jcore/jedis/integrationtests/UpdateWithHashComparison.java create mode 100644 jcore-jedis-integration-tests/src/test/resources/logback-test.xml create mode 100644 jcore-jedis-integration-tests/src/test/resources/medlineMappingFile.xml create mode 100644 jcore-jedis-integration-tests/src/test/resources/pubmed21n1016_excerpt_original.xml.gz create mode 100644 jcore-jedis-integration-tests/src/test/resources/pubmed21n1016_excerpt_partially_changed.xml.gz create mode 100644 jcore-jedis-integration-tests/src/test/resources/pubmedMappingFile.xml diff --git a/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DBCheckpointAE.java b/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DBCheckpointAE.java index 1a70c23cd..264c32999 100644 --- a/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DBCheckpointAE.java +++ 
b/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DBCheckpointAE.java @@ -69,6 +69,7 @@ public class DBCheckpointAE extends JCasAnnotator_ImplBase { */ @Override public void initialize(final UimaContext aContext) throws ResourceInitializationException { + super.initialize(aContext); componentDbName = (String) aContext.getConfigParameterValue(PARAM_CHECKPOINT_NAME); dbcConfigPath = (String) aContext.getConfigParameterValue(PARAM_COSTOSYS_CONFIG); indicateFinished = Optional.ofNullable((Boolean) aContext.getConfigParameterValue(PARAM_INDICATE_FINISHED)).orElse(false); diff --git a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java index 17040c15e..c83fcaebb 100644 --- a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java +++ b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java @@ -72,6 +72,10 @@ private DataBaseConnector getDataBaseConnector(String costosysConfig) throws Ana @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { RowBatch rowbatch = JCasUtil.selectSingle(aJCas, RowBatch.class); + if (rowbatch.getIdentifiers() == null) + throw new AnalysisEngineProcessException(new IllegalArgumentException("The identifiers of the passed row batch are null.")); + if (rowbatch.getIdentifiers().size() == 0) + throw new AnalysisEngineProcessException(new IllegalArgumentException("The identifiers of the passed row batch are empty.")); tables = rowbatch.getTables().toStringArray(); schemaNames = rowbatch.getTableSchemas().toStringArray(); tableName = rowbatch.getTableName(); diff --git a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplierReader.java b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplierReader.java index 83370feae..bfe474de8 100644 --- a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplierReader.java +++ 
b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplierReader.java @@ -49,6 +49,9 @@ public class DBMultiplierReader extends DBSubsetReader { @Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); + // reset the state in case of reconfigure() + retriever = null; + dataTableDocumentIds = null; // Check whether a subset table name or a data table name was given. if (readDataTable) { @@ -65,6 +68,8 @@ public void initialize(UimaContext context) throws ResourceInitializationExcepti public void getNext(JCas jCas) throws CollectionException { log.trace("Requesting next batch of document IDs from the database."); List idList = getNextDocumentIdBatch(); + if (idList.isEmpty()) + throw new CollectionException(new IllegalStateException("There are no documents to read in the database. Please call hasNext() to check if there is more data to read.")); log.trace("Received a list of {} ID from the database.", idList.size()); RowBatch rowbatch = new RowBatch(jCas); FSArray ids = new FSArray(jCas, idList.size()); diff --git a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBReaderBase.java b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBReaderBase.java index 082909cb5..c46d6a105 100644 --- a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBReaderBase.java +++ b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBReaderBase.java @@ -95,6 +95,8 @@ public void initialize(UimaContext context) throws ResourceInitializationExcepti } catch (FileNotFoundException e) { throw new ResourceInitializationException(e); } + + numberFetchedDocIDs = 0; } private void checkTableExists() throws ResourceInitializationException { diff --git a/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlow.java b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlow.java index e93616aab..c945ef0eb 
100644 --- a/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlow.java +++ b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlow.java @@ -5,15 +5,19 @@ import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.analysis_engine.metadata.FixedFlow; import org.apache.uima.analysis_engine.metadata.FlowConstraints; -import org.apache.uima.cas.CASException; import org.apache.uima.flow.FinalStep; import org.apache.uima.flow.JCasFlow_ImplBase; import org.apache.uima.flow.SimpleStep; import org.apache.uima.flow.Step; +import org.apache.uima.jcas.JCas; import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.Arrays; +import java.util.Set; +import java.util.stream.Collectors; + /** *

Returns steps according to an existing {@link ToVisit} annotation of the CAS or, if not present, the default aggregate flow.

*

This is, for example, used by the XMLDBMultiplier to let CASes skip large parts of the pipeline when @@ -24,6 +28,7 @@ public class AnnotationDefinedFlow extends JCasFlow_ImplBase { private String[] toVisitKeys; private String[] fixedFlow; private int currentPos; + private String docId; /** *

Creates a flow that follows the entries in {@link ToVisit#getDelegateKeys()} of toVisit or, if @@ -32,9 +37,10 @@ public class AnnotationDefinedFlow extends JCasFlow_ImplBase { * * @param toVisit An annotation containing the keys of the delegate AEs to visit. May be null, in which case the default fixed flow will be used. * @param flowConstraints The default fixed flow of the aggregate analysis engine. + * @param jCas The CAS this flow is computed for; used to look up the document ID for trace logging. * @throws AnalysisEngineProcessException If flowConstraints is not a fixed flow. */ - public AnnotationDefinedFlow(@Nullable ToVisit toVisit, FlowConstraints flowConstraints) throws AnalysisEngineProcessException { + public AnnotationDefinedFlow(@Nullable ToVisit toVisit, FlowConstraints flowConstraints, JCas jCas) throws AnalysisEngineProcessException { if (!(flowConstraints instanceof FixedFlow)) throw new AnalysisEngineProcessException(new IllegalArgumentException("This flow requires the FixedFlow to determine the default processing order. However, the flow constraints are of type " + flowConstraints.getClass().getCanonicalName())); this.fixedFlow = ((FixedFlow) flowConstraints).getFixedFlow(); @@ -43,22 +49,19 @@ public AnnotationDefinedFlow(@Nullable ToVisit toVisit, FlowConstraints flowCons // 2. There are no keys given but the ToVisit annotation is not null, skip all components. // 3. There is no ToVisit annotation at all, use the default fixed flow. if (log.isTraceEnabled()) { - try { - String docId = JCoReTools.getDocId(toVisit.getCAS().getJCas()); - if (toVisit != null) { - String[] delegateKeys = toVisit.getDelegateKeys() != null ? 
toVisit.getDelegateKeys().toArray() : null; - log.trace("Found ToVisit annotation for document {} with the following component keys: {}", docId, delegateKeys); - } else { - log.trace("Got no ToVisit annotation for document {}.", docId); - } - } catch (CASException e) { - log.error("Could not retrieve JCas from ToVisit annotation.", e); - throw new AnalysisEngineProcessException(e); + docId = JCoReTools.getDocId(jCas); + if (toVisit != null) { + String[] delegateKeys = toVisit.getDelegateKeys() != null ? toVisit.getDelegateKeys().toArray() : null; + log.trace("Found ToVisit annotation for document {} with the following component keys: {}", docId, delegateKeys); + } else { + log.trace("Got no ToVisit annotation for document {}, the CAS is routed through the aggregate in the default order.", docId); } } - if (toVisit != null && toVisit.getDelegateKeys() != null) - toVisitKeys = toVisit.getDelegateKeys().toArray(); - else if (toVisit != null) + if (toVisit != null && toVisit.getDelegateKeys() != null) { + // filter for delegates actually contained in the current AAE. + Set knownKeys = Arrays.stream(this.fixedFlow).collect(Collectors.toSet()); + toVisitKeys = Arrays.stream(toVisit.getDelegateKeys().toArray()).filter(knownKeys::contains).toArray(String[]::new); + } else if (toVisit != null) toVisitKeys = new String[0]; else toVisitKeys = null; @@ -77,8 +80,10 @@ public Step next() { if ((toVisitKeys == null && currentPos < fixedFlow.length) || (toVisitKeys != null && currentPos < toVisitKeys.length)) { String nextAEKey = toVisitKeys != null ? 
toVisitKeys[currentPos] : fixedFlow[currentPos]; ++currentPos; + log.trace("Next component key to visit for document {}: {}", docId, nextAEKey); return new SimpleStep(nextAEKey); } + log.trace("Flow finished for document {}.", docId); return new FinalStep(); } } diff --git a/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowController.java b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowController.java index 77a803e23..4158059a3 100644 --- a/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowController.java +++ b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowController.java @@ -20,6 +20,6 @@ public Flow computeFlow(JCas jCas) throws AnalysisEngineProcessException { boolean exists = JCasUtil.exists(jCas, ToVisit.class); ToVisit toVisit = exists ? JCasUtil.selectSingle(jCas, ToVisit.class) : null; // When toVisit is null, the default, fixed flow is used. 
- return new AnnotationDefinedFlow(toVisit, getContext().getAggregateMetadata().getFlowConstraints()); + return new AnnotationDefinedFlow(toVisit, getContext().getAggregateMetadata().getFlowConstraints(), jCas); } } diff --git a/jcore-jedis-integration-tests/pom.xml b/jcore-jedis-integration-tests/pom.xml new file mode 100644 index 000000000..2bcc39022 --- /dev/null +++ b/jcore-jedis-integration-tests/pom.xml @@ -0,0 +1,64 @@ + + + + jedis-parent + de.julielab + 2.6.0-SNAPSHOT + ../jedis-parent + + 4.0.0 + + jcore-jedis-integration-tests + + + + de.julielab + jcore-xml-db-reader + ${project.parent.version} + + + de.julielab + jcore-xmi-db-writer + ${project.parent.version} + + + de.julielab + jcore-db-checkpoint-ae + ${project.parent.version} + + + de.julielab + jcore-flow-controllers + ${project.parent.version} + + + de.julielab + jcore-types + ${jcore-types-version} + test + + + de.julielab + costosys + + + ch.qos.logback + logback-classic + + + org.assertj + assertj-core + + + de.julielab + jcore-db-test-utilities + + + org.junit.jupiter + junit-jupiter-engine + + + + \ No newline at end of file diff --git a/jcore-jedis-integration-tests/src/test/java/de/julielab/jcore/jedis/integrationtests/UpdateWithHashComparison.java b/jcore-jedis-integration-tests/src/test/java/de/julielab/jcore/jedis/integrationtests/UpdateWithHashComparison.java new file mode 100644 index 000000000..52754055b --- /dev/null +++ b/jcore-jedis-integration-tests/src/test/java/de/julielab/jcore/jedis/integrationtests/UpdateWithHashComparison.java @@ -0,0 +1,257 @@ +package de.julielab.jcore.jedis.integrationtests; + +import de.julielab.costosys.dbconnection.DataBaseConnector; +import de.julielab.costosys.dbconnection.SubsetStatus; +import de.julielab.jcore.ae.checkpoint.DBCheckpointAE; +import de.julielab.jcore.consumer.xmi.XMIDBWriter; +import de.julielab.jcore.db.test.DBTestUtils; +import de.julielab.jcore.flow.annotationdefined.AnnotationDefinedFlowController; +import 
de.julielab.jcore.reader.db.DBMultiplierReader; +import de.julielab.jcore.reader.xml.XMLDBMultiplier; +import de.julielab.jcore.types.Annotation; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.factory.*; +import org.apache.uima.flow.FlowControllerDescription; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.metadata.TypeSystemDescription; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import java.io.File; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +@Testcontainers +public class UpdateWithHashComparison { + private static final String SOURCE_XML_TABLE = "_data.source_xml_table"; + private static final String TARGET_XMI_TABLE = "_data_xmi.target_xmi_table"; + private static final String XML_SUBSET_TABLE = "test_subset"; + private static final String XMI_MIRROR_TABLE = "test_xmi_mirror"; + @Container + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:11.12"); + private static String costosysConfigSourceTable; + private static String costosysConfigTargetTable; + /** + * The collection reader that feeds the XMLDBMultiplier the database rows to read. 
+ */ + private static CollectionReader testCr; + /** + * The top-level aggregate containing the XMLDBMultiplier and two "child" aggregates, one for the analysis engines + * and one for the CAS consumers. In this test, the aggregate delegates are all realized by instances of {@link TestAnnotator}. + */ + private static AnalysisEngine testAggregate; + private static JCas cas; + private static DataBaseConnector dbc; + private static List namesOfRunComponents = new ArrayList<>(); + + @BeforeAll + public static void setup() throws Exception { + DBTestUtils.createAndSetHiddenConfig(Path.of("src", "test", "resources", "hiddenConfig").toString(), postgres); + + dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setActiveTableSchema("medline_2017"); + costosysConfigSourceTable = DBTestUtils.createTestCostosysConfig("medline_2017", 1, postgres); + costosysConfigTargetTable = DBTestUtils.createTestCostosysConfig("xmi_text", 1, postgres); + new File(costosysConfigSourceTable).deleteOnExit(); + new File(costosysConfigTargetTable).deleteOnExit(); + prepareSourceXMLTable(dbc); + dbc.defineMirrorSubset(XML_SUBSET_TABLE, SOURCE_XML_TABLE, true, "Test subset"); + assertThat(dbc.getNumRows(SOURCE_XML_TABLE)).isEqualTo(3); + createTestPipelineComponents(); + } + + @AfterAll + public static void shutdown() { + dbc.close(); + } + + private static void prepareSourceXMLTable(DataBaseConnector dbc) throws Exception { + dbc.createTable(SOURCE_XML_TABLE, "Test XML Table"); + dbc.importFromXMLFile(Path.of("src", "test", "resources", "pubmed21n1016_excerpt_original.xml.gz").toString(), SOURCE_XML_TABLE); + } + + /** + *

Creates test components in a structure that mimics the structure used by the jcore-pipeline-builder.

+ *

This consists of: + *

    + *
  1. a CollectionReader
  2. + *
  3. an AAE containing all other components: + *
      + *
    1. an optional CAS multiplier
    2. + *
    3. an aggregate containing all AEs
    4. + *
    5. an aggregate containing all CAS consumers
    6. + *
    + *
  4. + * The CAS consumers in this test consist of two "mock" CCs, a "real" XMI Writer and DB Checkpoint AE. + *
+ * We here want to test if we can successfully route the CAS through those inner AAEs when the multiplier adds + * the correct {@link de.julielab.jcore.types.casflow.ToVisit} annotation using a {@link de.julielab.jcore.flow.annotationdefined.AnnotationDefinedFlowController}. + *

+ */ + private static void createTestPipelineComponents() throws Exception { + TypeSystemDescription tsDesc = TypeSystemDescriptionFactory.createTypeSystemDescription("de.julielab.jcore.types.jcore-document-meta-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-casflow-types", "de.julielab.jcore.types.jcore-xmi-splitter-types"); + + testCr = CollectionReaderFactory.createReader(DBMultiplierReader.class, + tsDesc, + DBMultiplierReader.PARAM_TABLE, XML_SUBSET_TABLE, + DBMultiplierReader.PARAM_RESET_TABLE, false, + DBMultiplierReader.PARAM_COSTOSYS_CONFIG_NAME, costosysConfigSourceTable, + // We set a batch size of 1 to have more refined testing. + // Otherwise, the multiplier would receive all 3 test documents at once and + // would process them all in one batch + DBMultiplierReader.PARAM_BATCH_SIZE, 1 + ); + + AnalysisEngineDescription testAe1 = AnalysisEngineFactory.createEngineDescription(TestAnnotator.class, tsDesc, "name", "TestAE 1"); + AnalysisEngineDescription testAe2 = AnalysisEngineFactory.createEngineDescription(TestAnnotator.class, tsDesc, "name", "TestAE 2"); + AnalysisEngineDescription testCc1 = AnalysisEngineFactory.createEngineDescription(TestAnnotator.class, tsDesc, "name", "TestCC 1"); + AnalysisEngineDescription testCc2 = AnalysisEngineFactory.createEngineDescription(TestAnnotator.class, tsDesc, "name", "TestCC 2"); + AnalysisEngineDescription xmiDbWriter = AnalysisEngineFactory.createEngineDescription(XMIDBWriter.class, + XMIDBWriter.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE, + XMIDBWriter.PARAM_ANNOS_TO_STORE, new String[]{"de.julielab.jcore.types.Annotation"}, + XMIDBWriter.PARAM_STORE_ALL, false, + XMIDBWriter.PARAM_STORE_BASE_DOCUMENT, true, + XMIDBWriter.PARAM_STORE_RECURSIVELY, false, + XMIDBWriter.PARAM_ADD_SHA_HASH, "document_text", + XMIDBWriter.PARAM_COSTOSYS_CONFIG, costosysConfigTargetTable, + 
XMIDBWriter.PARAM_UPDATE_MODE, true, + XMIDBWriter.PARAM_DO_GZIP, false + ); + AnalysisEngineDescription dbCheckpointAe = AnalysisEngineFactory.createEngineDescription(DBCheckpointAE.class, + DBCheckpointAE.PARAM_CHECKPOINT_NAME, "end", + DBCheckpointAE.PARAM_COSTOSYS_CONFIG, costosysConfigSourceTable, + DBCheckpointAE.PARAM_INDICATE_FINISHED, true + ); + + FlowControllerDescription flowControllerDescription = FlowControllerFactory.createFlowControllerDescription(AnnotationDefinedFlowController.class); + AnalysisEngineDescription aeAaeDesc = AnalysisEngineFactory.createEngineDescription(List.of(testAe1, testAe2), List.of("TestAE 1", "TestAE 2"), null, null, flowControllerDescription); + AnalysisEngineDescription ccAaeDesc = AnalysisEngineFactory.createEngineDescription(List.of(testCc1, testCc2, xmiDbWriter, dbCheckpointAe), List.of("TestCC 1", "TestCC 2", "XMI Writer", "Checkpoint Writer"), null, null, flowControllerDescription); + + AnalysisEngineDescription multiplierDescription = AnalysisEngineFactory.createEngineDescription(XMLDBMultiplier.class, + tsDesc, + XMLDBMultiplier.PARAM_MAPPING_FILE, Path.of("src", "test", "resources", "medlineMappingFile.xml").toString(), + // The core of this whole test: The components to be visited in case of matching hash codes. 
+ // We want to skip all components except the checkpoint writer that marks the document as + // "processed" in the XML subset table + XMLDBMultiplier.PARAM_TO_VISIT_KEYS, new String[]{"Checkpoint Writer"}, + // The next three parameters are required for the hash comparison + XMLDBMultiplier.PARAM_ADD_SHA_HASH, "document_text", + XMLDBMultiplier.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE, + XMLDBMultiplier.PARAM_TABLE_DOCUMENT_SCHEMA, "xmi_text"); + + testAggregate = AnalysisEngineFactory.createEngine(List.of(multiplierDescription, aeAaeDesc, ccAaeDesc), List.of("multiplier", "AeAAE", "CcAAE"), null, null); + + cas = JCasFactory.createJCas(tsDesc); + } + + @Test + public void testInitialProcessingProcessing() throws Exception { + assertThat(testCr.hasNext()); + while (testCr.hasNext()) { + testCr.getNext(cas.getCas()); + testAggregate.process(cas); + // Check that all components have been visited as expected from a normal, fixed flow + assertThat(namesOfRunComponents).containsExactly("TestAE 1", "TestAE 2", "TestCC 1", "TestCC 2"); + namesOfRunComponents.clear(); + cas.reset(); + } + testAggregate.collectionProcessComplete(); + assertThat(dbc.tableExists(TARGET_XMI_TABLE)); + // After this first processing, the XMI document table exists. We can now create a mirror on it. This is important + // because we want to see that the mirror is only reset for rows that have actually changed in subsequent tests. + dbc.defineMirrorSubset(XMI_MIRROR_TABLE, TARGET_XMI_TABLE, true, "The XMI test mirror table.", "xmi_text"); + // We mark the XMI mirror subset as completely processed. This simulates a state where the initial batch of + // documents has been completely processed, before the update comes in. + dbc.markAsProcessed(XMI_MIRROR_TABLE); + SubsetStatus status = dbc.status(XML_SUBSET_TABLE, EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED, DataBaseConnector.StatusElement.IN_PROCESS)); + // Check that all rows have been processed in the XML source subset table. 
+ assertThat(status.isProcessed).isEqualTo(3); + assertThat(status.inProcess).isEqualTo(0); + } + + /** + * Adds its name to {@link #namesOfRunComponents}. + */ + public static class TestAnnotator extends JCasAnnotator_ImplBase { + @ConfigurationParameter(name = "name") + private String name; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException { + super.initialize(aContext); + this.name = (String) aContext.getConfigParameterValue("name"); + } + + @Override + public void process(JCas jCas) { + namesOfRunComponents.add(name); + new Annotation(jCas).addToIndexes(); + } + } + + @Nested + class AfterInitialProcessing { + @Test + public void updateXML() throws Exception { + dbc.updateFromXML(Path.of("src", "test", "resources", "pubmed21n1016_excerpt_partially_changed.xml.gz").toString(), SOURCE_XML_TABLE, true); + // The update contains all three originally imported XML documents. Only that the second has not been changed. + // But the XML mirror should have been reset completely. + SubsetStatus status = dbc.status(XML_SUBSET_TABLE, EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED, DataBaseConnector.StatusElement.IN_PROCESS)); + // Check that the XML mirror subset has been reset due to the update + assertThat(status.isProcessed).isEqualTo(0); + assertThat(status.inProcess).isEqualTo(0); + } + + @Nested + class AfterUpdatingXML { + @Test + public void testOnlyNewDocumentsProcessed() throws Exception { + + testCr.reconfigure(); + testAggregate.reconfigure(); + assertThat(testCr.hasNext()).withFailMessage("The XML DB Collection reader does not report any non-processed rows.").isTrue(); + // Run the whole pipeline again. Only this time we only expect all the components run in a single case. 
+ List allNamesOfRunComponents = new ArrayList<>(); + while (testCr.hasNext()) { + cas.reset(); + testCr.getNext(cas.getCas()); + testAggregate.process(cas); + // Collect the names of the components that were run for this document + allNamesOfRunComponents.addAll(namesOfRunComponents); + namesOfRunComponents.clear(); + cas.reset(); + } + testAggregate.collectionProcessComplete(); + // There should be only two documents now that have visited all components + assertThat(allNamesOfRunComponents).containsExactly("TestAE 1", "TestAE 2", "TestCC 1", "TestCC 2", "TestAE 1", "TestAE 2", "TestCC 1", "TestCC 2"); + testAggregate.collectionProcessComplete(); + // Check again that all the XML documents have been processed. + SubsetStatus status = dbc.status(XML_SUBSET_TABLE, EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED)); + // Check that all rows have been processed in the XML source subset table. + assertThat(status.isProcessed).isEqualTo(3); + + // Now the more interesting part: In the XMI mirror there should now be two unprocessed rows, namely + // the two documents with a changed document text. The unchanged document should still be marked as + // processed. + SubsetStatus xmiMirrorStatus = dbc.status(XMI_MIRROR_TABLE, EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED)); + // Check that only the unchanged document is still marked as processed in the XMI mirror table. 
+ assertThat(xmiMirrorStatus.isProcessed).isEqualTo(1); + } + } + } +} diff --git a/jcore-jedis-integration-tests/src/test/resources/logback-test.xml b/jcore-jedis-integration-tests/src/test/resources/logback-test.xml new file mode 100644 index 000000000..e2ec34c57 --- /dev/null +++ b/jcore-jedis-integration-tests/src/test/resources/logback-test.xml @@ -0,0 +1,19 @@ + + + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + + + + + \ No newline at end of file diff --git a/jcore-jedis-integration-tests/src/test/resources/medlineMappingFile.xml b/jcore-jedis-integration-tests/src/test/resources/medlineMappingFile.xml new file mode 100644 index 000000000..cd9892953 --- /dev/null +++ b/jcore-jedis-integration-tests/src/test/resources/medlineMappingFile.xml @@ -0,0 +1,457 @@ + + + + /MedlineCitation/Article/ArticleTitle + + + /MedlineCitation/Article/Abstract + de.julielab.jcore.reader.xmlmapper.mapper.StructuredAbstractParser + + + + /MedlineCitation/OtherAbstract + + + /MedlineCitation/Article/VernacularTitle + + + + + de.julielab.jcore.types.Title + + + 0 + + + + titleType + java.lang.String + + document + + + + + + + de.julielab.jcore.types.pubmed.AbstractText + + + + 2 + + + + abstractType + java.lang.String + + other + + + + + + de.julielab.jcore.types.Title + + + 3 + + + + titleType + java.lang.String + + document_vernacular + + + + + + + de.julielab.jcore.types.pubmed.Header + + + + /MedlineCitation/ArticleIdList/ArticleId[@IdType="doi"] + + doi + java.lang.String + + + /MedlineCitation/PMID + docId + java.lang.String + + + /MedlineCitation/@Status + citationStatus + java.lang.String + + + + /MedlineCitation/Article/Language + + language + java.lang.String + + de + + + en + + + es + + + fr + + + it + + + pt + + + eng + + + ger + + + fre + + + ita + + + other + + + + source + java.lang.String + + de.julielab.jcore.reader.xmlmapper.typeParser.SourceParser + + + + authors + + org.apache.uima.jcas.cas.FSArray + + 
de.julielab.jcore.reader.xmlmapper.typeParser.FSArrayParser + + true + + authorInfo + + de.julielab.jcore.types.AuthorInfo + + true + + + /MedlineCitation/Article/AuthorList/Author[LastName] + + + foreName + java.lang.String + ForeName + + + foreName + java.lang.String + FirstName + + + lastName + java.lang.String + LastName + + + initials + java.lang.String + Initials + + + affiliation + java.lang.String + + AffiliationInfo/Affiliation + + + + + + + org.apache.uima.jcas.cas.FSArray + + pubTypeList + true + + + de.julielab.jcore.types.Journal + + + /MedlineCitation/Article/PublicationTypeList/PublicationType + + Journal + true + + java.lang.String + name + . + + + java.lang.String + ISSN + + /MedlineCitation/Article/Journal/ISSN + + + + java.lang.String + Volume + + /MedlineCitation/Article/Journal/JournalIssue/Volume + + + + java.lang.String + Issue + + /MedlineCitation/Article/Journal/JournalIssue/Issue + + + + java.lang.String + Title + + /MedlineCitation/Article/Journal/Title + + + + java.lang.String + ShortTitle + + /MedlineCitation/MedlineJournalInfo/MedlineTA + + + + java.lang.String + nlmId + + /MedlineCitation/MedlineJournalInfo/NlmUniqueID + + + + java.lang.String + Pages + + /MedlineCitation/Article/Pagination/MedlinePgn + + + + true + + de.julielab.jcore.types.Date + + PubDate + + de.julielab.jcore.reader.xmlmapper.typeParser.PubDateParser + + + /MedlineCitation/Article/Journal/JournalIssue/PubDate + + + int + month + + + int + year + + + int + day + + + + + + org.apache.uima.jcas.cas.FSArray + otherIDs + true + + de.julielab.jcore.types.pubmed.OtherID + + /MedlineCitation/OtherID + true + + id + java.lang.String + . 
+ + + source + java.lang.String + @Source + + + + + + + de.julielab.jcore.types.pubmed.ManualDescriptor + + + /MedlineCitation/GeneSymbolList + GeneSymbolList + true + + org.apache.uima.jcas.cas.StringArray + + + + KeywordList + true + + org.apache.uima.jcas.cas.FSArray + + + Keyword + true + + /MedlineCitation/KeywordList/Keyword + + + de.julielab.jcore.types.Keyword + + + Name + . + java.lang.String + + + + + ChemicalList + true + + org.apache.uima.jcas.cas.FSArray + + + Chemical + true + + /MedlineCitation/ChemicalList/Chemical + + + de.julielab.jcore.types.Chemical + + + RegistryNumber + RegistryNumber + java.lang.String + + + NameOfSubstance + NameOfSubstance + java.lang.String + + + + + DBInfoList + true + + org.apache.uima.jcas.cas.FSArray + + + DBInfo + true + + /MedlineCitation/DataBankList/DataBank + + + de.julielab.jcore.types.DBInfo + + + Name + DataBankName + java.lang.String + + + AcList + + AccessionNumberList + + + true + + org.apache.uima.jcas.cas.StringArray + + + + + + MeSHList + true + + org.apache.uima.jcas.cas.FSArray + + + meshHeading + true + + /MedlineCitation/MeshHeadingList/MeshHeading + + + de.julielab.jcore.types.MeshHeading + + + DescriptorName + java.lang.String + DescriptorName + + + + DescriptorNameMajorTopic + + DescriptorName/@MajorTopicYN + boolean + + Y + true + + + N + false + + + + QualifierName + java.lang.String + QualifierName + + + + QualifierNameMajorTopic + + QualifierName/@MajorTopicYN + boolean + + Y + true + + + N + false + + + + + + \ No newline at end of file diff --git a/jcore-jedis-integration-tests/src/test/resources/pubmed21n1016_excerpt_original.xml.gz b/jcore-jedis-integration-tests/src/test/resources/pubmed21n1016_excerpt_original.xml.gz new file mode 100644 index 0000000000000000000000000000000000000000..365b8d3e0e6dc6da6a0355a1a07f4791b1292c1b GIT binary patch literal 3038 zcmV<43nBC$iwFpSV!&Vk18{X>ZDnLKF>Wz1F*aXicw=R9aCBd9a%pF2ZeeULcx`L| z?OWS++cp+`&sSjdl&odlEcwz|vbr+1QoFGnckCv0UuHohB%&q(mH_0)U)7iSf%&r8 
z2P7rZmsn}4$?DWEH3b|ToI7xK(80GKlbFp#W~5dJjc%*cU_u2tlxlX+aCX|4HJ zQ@^OKoh?59)%~W^?Q~n9Zh-q;ro$JE)dy(=V`CA3%b4`_I+PSSuS1yW4|y{wE1o#pBfo#!fCS&e(}D>T;2agT^Qp zfz7m%L1WnMeA93CJNtWsHc{1<6|At(2?j5U5N?w$IcS_HD>5aj8~YXSb?g!`+y=Me zvp(-x4Yzl(ng6QKLMu|bKKdy?)(m?jf_s*&*mt{Evu|nT2lgDatBTiVE~Q0uzf0i- z!^Fol6UGeM`DoqX?_N(P8NBBspH{F@Nq%?o#l4LyLxsolEk@HRJSXR(nRrJugP4b* z2w5nF_o)on4WEmdlc6wdsxt;Z@knI$)=!w$pk3p*+UT6mq`D)}Du>T!YS`P~+wL~; zUrNBrct!q)m<&dzI`Cu2=Fv%*$6%;sgT3dm46n`(8dnsEt!{6(-+ni0_4c>B`$&_@ z!m33k#%4UQH8IPr7ERwT`iRn}Z5*2j8cqUko zxj^z5s~BgKO!JUsT+Ia2Q^wOYmbo+r1VQYm=vK@d=p=EfB-z5skY`&=CS;Ns<0@oE zD1+}7d#41u77NsFK8bTSL8Nr7XA34tz)(zR0g0alS}F3nRxNfJ(X`I4S(@oo8#viG zueue@7}3Re95)}1hEPo5u#ioltRmI(P-1x~U5ZT^@6DMQK43@Qb(1DGSFd$BW8$K0?OU)9gWFl{AW~L-O z!IfovVsw`J;{t9-gypgE##vlDWiu^bk{Ua*U?v{f2uOZ5&e zUSKw^*ky~^NHF>)brLv&D`^t)0}d!Q;i$h%6Wb*d^39B=P?nfmA2C!9@sWvXoIA^q z!K(Hr^Muj_&Jaf^NBfeH2w94-$3h8o&{rmfyI}!@BrFw*1acUXEfpDsHdldYQJoQ) zOhDr0F)p@P7U`Ru5$&1@B;GBgsE7yaW8*ow1{7E=A@BBX2Zuw4eRq7D2T0+|P{VoT zu0+#XC^ONRMNC9`rpXYgj(H|;3l+$L;E&xhKBJK11|fqnYlU+8?SN<6fvnT&?6B}3 z?I3BfQ^beOLzaXpmXT_RL+jGqcM1O}6GX7cGMzOM4v>#q&L}CEOUeH|s4$VXZOE;n zVjx5*4Gj5Gb))4g)mjV5>oR=H2Ckm(g__UOVQHEU5X|zUo9&SX` z4>)7^0u|XMI9{$@yhc9Jk{fRSz>F$Ks4`SWPjMG?IKGCs8JVqH%@)HS zl|N{yLlKgZo zyS->RywvP*O$H^#aDm&fu)`B%{GFi}H6G<;F`T0pXaxC^<1wW|%>+4YQ8bLyf|qy( z4ya7r)=!O3^sJAg4`r*mf3H;?97E|L`ey&P#!M=Q{>!!)HK6V>v+n7acIorHI4*K(Yd4S4D56{fbN`Sdy|8^iu~ zf3LT*?h@+tAL{S#t*#8M z7|_&Nh?dy5cpyNZR`mY3Pk59q;#uMwLL*Y1w{3($F$-Qzgb`o(W>qNT=Rm822n{bs zJ^VtvvbRHcWxuy^ys}RY{=InRc%?C{B;r`+EppA2FAvXOF&7i0g{!HB|1Ciw`OsWy zR0sBQ^yB+iEQZkugL7FRR$wPwRC1P<@Dw!Hoz{Mb5Rp*KMXXcoQa1il&4saYwnQ2d zY~cONf=FhtgI##IAqV9k`V0vP>CrqWoe0HSZ1k`q{k^72?nK7R69Xc{7}WKVf%t2jW_|bk(}Cc=n*wSF(Y&gL{)f>gh5ZF31iIkd?ikGYn|hjD}E^b z+QYbBgetqGVqR04cIsN`8?}Ufs0^Tv;B0+NV6t9+&uh8UVThL-LGUT9Nj!?}sQ9qM z^Ln%s{n)`<@B}=c$)E(69@yHmoO;j^YD^Ghu{I)Ci9R4&5fvU|Cv+8&lC!IoK1gwV zd^hdlm{<}#2ZeBGdVrMDE01Dr0-fe=q`{@B=c4>4gRhHao!B+7theZd*MK$NZUieh 
zh9_qZ)#zzCN7A_KH9(EgMg)D!l|Xi`gA=_KYf8EqlU|5xdChOu0<^L}63==@T%Hk^ zXT;^tt;sXu@_C5M+VIH=aoPO>#Km_ge@?_@_Ze|{MqDcMe;LH(jhu+A>06gA-|$QH zuK|0*C)uKTE;60}TX;6bC*d-f3muP}0iRQ+O7D$u@$)2^o~@7ZQ@CamuAp*cfZ5_^Y5g5EHvkV zbIEk0GcQlD)X7HAiT)cY?V78P&3 zusAK=+&#+3zc=&=h3IVd*!Y}#n(a=rw*$q0JN&w^z+oiR)7{}QmutNrIo?PYyPk|c zpwanNQeXL$ls|0dsRPe@+9zKNeXXZLKSygx=wAzcT+5J~c`h_w1FrZvl9M&?ik~+k z;OQC&)fHdIfL9xl?|ZqPpTF9Juq(&cNazc9yUxRZDnLKF>Wz1F*aXicw=R9aCBdAVRCe7VQg%9Ut?%t zZf9j=E_iKh0PS1da@#f*zRy!&b(PFmf0q1{9a$Z(Eyrps+pV3X?#&KFLLzFCUzm5~yR_p-_=l;HUMQ}FVOKX-*MkwO1;!+QZ#I47H{vjA z#4>11)vOtNW7$mHqNcJY|NLkBZL8gCH$Ytj_lHdTuNbQi(g?;z!VhIEj-=rR0<$ap zwfdk2T1?erW$L5B@!9ZvP-7Q&u}BYU=Vzl@wwid3hR5vVLfcxu)860P-|cjJO`2NJ z9djd&RFs6m2)`J`_>JIcf4kLw+iSYd<;+OMCg``fd(G@?IdjbC{Z6abq`w9K=E@D+ zwm*uQ)T=h#ZauMStG#;D*8Yl3AF%J`_9efLlrKWqGL^BZm3V!xY#Q@$IfAj_)zvv0 zY8_wC6LC-*gu*kailtZUw_9(!^=@l_uh%51%Cejl7FxmJMd8D3(nbfhVQfSii}J>9 z#YYv|Nc6YCE&r^}J68SeU2NvQssy=GbQ z%FMMii0+F7UeHW@PEw(DubGWj4SsPr9;fi0i+oza1_k-U$ybjyE)C@#&$bv$Ch(k` ziF)K5QTIab`@(0w6was8V|RQerdIkwvx!O>{KNy1ntL~)8@*!LApEK*ES= z3a9%l3AurlBF*lk30NSf0TYvn^dyeeIpe8dUTQtbL#$$)jZ?*amhyNin3^!2B%#bi z)FALeODA#3=AMco8y6&7Sn2b0i^+)WG-W*YnHI6ecY|HTg58Qa4s<>aGd5ljt58kn zOp<`U7}Ek0KlM}`Qz)nynguki(p#3KDp4Aq*Vfs?hSV>}G&t3$*W6L`^QV<;Pw z>REiTEWR?UaAHSmLJ?2NN(v_sPg6dpy@eLgA)M5;O%jFJg5=oSdd4E$oe9TQhK6Y@ z3+E)hB9@>78!cf#Tn%@8A`q;KMKYF=yr-FIEa3?r8^*_4rHQ*k;D$&T9%^U2rBe89 zN`{}L;#UhFN`yQclIyhLun_iwU9_RCl0X?c1mWU2X2X(QwwMV7qi<3tf+Kh=bwqx^ zt;NP1hcZ*dcEN;vGvx`CCFaIO3>}jANW~<~oTbTNW&4wPe5pOFi6fMweMv~fF-6!@ zp#(bUE2G@qumD05mhwdeIW)W-tFBMZj2UtcY2=%NbXBt!FlAaNYPp-Ggg>IOayAG$PkGNc`EO7 z707|$kKIx}rI6zqA%ihvL~OGs15cF&S*y|7Vg5gwUesV`h!2y6ED4n_Bh?Ux#wMBX zBK}!Mh+vVXDy<_NARjlJQBp9QkpDYSVJuBklUqf}K!{TA1yanMFs#$rak@Hxcj{iW zv`~Y65^Q3jAxs3@LoJtfP6T|#I+B{6FXT&$ms;fR0PyX;uj31Mq*HSSUdReBRhnE& z`MliXy7GH*W>L0-yR&q5pym7cl3s{N(JYn8%n64SGm0!kObjJ8v9*ZvJK>xRhDc<= zB$7>2SSv%`hR-nTOP_Rd*GWM7Qfik%vvi<(GWD!9U`*#Fd`_oVIoDRul8qHzso1TO 
zx0EaSk>*oUcmmfbNxP5^o*Ii7XP7B7R@wawwg|VWpgXU8J(TXN$9GBZ&>~#ZGl?^z zc=F{pROP;LRFw;?`mi{|A#=|00D8w&uetbDB|3rC-S8(Nf03fsTns#THpP;()%{O+ zJrv)S7fUA3?-cRIJ}==<2(IZ^#?Q<6(?(=GvA>{}>q=g~$){e(M)&v7E#2-nN44cEvgT5`wD zADB_a2%QY2(KEaSEv~OV-bN%>{;Tw^w&UWBNH5>&^2hV+!|3td4N!nsna(5W9<(%65Uer5G+%v>BJ{!)k%E z9A1*1hY{G}Y{y%^-D{+b$J1D;DSoUef8G(LLvJ!BvI6_`;Y3^ z-YJysqnLJoE6ikUQPj4;Ec>d)$x`BVG|51q{|8N?%+KFemG;`IfJZgn%7;a!Q$I;) zNC%bt&*vVXMnnI>*|ivFMO9w7dI_{GXuiBo%38m>-QDZ#thZ2rfB_0S+r{%Rj-V+XG z3wRp2f>4WuXJs2bZc5|RrUVws^<2?LI5VNyGAZE9-aS*dh4*tC$ z=4h!gtR%uvW+igPl&_C2-!K~zq=l!cf&UFbA=%JODx41N_29$DH!OtF34^n#CzfC* zTvTwDhVT>=*R95Wix3fC%tWXX>{1l|GM))-N)_A`Wtl$)$oLZbl&&xTG+TN}KY790a z=zAUuWam0K(R;C`q?jjeydW;W4RKi;K3O6z zyI+C0xC-UZiMZ^(ATBS6OKJWugSec?u}JH#blGwRzeN4&u@gQ{=k-gGs_fsw(+NHa zm%*HGd06-Oj4D;?H^Rfuk)&$6L|z<`7E|hh*kYl&L-~v1RZxX$+c6Gs4wxS)f z$#3`J>%qz8@XgjLUq;+_g{kIA5>SWE9oK9EUUWH84@=QC_Lcso#;vR`rt0>m++2x= ztVAv;&Pbt^`UKoH2Fa{Spl&PIhvG=`9yDmXyWe8}`FB*b5zV+~ZS^ORaF@p$E?)x<7bUjzu7L#Q#SMo6tPvsSSi%hyQgD|^3AOVia0TJ?ffy`WVuXw?f^^%$*M8+uxzReN87R=INQ4??RR R47mS|@xLZ-s&QXA006tW2Jip? 
literal 0 HcmV?d00001 diff --git a/jcore-jedis-integration-tests/src/test/resources/pubmedMappingFile.xml b/jcore-jedis-integration-tests/src/test/resources/pubmedMappingFile.xml new file mode 100644 index 000000000..9a76854ae --- /dev/null +++ b/jcore-jedis-integration-tests/src/test/resources/pubmedMappingFile.xml @@ -0,0 +1,436 @@ + + + + /PubmedArticle/MedlineCitation/Article/ArticleTitle + + + /PubmedArticle/MedlineCitation/Article/Abstract + de.julielab.jcore.reader.xmlmapper.mapper.StructuredAbstractParser + + + + /PubmedArticle/MedlineCitation/OtherAbstract + + + /PubmedArticle/MedlineCitation/Article/VernacularTitle + + + + + de.julielab.jcore.types.Title + + + 0 + + + + titleType + java.lang.String + + document + + + + + + + de.julielab.jcore.types.pubmed.AbstractText + + + + 2 + + + + abstractType + java.lang.String + + other + + + + + + de.julielab.jcore.types.Title + + + 3 + + + + titleType + java.lang.String + + document_vernacular + + + + + + + de.julielab.jcore.types.pubmed.Header + + + + /PubmedArticle/MedlineCitation/ArticleIdList/ArticleId[@IdType="doi"] + + doi + java.lang.String + + + /PubmedArticle/MedlineCitation/PMID + docId + java.lang.String + + + /PubmedArticle/MedlineCitation/@Status + citationStatus + java.lang.String + + + + /PubmedArticle/MedlineCitation/Article/Language + + language + java.lang.String + + de + + + en + + + es + + + fr + + + it + + + pt + + + eng + + + ger + + + fre + + + ita + + + other + + + + source + java.lang.String + + de.julielab.jcore.reader.xmlmapper.typeParser.SourceParser + + + + authors + + org.apache.uima.jcas.cas.FSArray + + de.julielab.jcore.reader.xmlmapper.typeParser.FSArrayParser + + true + + authorInfo + + de.julielab.jcore.types.AuthorInfo + + true + + + /PubmedArticle/MedlineCitation/Article/AuthorList/Author[LastName] + + + foreName + java.lang.String + ForeName + + + foreName + java.lang.String + FirstName + + + lastName + java.lang.String + LastName + + + initials + java.lang.String + Initials 
+ + + affiliation + java.lang.String + + AffiliationInfo/Affiliation + + + + + + + org.apache.uima.jcas.cas.FSArray + + pubTypeList + true + + + de.julielab.jcore.types.Journal + + + /PubmedArticle/MedlineCitation/Article/PublicationTypeList/PublicationType + + Journal + true + + java.lang.String + name + . + + + java.lang.String + ISSN + + /PubmedArticle/MedlineCitation/Article/Journal/ISSN + + + + java.lang.String + Volume + + /PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/Volume + + + + java.lang.String + Issue + + /PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/Issue + + + + java.lang.String + Title + + /PubmedArticle/MedlineCitation/Article/Journal/Title + + + + java.lang.String + ShortTitle + + /PubmedArticle/MedlineCitation/MedlineJournalInfo/MedlineTA + + + + java.lang.String + nlmId + + /PubmedArticle/MedlineCitation/MedlineJournalInfo/NlmUniqueID + + + + java.lang.String + Pages + + /PubmedArticle/MedlineCitation/Article/Pagination/MedlinePgn + + + + true + + de.julielab.jcore.types.Date + + PubDate + + de.julielab.jcore.reader.xmlmapper.typeParser.PubDateParser + + + /PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/PubDate + + + int + month + + + int + year + + + int + day + + + + + + + + de.julielab.jcore.types.pubmed.ManualDescriptor + + + /PubmedArticle/MedlineCitation/GeneSymbolList + GeneSymbolList + true + + org.apache.uima.jcas.cas.StringArray + + + + KeywordList + true + + org.apache.uima.jcas.cas.FSArray + + + Keyword + true + + /PubmedArticle/MedlineCitation/KeywordList/Keyword + + + de.julielab.jcore.types.Keyword + + + Name + . 
+ java.lang.String + + + + + ChemicalList + true + + org.apache.uima.jcas.cas.FSArray + + + Chemical + true + + /PubmedArticle/MedlineCitation/ChemicalList/Chemical + + + de.julielab.jcore.types.Chemical + + + RegistryNumber + RegistryNumber + java.lang.String + + + NameOfSubstance + NameOfSubstance + java.lang.String + + + + + DBInfoList + true + + org.apache.uima.jcas.cas.FSArray + + + DBInfo + true + + /PubmedArticle/MedlineCitation/DataBankList/DataBank + + + de.julielab.jcore.types.DBInfo + + + Name + DataBankName + java.lang.String + + + AcList + + AccessionNumberList + + + true + + org.apache.uima.jcas.cas.StringArray + + + + + + MeSHList + true + + org.apache.uima.jcas.cas.FSArray + + + meshHeading + true + + /PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading + + + de.julielab.jcore.types.MeshHeading + + + DescriptorName + java.lang.String + DescriptorName + + + + DescriptorNameMajorTopic + + DescriptorName/@MajorTopicYN + boolean + + Y + true + + + N + false + + + + QualifierName + java.lang.String + QualifierName + + + + QualifierNameMajorTopic + + QualifierName/@MajorTopicYN + boolean + + Y + true + + + N + false + + + + + + \ No newline at end of file diff --git a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java index 004c085d9..3596db300 100644 --- a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java +++ b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java @@ -115,7 +115,6 @@ public class XMIDBWriter extends JCasAnnotator_ImplBase { public static final String PARAM_FEATURES_TO_MAP_DRYRUN = "BinaryFeaturesToMapDryRun"; public static final String PARAM_BINARY_FEATURES_BLACKLIST = "BinaryFeaturesBlacklist"; public static final String PARAM_ADD_SHA_HASH = "AddShaHash"; - public static final String PARAM_SKIP_MATCHING_HASH = "SkipMatchingHash"; private static final Logger 
log = LoggerFactory.getLogger(XMIDBWriter.class); // The mappings are keyed by the costosys.xml path and the table schema, see 'mappingCacheKey'. // The idea is to save costly database connections by sharing updating mapping across threads. @@ -250,7 +249,6 @@ public class XMIDBWriter extends JCasAnnotator_ImplBase { private String[] binaryFeaturesBlacklistParameter; @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, mandatory = false, description = "Possible values: document_text. If this parameter is set to a valid value, the SHA256 hash for the given value will be calculated, base64 encoded and added to each document as a new column in the document table. The column will be named after the parameter value, suffixed by '_sha256'.") private String documentItemToHash; - @ConfigurationParameter(name =PARAM_SKIP_MATCHING_HASH, mandatory = false, description = "Only in effect, if: " + PARAM_ADD_SHA_HASH + " is active; if the target XMI table has also been read from by the XMI DB reader and the reader has been configured to read the document's current hash value. 
Then, compares the hash value retrieved and relied by the XMI DB reader to the ") private Map shaMap; private String mappingCacheKey; private DocumentReleaseCheckpoint docReleaseCheckpoint; @@ -322,7 +320,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept } if (xmiMetaSchema.isBlank()) - throw new ResourceInitializationException(new IllegalArgumentException("The XMI meta table Postgres schema must either be omitted at all or non-empty but was.")); + throw new ResourceInitializationException(new IllegalArgumentException("The XMI meta table Postgres schema must either be omitted at all or non-empty but was '" + xmiMetaSchema + "'.")); unqualifiedAnnotationNames = Collections.emptyList(); @@ -852,11 +850,15 @@ private DocumentId getDocumentId(JCas aJCas) { AnnotationIndex headerIndex = aJCas.getAnnotationIndex(Header.type); FSIterator headerIt = headerIndex.iterator(); if (!headerIt.hasNext()) { - int min = Math.min(100, aJCas.getDocumentText().length()); + String docText = ""; + if (aJCas.getDocumentText() != null) { + int min = Math.min(100, aJCas.getDocumentText().length()); + docText = aJCas.getDocumentText().substring(0, min); + } log.warn( "Got document without a header and without DBProcessingMetaData; cannot obtain document ID." + " This document will not be written into the database. Document text begins with: {}", - aJCas.getDocumentText().substring(0, min)); + docText); ++headerlessDocuments; return null; } @@ -1044,8 +1046,9 @@ public void collectionProcessComplete() throws AnalysisEngineProcessException { } catch (XmiDataInsertionException e) { throw new AnalysisEngineProcessException(e); } - log.info("{} documents without a head occured overall. Those could not be written into the database.", - headerlessDocuments); + if (headerlessDocuments > 0) + log.info("{} documents without a head occured overall. 
Those could not be written into the database.", + headerlessDocuments); dbc.close(); } diff --git a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java index 8cd4ce9b4..03c2b1160 100644 --- a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java +++ b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java @@ -138,7 +138,7 @@ public AbstractCas next() throws AnalysisEngineProcessException { * @param jCas The newly read JCas. */ private void setToVisitAnnotation(JCas jCas) { - if (xmiStorageDataTable != null) { + if (xmiStorageDataTable != null && dbc.tableExists(xmiStorageDataTable)) { DBProcessingMetaData dbProcessingMetaData = JCasUtil.selectSingle(jCas, DBProcessingMetaData.class); StringArray pkArray = dbProcessingMetaData.getPrimaryKey(); String pkString = String.join(",", pkArray.toArray()); @@ -146,6 +146,8 @@ private void setToVisitAnnotation(JCas jCas) { if (existingHash != null) { String newHash = getHash(jCas); if (existingHash.equals(newHash)) { + if (log.isTraceEnabled()) + log.trace("Document {} has a document text hash that equals the one present in the database. Creating a ToVisit annotation routing it only to the components with delegate keys {}.", pkString, toVisitKeys); ToVisit toVisit = new ToVisit(jCas); if (toVisitKeys != null && toVisitKeys.length != 0) { StringArray keysArray = new StringArray(jCas, toVisitKeys.length); @@ -186,7 +188,7 @@ protected List> getAllRetrievedColumns() { * @throws AnalysisEngineProcessException If the SQL request fails. 
*/ private Map fetchCurrentHashesFromDatabase(RowBatch rowBatch) throws AnalysisEngineProcessException { - if (xmiStorageDataTable != null) { + if (xmiStorageDataTable != null && dbc.tableExists(xmiStorageDataTable) && rowBatch.getIdentifiers() != null && rowBatch.getIdentifiers().size() > 0) { String hashColumn = documentItemToHash + "_sha256"; // Extract the document IDs in this RowBatch. The IDs could be composite keys. List documentIds = new ArrayList<>(rowBatch.getIdentifiers().size()); @@ -217,7 +219,7 @@ private Map fetchCurrentHashesFromDatabase(RowBatch rowBatch) th id2hash.put(pkSb.toString(), hash); } } catch (SQLException e) { - log.error("Could not retrieve hashes from the database. SQL query was {}:", sql, e); + log.error("Could not retrieve hashes from the database. SQL query was '{}':", sql, e); throw new AnalysisEngineProcessException(e); } return id2hash; diff --git a/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java b/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java index a56950c00..f14839236 100644 --- a/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java +++ b/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java @@ -49,7 +49,7 @@ public class XMLDBMultiplierTest { private static final String MAX_XMI_ID_FIELD_NAME = "max_xmi_id"; private static final String SOFA_MAPPING_FIELD_NAME = "sofa_mapping"; private static final String SUBSET_TABLE = "test_subset"; - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer("postgres:11.12"); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:11.12"); private static String costosysConfig; @BeforeAll diff --git a/pom.xml b/pom.xml index 87deb6229..6db724ae5 100644 --- a/pom.xml +++ b/pom.xml @@ -210,7 +210,8 @@ jcore-xmi-writer jedis-parent - + jcore-jedis-integration-tests + From 
639d1a3f8b739c8cf1eb1805fae3cc4faf842459 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 15 Jun 2021 14:12:40 +0200 Subject: [PATCH 068/269] Excluding jUnit 3 from a subdependency. --- jcore-banner-ae/pom.xml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/jcore-banner-ae/pom.xml b/jcore-banner-ae/pom.xml index d50f90b07..6235ec58d 100644 --- a/jcore-banner-ae/pom.xml +++ b/jcore-banner-ae/pom.xml @@ -37,6 +37,10 @@ log4j log4j + + junit + junit +
From 0c267ef7c44c6a93730a789f324146dd316f1857 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 15 Jun 2021 14:21:28 +0200 Subject: [PATCH 069/269] More junit 3 and 4 exclusions. --- jcore-jnet-ae/pom.xml | 9 ++++++++- jcore-jsbd-ae/pom.xml | 9 ++++++++- jcore-jtbd-ae/pom.xml | 9 ++++++++- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/jcore-jnet-ae/pom.xml b/jcore-jnet-ae/pom.xml index 31f7e544b..7805ab9f6 100644 --- a/jcore-jnet-ae/pom.xml +++ b/jcore-jnet-ae/pom.xml @@ -17,8 +17,9 @@ + org.apache.maven.plugins maven-assembly-plugin - 2.4 + 3.3.0 jar-with-dependencies @@ -106,6 +107,12 @@ de.julielab uea-stemmer 0.1 + + + junit + junit + + de.julielab diff --git a/jcore-jsbd-ae/pom.xml b/jcore-jsbd-ae/pom.xml index e21b02e2b..c23dc7e7c 100644 --- a/jcore-jsbd-ae/pom.xml +++ b/jcore-jsbd-ae/pom.xml @@ -17,8 +17,9 @@ + org.apache.maven.plugins maven-assembly-plugin - 2.4 + 3.3.0 jar-with-dependencies @@ -101,6 +102,12 @@ cc.mallet mallet 2.0.8 + + + junit + junit + + org.apache.commons diff --git a/jcore-jtbd-ae/pom.xml b/jcore-jtbd-ae/pom.xml index 54671bfc1..c773cf55d 100644 --- a/jcore-jtbd-ae/pom.xml +++ b/jcore-jtbd-ae/pom.xml @@ -16,8 +16,9 @@ + org.apache.maven.plugins maven-assembly-plugin - 2.4 + 3.3.0 jar-with-dependencies @@ -89,6 +90,12 @@ cc.mallet mallet 2.0.8 + + + junit + junit + + org.junit.jupiter From e152e95fda814f5bc79d0129a714f2672f6b4cb2 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 15 Jun 2021 14:44:55 +0200 Subject: [PATCH 070/269] More test fixes. 
--- .../reader/db/DBMultiplierReaderTest.java | 8 +++++--- .../jcore/reader/db/DBMultiplierTest.java | 8 +++++--- .../julielab/jcore/reader/db/DBReaderTest.java | 9 +++++---- jcore-elasticsearch-consumer/pom.xml | 18 ++++++++++++------ .../consumer/es/ElasticSearchConsumerIT.java | 10 ++++++---- .../jcore/ae/jnet/cli/JNETApplicationTest.java | 4 ++-- .../jnet/uima/ConsistencyPreservationTest.java | 16 +++++++++++++--- .../ae/jnet/uima/EntityAnnotatorTest.java | 16 +++++++++------- .../jcore/ae/jtbd/Sentence2TokenPipeTest.java | 8 ++++++-- .../jcore/ae/jtbd/main/TokenAnnotatorTest.java | 5 +++-- 10 files changed, 66 insertions(+), 36 deletions(-) diff --git a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierReaderTest.java b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierReaderTest.java index 11aa0d9ab..33f73c0eb 100644 --- a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierReaderTest.java +++ b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierReaderTest.java @@ -12,10 +12,11 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.ClassRule; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; import java.io.IOException; import java.sql.SQLException; @@ -23,9 +24,10 @@ import static de.julielab.jcore.reader.db.TableReaderConstants.*; import static org.junit.jupiter.api.Assertions.*; +@Testcontainers public class DBMultiplierReaderTest { - @ClassRule - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + @Container + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:11.12"); @BeforeAll public static void setup() throws SQLException { diff 
--git a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierTest.java b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierTest.java index fa378c49e..7a90917ad 100644 --- a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierTest.java +++ b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierTest.java @@ -19,12 +19,13 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; -import org.junit.ClassRule; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; import java.io.File; import java.io.FileInputStream; @@ -34,10 +35,11 @@ import static de.julielab.jcore.reader.db.TableReaderConstants.*; import static org.junit.jupiter.api.Assertions.*; +@Testcontainers public class DBMultiplierTest { private final static Logger log = LoggerFactory.getLogger(DBMultiplierTest.class); - @ClassRule - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + @Container + public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer("postgres:11.12"); @BeforeAll public static void setup() throws SQLException, IOException { diff --git a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBReaderTest.java b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBReaderTest.java index 015d3e3f5..6cb6f3fcf 100644 --- a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBReaderTest.java +++ b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBReaderTest.java @@ -12,12 +12,13 @@ import org.apache.uima.fit.factory.CollectionReaderFactory; import org.apache.uima.fit.factory.JCasFactory; import 
org.apache.uima.jcas.JCas; -import org.junit.ClassRule; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; import java.io.File; import java.io.FileInputStream; @@ -27,10 +28,10 @@ import static de.julielab.jcore.reader.db.TableReaderConstants.*; import static org.junit.jupiter.api.Assertions.*; - +@Testcontainers public class DBReaderTest { - @ClassRule - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + @Container + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:11.12"); @BeforeAll public static void setup() throws SQLException { diff --git a/jcore-elasticsearch-consumer/pom.xml b/jcore-elasticsearch-consumer/pom.xml index a4fed0dc9..57f9452c2 100644 --- a/jcore-elasticsearch-consumer/pom.xml +++ b/jcore-elasticsearch-consumer/pom.xml @@ -82,12 +82,6 @@ org.testng testng - - org.testcontainers - testcontainers - 1.12.0 - test - ch.qos.logback logback-classic @@ -97,6 +91,18 @@ org.junit.jupiter junit-jupiter + + org.testcontainers + testcontainers + 1.15.3 + test + + + org.testcontainers + junit-jupiter + 1.15.3 + test + JULIE Lab Jena, Germany diff --git a/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/ElasticSearchConsumerIT.java b/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/ElasticSearchConsumerIT.java index 36a71fbe0..c780ee2f9 100644 --- a/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/ElasticSearchConsumerIT.java +++ b/jcore-elasticsearch-consumer/src/test/java/de/julielab/jcore/consumer/es/ElasticSearchConsumerIT.java @@ -7,7 +7,6 @@ import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import 
org.apache.uima.jcas.JCas; -import org.junit.ClassRule; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import org.slf4j.Logger; @@ -15,6 +14,8 @@ import org.testcontainers.containers.GenericContainer; import org.testcontainers.containers.output.OutputFrame; import org.testcontainers.containers.output.Slf4jLogConsumer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; import org.testcontainers.shaded.com.fasterxml.jackson.databind.ObjectMapper; import java.net.URL; @@ -23,12 +24,13 @@ import static org.junit.jupiter.api.Assertions.assertEquals; +@Testcontainers public class ElasticSearchConsumerIT { public static final String TEST_INDEX = "testindex"; public static final String TEST_CLUSTER = "testcluster"; private final static Logger log = LoggerFactory.getLogger(ElasticSearchConsumerIT.class); // in case we need to disable X-shield: https://stackoverflow.com/a/51172136/1314955 - @ClassRule + @Container public static GenericContainer es = new GenericContainer("docker.elastic.co/elasticsearch/elasticsearch:7.0.1") .withEnv("xpack.security.enabled", "false") .withEnv("discovery.type", "single-node") @@ -57,8 +59,8 @@ public void testMinimal() throws Exception { consumer.collectionProcessComplete(); final URL url = new URL("http://localhost:" + es.getMappedPort(9200) + "/" + TEST_INDEX + "/_doc/987"); final ObjectMapper om = new ObjectMapper(); - final Map map = om.readValue(url.openStream(), Map.class); - assertEquals(jCas.getDocumentText(), ((Map)map.get("_source")).get("text")); + final Map map = om.readValue(url.openStream(), Map.class); + assertEquals(jCas.getDocumentText(), ((Map) map.get("_source")).get("text")); } /** diff --git a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/cli/JNETApplicationTest.java b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/cli/JNETApplicationTest.java index 4cc449a62..153d2714c 100644 --- 
a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/cli/JNETApplicationTest.java +++ b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/cli/JNETApplicationTest.java @@ -6,7 +6,7 @@ package de.julielab.jcore.ae.jnet.cli; -import org.junit.After; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; import java.io.File; @@ -32,7 +32,7 @@ public class JNETApplicationTest { - @After + @AfterEach public void deleteModel() { File modelFile = new File(UNITTEST_MODEL_GZ); if (modelFile.exists()) diff --git a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/ConsistencyPreservationTest.java b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/ConsistencyPreservationTest.java index 3031116d3..f551411fd 100644 --- a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/ConsistencyPreservationTest.java +++ b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/ConsistencyPreservationTest.java @@ -18,7 +18,6 @@ package de.julielab.jcore.ae.jnet.uima; import de.julielab.jcore.types.*; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.cas.CAS; import org.apache.uima.cas.FSIterator; @@ -28,6 +27,7 @@ import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.util.CasCreationUtils; import org.apache.uima.util.XMLInputSource; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,6 +35,8 @@ import java.util.Iterator; import java.util.TreeSet; +import static org.junit.jupiter.api.Assertions.*; + /** * Please note that in the original test there were "GoodEntityMentions" and * "BadEntityMentions". 
Both types were only used for this test which caused @@ -45,7 +47,7 @@ * @author faessler * */ -public class ConsistencyPreservationTest extends TestCase { +public class ConsistencyPreservationTest { private static final Logger LOGGER = LoggerFactory.getLogger(ConsistencyPreservationTest.class); @@ -133,12 +135,14 @@ private void initJCas4DoAbbreviationBased(final JCas jcas) throws Exception { e5.addToIndexes(); } + @Test public void testConsistencyPreservation() throws Exception { final String modeString = ConsistencyPreservation.MODE_STRING + "," + ConsistencyPreservation.MODE_ACRO2FULL + "," + ConsistencyPreservation.MODE_FULL2ACRO; new ConsistencyPreservation(modeString); } + @Test public void testAcroMatch() throws Exception { final String modeString = ConsistencyPreservation.MODE_FULL2ACRO + "," + ConsistencyPreservation.MODE_ACRO2FULL; @@ -186,6 +190,7 @@ public void testAcroMatch() throws Exception { } + @Test public void testStringMatch() throws Exception { LOGGER.info("testStringMatch() - starting..."); final CAS cas = CasCreationUtils.createCas( @@ -229,6 +234,7 @@ public void testStringMatch() throws Exception { assertTrue(allOK); } + @Test public void testStringMatch2() throws Exception { // This test checks whether the consistence preservation algorithm // correctly detects already existing annotations even when there are @@ -269,6 +275,7 @@ public void testStringMatch2() throws Exception { assertEquals(3, count); } + @Test public void testStringMatch3() throws Exception { // This test checks whether the consistence preservation algorithm // correctly detects already existing annotations even when there are @@ -309,6 +316,7 @@ public void testStringMatch3() throws Exception { assertEquals(5, count); } + @Test public void testStringMatchTokenBoundaries() throws Exception { // This test checks whether the consistency preservation algorithm // sticks to token boundaries if the respective mode is on @@ -350,6 +358,7 @@ public void 
testStringMatchTokenBoundaries() throws Exception { assertEquals(1, count); } + @Test public void testStringMatchTokenBoundaries2() throws Exception { // Test for multi token entities String text = "This is BCA alpha. But we haven't annotated BCA alpha in all cases. Also not some other BCA."; @@ -430,7 +439,8 @@ else if (g.getSpecificType().equals("type2")) } assertEquals(2, oCount); } - + + @Test public void testStringMatchTokenBoundaries3() throws Exception { // Test for multi token entities with correct prefix but wrong ending String text = "Group 1. And Group B."; diff --git a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.java b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.java index 44dd4e90d..e2143f3e9 100644 --- a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.java +++ b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.java @@ -20,7 +20,6 @@ import de.julielab.jcore.types.*; import de.julielab.jcore.utility.index.JCoReCoverIndex; import de.julielab.jnet.tagger.Unit; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; @@ -37,6 +36,7 @@ import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; import org.apache.uima.util.XMLParser; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; @@ -52,7 +52,9 @@ import java.util.regex.Pattern; import java.util.stream.Collectors; -public class EntityAnnotatorTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class EntityAnnotatorTest { /** * Logger for this class @@ -66,12 +68,8 @@ public class EntityAnnotatorTest extends TestCase { private static final String ENTITY_ANNOTATOR_DESC = 
PREFIX+"EntityAnnotatorTest.xml"; private static final String NEGATIVE_LIST = PREFIX+"negativeList"; - @Override - protected void setUp() throws Exception { - super.setUp(); - // PropertyConfigurator.configure("src/test/java/log4j.properties"); - } + @Test public void testIgnoreLabel() throws ResourceInitializationException { // load AE @@ -124,6 +122,7 @@ public void testIgnoreLabel() throws ResourceInitializationException { /** * test whether Annotator can be initialized properly from given descriptor */ + @Test public void testInitialize() { LOGGER.debug("testInitialize()"); AnalysisEngine entityAnnotator = null; @@ -150,6 +149,7 @@ public void testInitialize() { * test whether process method runs successfully. Output must be checked by * a human manually */ + @Test public void testProcess() throws InvalidXMLException, ResourceInitializationException, IOException, SAXException, CASException, AnalysisEngineProcessException { LOGGER.debug("testProcess()"); @@ -176,6 +176,7 @@ public void testProcess() throws InvalidXMLException, ResourceInitializationExce * unit sentence and removing duplicates. Prediction is "simulated" (labels * are set). 
*/ + @Test public void testSimulatedProcess() throws IllegalAccessException, NoSuchFieldException, ResourceInitializationException, InvalidXMLException, IOException, CASException, SAXException { LOGGER.debug("testCreateUnitSentence() - starting"); @@ -280,6 +281,7 @@ else if (unit.getRep().equals("ceta")) * @throws IllegalAccessException * @throws IllegalArgumentException */ + @Test public void testWriteToCAS() throws SecurityException, NoSuchFieldException, ResourceInitializationException, InvalidXMLException, IOException, CASException, IllegalArgumentException, IllegalAccessException { LOGGER.debug("testWriteToCAS()"); diff --git a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipeTest.java b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipeTest.java index 46d4826c1..140945584 100644 --- a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipeTest.java +++ b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipeTest.java @@ -17,19 +17,22 @@ package de.julielab.jcore.ae.jtbd; -import junit.framework.TestCase; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; -public class Sentence2TokenPipeTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class Sentence2TokenPipeTest { private static final Logger LOGGER = LoggerFactory .getLogger(Sentence2TokenPipeTest.class); private static final String TEST_SENTENCE = "this is a \t junit -test"; + @Test public void testMakeLabel() { final ArrayList expectedLabels = new ArrayList(); expectedLabels.add("P"); @@ -55,6 +58,7 @@ public void testMakeLabel() { assertTrue(allOK); } + @Test public void testMakeUnits() { final ArrayList expectedUnits = new ArrayList(); expectedUnits.add("this"); diff --git a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotatorTest.java 
b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotatorTest.java index 37d8571f9..543abf443 100644 --- a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotatorTest.java +++ b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotatorTest.java @@ -18,7 +18,6 @@ import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; -import junit.framework.TestCase; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.jcas.JCas; @@ -32,7 +31,9 @@ import java.util.Iterator; -public class TokenAnnotatorTest extends TestCase { +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TokenAnnotatorTest { /** * Logger for this class From daaca4c0d6ac468847527d36fd88ea98407d4028 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 15 Jun 2021 14:50:34 +0200 Subject: [PATCH 071/269] Updated the test XMI of the JNET mini app to the new output. Why the output changed is unknown. It seems to be semantically equal, though. The XMI IDs have changed for some reason. --- .../de/julielab/jcore/ae/jnet/uima/miniapp_out_template.xmi | 1 - 1 file changed, 1 deletion(-) delete mode 100644 jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/miniapp_out_template.xmi diff --git a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/miniapp_out_template.xmi b/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/miniapp_out_template.xmi deleted file mode 100644 index 029dc8db3..000000000 --- a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/miniapp_out_template.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file From 06ea3af94dc5263f3a26237f1cd56586fc73bab7 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 16 Jun 2021 08:13:51 +0200 Subject: [PATCH 072/269] Setting the UIMA type capabilities for the BioLemmatizer. 
--- .../biolemmatizer/desc/jcore-biolemmatizer-ae.xml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/jcore-biolemmatizer-ae/src/main/resources/de/julielab/jcore/ae/biolemmatizer/desc/jcore-biolemmatizer-ae.xml b/jcore-biolemmatizer-ae/src/main/resources/de/julielab/jcore/ae/biolemmatizer/desc/jcore-biolemmatizer-ae.xml index 137eb219c..9fe2de8b8 100644 --- a/jcore-biolemmatizer-ae/src/main/resources/de/julielab/jcore/ae/biolemmatizer/desc/jcore-biolemmatizer-ae.xml +++ b/jcore-biolemmatizer-ae/src/main/resources/de/julielab/jcore/ae/biolemmatizer/desc/jcore-biolemmatizer-ae.xml @@ -13,7 +13,18 @@ - + + + + de.julielab.jcore.types.Token + de.julielab.jcore.types.PennBioIEPOSTag + + + de.julielab.jcore.types.Lemma + + + + true true From 0e2a9e620fef59a06a0b27f52927c305c47a6169 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 16 Jun 2021 08:19:02 +0200 Subject: [PATCH 073/269] Setting the UIMA type capabilities for the BioLemmatizer and the BioSem event annotator. 
--- .../de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java | 4 +++- .../julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotator.java | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java b/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java index 2b4011ff0..12720ec9d 100644 --- a/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java +++ b/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java @@ -18,6 +18,7 @@ import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException; import org.apache.uima.cas.FSIterator; import org.apache.uima.fit.descriptor.ExternalResource; +import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; @@ -35,6 +36,7 @@ import java.util.*; import java.util.Map.Entry; +@TypeCapability(inputs = {"de.julielab.jcore.types.Gene"}, outputs = {"de.julielab.jcore.types.EventTrigger", "de.julielab.jcore.types.EventMention"}) public class BioSemEventAnnotator extends JCasAnnotator_ImplBase { private final static Logger log = LoggerFactory.getLogger(BioSemEventAnnotator.class); @@ -45,7 +47,7 @@ public class BioSemEventAnnotator extends JCasAnnotator_ImplBase { private DBUtils trainedDb; - @ExternalResource(key = RESOURCE_TRAINED_DB, mandatory = true) + @ExternalResource(key = RESOURCE_TRAINED_DB) private DBUtilsProvider dbUtilsProvider; private EventExtraction xtr; diff --git a/jcore-linnaeus-species-ae/src/main/java/de/julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotator.java b/jcore-linnaeus-species-ae/src/main/java/de/julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotator.java index cbab4f7e9..1853e3f50 100644 --- a/jcore-linnaeus-species-ae/src/main/java/de/julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotator.java +++ 
b/jcore-linnaeus-species-ae/src/main/java/de/julielab/jcore/ae/linnaeus/LinnaeusSpeciesAnnotator.java @@ -35,7 +35,7 @@ * */ @ResourceMetaData(name="JCore LINNAEUS Species AE") -@TypeCapability(inputs = {"de.julielab.jcore.types.Organism", "de.julielab.jcore.types.ResourceEntry"}) +@TypeCapability(outputs = {"de.julielab.jcore.types.Organism", "de.julielab.jcore.types.ResourceEntry"}) public class LinnaeusSpeciesAnnotator extends JCasAnnotator_ImplBase { public static final String RES_KEY_LINNAEUS_MATCHER = "LinnaeusMatcher"; public static final String PARAM_CONFIG_FILE = "ConfigFile"; From 0e7a6cfc0671de41d37268f3fd7345148a6fce0c Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 17 Jun 2021 16:45:46 +0200 Subject: [PATCH 074/269] Bumping CoStoSys to 1.6.0-SNAPSHOT. --- .../jcore/ae/flairner/FlairNerAnnotator.java | 8 ++++++++ .../integrationtests/UpdateWithHashComparison.java | 13 ++++++++++--- .../de/julielab/jcore/reader/xmi/CasPopulator.java | 10 +++------- .../reader/xmi/desc/jcore-xmi-db-multiplier.xml | 1 - .../jcore/reader/xmi/XmiDBMultiplierTest.java | 2 +- .../src/test/resources/logback-test.xml | 4 ++-- jedis-parent/pom.xml | 2 +- 7 files changed, 25 insertions(+), 15 deletions(-) diff --git a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java index 8ce44a6f5..215b07718 100644 --- a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java +++ b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java @@ -162,11 +162,19 @@ public void process(final JCas aJCas) throws AnalysisEngineProcessException { sentence.setId("s" + i++); sentenceMap.put(sentence.getId(), sentence); } + if ( log.isDebugEnabled()) { + if (sentenceMap.isEmpty()) + log.debug("Document {} does not have any sentences.", JCoReTools.getDocId(aJCas)); + if 
(!aJCas.getAnnotationIndex(Token.class).iterator().hasNext()) + log.debug("Document {} does not have any tokens", JCoReTools.getDocId(aJCas)); + } try { final AnnotationAdderHelper helper = new AnnotationAdderHelper(); + log.trace("Sending document sentences to flair for entity tagging."); final NerTaggingResponse taggingResponse = connector.tagSentences(StreamSupport.stream(sentIndex.spliterator(), false)); final List taggedEntities = taggingResponse.getTaggedEntities(); for (TaggedEntity entity : taggedEntities) { + log.trace("Adding flair-tagged entity to the CAS: {}", entity); final Sentence sentence = sentenceMap.get(entity.getDocumentId()); EntityMention em = (EntityMention) JCoReAnnotationTools.getAnnotationByClassName(aJCas, entityClass); helper.setAnnotationOffsetsRelativeToSentence(sentence, em, entity, adderConfig); diff --git a/jcore-jedis-integration-tests/src/test/java/de/julielab/jcore/jedis/integrationtests/UpdateWithHashComparison.java b/jcore-jedis-integration-tests/src/test/java/de/julielab/jcore/jedis/integrationtests/UpdateWithHashComparison.java index 52754055b..63e967924 100644 --- a/jcore-jedis-integration-tests/src/test/java/de/julielab/jcore/jedis/integrationtests/UpdateWithHashComparison.java +++ b/jcore-jedis-integration-tests/src/test/java/de/julielab/jcore/jedis/integrationtests/UpdateWithHashComparison.java @@ -9,6 +9,7 @@ import de.julielab.jcore.reader.db.DBMultiplierReader; import de.julielab.jcore.reader.xml.XMLDBMultiplier; import de.julielab.jcore.types.Annotation; +import de.julielab.jcore.utility.JCoReTools; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngine; @@ -30,9 +31,7 @@ import java.io.File; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.EnumSet; -import java.util.List; +import java.util.*; import static org.assertj.core.api.Assertions.assertThat; @@ -58,6 +57,7 @@ public class 
UpdateWithHashComparison { private static JCas cas; private static DataBaseConnector dbc; private static List namesOfRunComponents = new ArrayList<>(); + private static Set idsOfProcessedDocuments = new LinkedHashSet<>(); @BeforeAll public static void setup() throws Exception { @@ -182,6 +182,11 @@ public void testInitialProcessingProcessing() throws Exception { // Check that all rows have been processed in the XML source subset table. assertThat(status.isProcessed).isEqualTo(3); assertThat(status.inProcess).isEqualTo(0); + + assertThat(idsOfProcessedDocuments).hasSize(3); + // Check that there are actual IDs, not null string or something like that + for (String id : idsOfProcessedDocuments) + assertThat(id).matches("[0-9]+"); } /** @@ -199,7 +204,9 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept @Override public void process(JCas jCas) { + assertThat(jCas.getDocumentText()).isNotBlank(); namesOfRunComponents.add(name); + idsOfProcessedDocuments.add(JCoReTools.getDocId(jCas)); new Annotation(jCas).addToIndexes(); } } diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java index fd631e58f..e5d3bf36d 100644 --- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java +++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java @@ -40,10 +40,8 @@ public class CasPopulator { private final static Logger log = LoggerFactory.getLogger(CasPopulator.class); private final DataBaseConnector dbc; private final boolean readsBaseDocument; - private final int numAdditionalTables; private final int numDataRetrievedDataFields; - private final String dataTable; - private final String[] additionalTableNames; + private final String[] unqualifiedAnnotationModuleNames; private final XmiBuilder builder; private final Boolean logFinalXmi; private final int xercesAttributeBufferSize; 
@@ -72,10 +70,8 @@ public CasPopulator(String dataTable, Initializer initializer, Boolean readDataT this.tableName = tableName; this.readsBaseDocument = initializer.getReadsBaseDocument(); this.joinTables = initializer.isJoinTables(); - this.numAdditionalTables = initializer.getNumAdditionalTables(); this.numDataRetrievedDataFields = initializer.getNumDataRetrievedDataFields(); - this.dataTable = dataTable; - this.additionalTableNames = initializer.getUnqualifiedAnnotationModuleNames(); + this.unqualifiedAnnotationModuleNames = initializer.getUnqualifiedAnnotationModuleNames(); this.builder = initializer.getXmiBuilder(); binaryBuilder = initializer.getBinaryBuilder(); useBinaryFormat = initializer.isUseBinaryFormat(); @@ -85,7 +81,7 @@ public CasPopulator(String dataTable, Initializer initializer, Boolean readDataT reverseBinaryMapping = initializer.getReverseBinaryMapping(); featuresToMapBinary = initializer.getFeaturesToMapBinary(); if (useBinaryFormat) { - binaryJeDISNodeDecoder = new BinaryJeDISNodeDecoder(Stream.of(additionalTableNames).collect(Collectors.toSet()), true); + binaryJeDISNodeDecoder = new BinaryJeDISNodeDecoder(Stream.of(unqualifiedAnnotationModuleNames).collect(Collectors.toSet()), true); } else binaryJeDISNodeDecoder = null; } diff --git a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml index 992ed962a..c124b4804 100644 --- a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml +++ b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml @@ -29,7 +29,6 @@ - diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java index 2af097f43..cde2d026f 100644 --- 
a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java @@ -31,7 +31,7 @@ public class XmiDBMultiplierTest { - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:11.12"); private static String costosysConfig; private static int subsetCounter; diff --git a/jcore-xmi-db-reader/src/test/resources/logback-test.xml b/jcore-xmi-db-reader/src/test/resources/logback-test.xml index b8337ca9b..edc553153 100644 --- a/jcore-xmi-db-reader/src/test/resources/logback-test.xml +++ b/jcore-xmi-db-reader/src/test/resources/logback-test.xml @@ -9,8 +9,8 @@ - - + + diff --git a/jedis-parent/pom.xml b/jedis-parent/pom.xml index 226e35c36..b66c3be70 100644 --- a/jedis-parent/pom.xml +++ b/jedis-parent/pom.xml @@ -17,7 +17,7 @@ de.julielab costosys - 1.5.2-SNAPSHOT + 1.6.0-SNAPSHOT de.julielab From bdc31c92ef8beda950908172f8314b3923d2ce7a Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 18 Jun 2021 09:19:57 +0200 Subject: [PATCH 075/269] Fixed JeDIS tests that got stuck due to too tight restrictions on the number of available database connections. With the possibility to reserve non-shared connections in CoStoSys, the old connections pool size limits did not suffice any more in a few cases. 
--- .../jcore/reader/db/DBMultiplierReaderTest.java | 4 ++-- .../julielab/jcore/reader/db/DBMultiplierTest.java | 2 +- .../de/julielab/jcore/reader/db/DBReaderTest.java | 4 ++-- .../jcore/reader/xmi/XmiDBMultiplierReader.java | 12 +++++++----- .../xmi/XmiDBMultiplierDifferentNsSchemaTest.java | 2 +- .../jcore/reader/xmi/XmiDBMultiplierTest.java | 4 +++- .../reader/xmi/XmiDBReaderBinaryFormatTest.java | 2 +- .../reader/xmi/XmiDBReaderDifferentNsSchemaTest.java | 2 +- .../jcore/reader/xmi/XmiDBReaderGzippedDataTest.java | 4 ++-- .../xmi/XmiDBReaderMonolithicDocumentsTest.java | 4 ++-- .../julielab/jcore/reader/xmi/XmiDBReaderTest.java | 2 +- .../src/test/resources/logback-test.xml | 3 ++- .../consumer/xmi/XmiDBWriterBinaryFormatTest.java | 2 +- .../xmi/XmiDBWriterMonolithicDocumentTest.java | 2 +- .../julielab/jcore/consumer/xmi/XmiDBWriterTest.java | 2 +- .../jcore/reader/xml/XMLDBMultiplierTest.java | 4 ++-- 16 files changed, 30 insertions(+), 25 deletions(-) diff --git a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierReaderTest.java b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierReaderTest.java index 33f73c0eb..c10ff9670 100644 --- a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierReaderTest.java +++ b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierReaderTest.java @@ -27,7 +27,7 @@ @Testcontainers public class DBMultiplierReaderTest { @Container - public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:11.12"); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); @BeforeAll public static void setup() throws SQLException { @@ -41,7 +41,7 @@ public static void setup() throws SQLException { @Test public void testDBMultiplierReader() throws UIMAException, IOException, ConfigurationException { - String costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2017", 1, postgres); + 
String costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2017", 2, postgres); CollectionReader reader = CollectionReaderFactory.createReader(DBMultiplierReader.class, PARAM_BATCH_SIZE, 5, PARAM_TABLE, "testsubset", diff --git a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierTest.java b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierTest.java index 7a90917ad..350f610fb 100644 --- a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierTest.java +++ b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBMultiplierTest.java @@ -39,7 +39,7 @@ public class DBMultiplierTest { private final static Logger log = LoggerFactory.getLogger(DBMultiplierTest.class); @Container - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer("postgres:11.12"); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); @BeforeAll public static void setup() throws SQLException, IOException { diff --git a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBReaderTest.java b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBReaderTest.java index 6cb6f3fcf..46b8ac436 100644 --- a/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBReaderTest.java +++ b/jcore-db-reader/src/test/java/de/julielab/jcore/reader/db/DBReaderTest.java @@ -31,7 +31,7 @@ @Testcontainers public class DBReaderTest { @Container - public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:11.12"); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); @BeforeAll public static void setup() throws SQLException { @@ -43,7 +43,7 @@ public static void setup() throws SQLException { @Test public void testDBReader() throws UIMAException, IOException, ConfigurationException { - String costosysConfig = 
DBTestUtils.createTestCostosysConfig("medline_2017", 1, postgres); + String costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2017", 2, postgres); CollectionReader reader = CollectionReaderFactory.createReader(DBReaderTestImpl.class, PARAM_BATCH_SIZE, 5, PARAM_TABLE, "testsubset", diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierReader.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierReader.java index 22cadadcc..185bdd1d4 100644 --- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierReader.java +++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierReader.java @@ -42,14 +42,14 @@ public class XmiDBMultiplierReader extends DBMultiplierReader { public static final String PARAM_ANNOTATIONS_TO_LOAD = Initializer.PARAM_ANNOTATIONS_TO_LOAD; public static final String PARAM_XMI_META_SCHEMA = "XmiMetaTablesSchema"; private final static Logger log = LoggerFactory.getLogger(XmiDBMultiplierReader.class); + @ConfigurationParameter(name = PARAM_ANNOTATIONS_TO_LOAD, mandatory = false, description = "An array of qualified UIMA type names. The provided names will be converted to database table column names in an equivalent manner as the XMIDBWriter does when storing the annotations. Thus, by default the columns of the XMI table holding annotation module information are named by lowercased UIMA type name where dots are replaced by underscores.. This can be overwritten by appending ':' to a table name. The given type names will be converted to valid Postgres columns names by replacing dots with underscores and the colon will be converted to the dollar character. From the resolved columns, annotation modules in segmented XMI format are read where an annotation module contains all annotation instances of a specific type in a specific document. 
All annotation modules read this way are merged with the base document, resulting in valid XMI data which is then deserialized into the CAS.") + protected String[] qualifiedAnnotationColumnNames; @ConfigurationParameter(name = PARAM_READS_BASE_DOCUMENT, description = "Indicates if this reader reads segmented " + "annotation data. If set to false, the XMI data is expected to represent complete annotated documents. " + "If it is set to true, a segmented annotation graph is expected and the table given with the 'Table' parameter " + "will contain the document text together with some basic annotations. What exactly is stored in which manner " + "is determined by the jcore-xmi-db-consumer used to write the data into the database.") private Boolean readsBaseDocument; - @ConfigurationParameter(name = PARAM_ANNOTATIONS_TO_LOAD, mandatory = false, description = "An array of qualified UIMA type names. The provided names will be converted to database table column names in an equivalent manner as the XMIDBWriter does when storing the annotations. Thus, by default the columns of the XMI table holding annotation module information are named by lowercased UIMA type name where dots are replaced by underscores.. This can be overwritten by appending ':' to a table name. The given type names will be converted to valid Postgres columns names by replacing dots with underscores and the colon will be converted to the dollar character. From the resolved columns, annotation modules in segmented XMI format are read where an annotation module contains all annotation instances of a specific type in a specific document. 
All annotation modules read this way are merged with the base document, resulting in valid XMI data which is then deserialized into the CAS.") - protected String[] qualifiedAnnotationColumnNames; @ConfigurationParameter(name = PARAM_STORE_XMI_ID, mandatory = false, description = "This parameter is required " + "to be set to true, if this reader is contained in a pipeline that also contains a jcore-xmi-db-writer and" + "the writer will segment the CAS annotation graph and store only parts of it. Then, it is important to " + @@ -68,7 +68,7 @@ public class XmiDBMultiplierReader extends DBMultiplierReader { "(j)visualvm, the hot spots of work can be identified. If one of those is the XML attribute buffer " + "resizing, this parameter should be set to a size that makes buffer resizing unnecessary.") private int xercesAttributeBufferSize; - @ConfigurationParameter(name = PARAM_XMI_META_SCHEMA, mandatory = false, defaultValue = "public", description = "Each XMI file defines a number of XML namespaces according to the types used in the document. Those namespaces are stored in a table named '" +XmiSplitConstants.XMI_NS_TABLE + "' when splitting annotations in annotation modules by the XMI DB writer. This parameter allows to specify in which Postgres schema this table should be looked for. Also, the table listing the annotation tables is stored in this Postgres schema. Defaults to 'public'.") + @ConfigurationParameter(name = PARAM_XMI_META_SCHEMA, mandatory = false, defaultValue = "public", description = "Each XMI file defines a number of XML namespaces according to the types used in the document. Those namespaces are stored in a table named '" + XmiSplitConstants.XMI_NS_TABLE + "' when splitting annotations in annotation modules by the XMI DB writer. This parameter allows to specify in which Postgres schema this table should be looked for. Also, the table listing the annotation tables is stored in this Postgres schema. 
Defaults to 'public'.") private String xmiMetaSchema; private boolean doGzip; private String[] additionalTableNames; @@ -107,7 +107,7 @@ public void getNext(JCas jCas) throws CollectionException { rowBatch.setXercesAttributeBufferSize(xercesAttributeBufferSize); rowBatch.setXmiMetaTablesPostgresSchema(xmiMetaSchema); } catch (Throwable throwable) { - log.error("Exception ocurred while trying to get the next document", throwable); + log.error("Exception occurred while trying to get the next document", throwable); throw throwable; } } @@ -122,6 +122,8 @@ private void adaptReaderConfigurationForXmiData() throws ResourceInitializationE costosysConfig = (String) getConfigParameterValue(PARAM_COSTOSYS_CONFIG_NAME); try { dbc = new DataBaseConnector(costosysConfig); + if (dbc.getMaxConnections() < 3) + dbc.setMaxConnections(3); } catch (FileNotFoundException e) { throw new ResourceInitializationException(e); } @@ -185,7 +187,7 @@ private void determineDataFormat(String table) throws ResourceInitializationExce } private void checkForJeDISBinaryFormat(byte[] firstTwoBytes) { - short header = (short) ((firstTwoBytes[0]<<8) | (0xff & firstTwoBytes[1])); + short header = (short) ((firstTwoBytes[0] << 8) | (0xff & firstTwoBytes[1])); if (header != BinaryJeDISNodeEncoder.JEDIS_BINARY_MAGIC) { useBinaryFormat = false; log.debug("Is data encoded in JeDIS binary format: false"); diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierDifferentNsSchemaTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierDifferentNsSchemaTest.java index ff60e41a0..73dcdc055 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierDifferentNsSchemaTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierDifferentNsSchemaTest.java @@ -31,7 +31,7 @@ public class XmiDBMultiplierDifferentNsSchemaTest { - public static PostgreSQLContainer postgres = 
(PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static int subsetCounter; diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java index cde2d026f..fabc558aa 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierTest.java @@ -31,7 +31,7 @@ public class XmiDBMultiplierTest { - public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:11.12"); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static int subsetCounter; @@ -40,6 +40,7 @@ public static void setup() throws UIMAException, IOException, ConfigurationExcep postgres.start(); XmiDBSetupHelper.createDbcConfig(postgres); DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setMaxConnections(3); costosysConfig = DBTestUtils.createTestCostosysConfig("xmi_text", 10, postgres); new File(costosysConfig).deleteOnExit(); XmiDBSetupHelper.processAndSplitData(costosysConfig, false, false,"public"); @@ -57,6 +58,7 @@ public static void shutdown() { @Test(threadPoolSize = 3, invocationCount = 10, timeOut = 500000) public void testXmiDBMultiplierReader() throws Exception { DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setMaxConnections(5); String xmisubset; synchronized (XmiDBMultiplierDifferentNsSchemaTest.class) { xmisubset = "xmisubset" + subsetCounter++; diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java 
b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java index d2fc88444..5af87e804 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderBinaryFormatTest.java @@ -26,7 +26,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue; public class XmiDBReaderBinaryFormatTest { - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static String xmisubset; diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderDifferentNsSchemaTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderDifferentNsSchemaTest.java index d592bec9e..8ae996691 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderDifferentNsSchemaTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderDifferentNsSchemaTest.java @@ -26,7 +26,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue; public class XmiDBReaderDifferentNsSchemaTest { - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static String xmisubset; diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderGzippedDataTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderGzippedDataTest.java index 9a7fea0b3..e25808419 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderGzippedDataTest.java +++ 
b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderGzippedDataTest.java @@ -31,7 +31,7 @@ * The exact same test as {@link XmiDBReaderTest} but here, the data is gzipped. */ public class XmiDBReaderGzippedDataTest { - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static String xmisubset; @@ -41,7 +41,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf XmiDBSetupHelper.createDbcConfig(postgres); DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); - costosysConfig = DBTestUtils.createTestCostosysConfig("xmi_text", 1, postgres); + costosysConfig = DBTestUtils.createTestCostosysConfig("xmi_text", 2, postgres); new File(costosysConfig).deleteOnExit(); XmiDBSetupHelper.processAndSplitData(costosysConfig, true, false,"public"); assertTrue(dbc.withConnectionQueryBoolean( c -> c.tableExists("_data.documents")), "The data document table exists"); diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderMonolithicDocumentsTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderMonolithicDocumentsTest.java index e0ae7f3ed..8b0dab1d2 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderMonolithicDocumentsTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderMonolithicDocumentsTest.java @@ -28,7 +28,7 @@ public class XmiDBReaderMonolithicDocumentsTest { - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static String xmisubset; @@ -38,7 +38,7 @@ public static 
void setup() throws SQLException, UIMAException, IOException, Conf XmiDBSetupHelper.createDbcConfig(postgres); DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); - costosysConfig = DBTestUtils.createTestCostosysConfig("xmi_complete_cas", 1, postgres); + costosysConfig = DBTestUtils.createTestCostosysConfig("xmi_complete_cas", 2, postgres); new File(costosysConfig).deleteOnExit(); XmiDBSetupHelper.processAndStoreCompleteXMIData(costosysConfig, true); dbc.reserveConnection(); diff --git a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderTest.java b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderTest.java index cf1d089ef..36ca9601a 100644 --- a/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderTest.java +++ b/jcore-xmi-db-reader/src/test/java/de/julielab/jcore/reader/xmi/XmiDBReaderTest.java @@ -28,7 +28,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue; public class XmiDBReaderTest { - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static String xmisubset; diff --git a/jcore-xmi-db-reader/src/test/resources/logback-test.xml b/jcore-xmi-db-reader/src/test/resources/logback-test.xml index edc553153..6a4a567cd 100644 --- a/jcore-xmi-db-reader/src/test/resources/logback-test.xml +++ b/jcore-xmi-db-reader/src/test/resources/logback-test.xml @@ -10,7 +10,8 @@ - + + diff --git a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterBinaryFormatTest.java b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterBinaryFormatTest.java index 15b5fc5c9..135affc2d 100644 --- a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterBinaryFormatTest.java +++ 
b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterBinaryFormatTest.java @@ -42,7 +42,7 @@ @Testcontainers public class XmiDBWriterBinaryFormatTest { @Container - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static String xmlSubsetTable; private static DataBaseConnector dbc; diff --git a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterMonolithicDocumentTest.java b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterMonolithicDocumentTest.java index 6af2d578d..6f8611d29 100644 --- a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterMonolithicDocumentTest.java +++ b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterMonolithicDocumentTest.java @@ -31,7 +31,7 @@ @Testcontainers public class XmiDBWriterMonolithicDocumentTest { @Container - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static DataBaseConnector dbc; diff --git a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java index 5f3a979bb..fbcb62164 100644 --- a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java +++ b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java @@ -31,7 +31,7 @@ @Testcontainers public class XmiDBWriterTest { @Container - public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); + public static PostgreSQLContainer 
postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static String xmlSubsetTable; private static DataBaseConnector dbc; diff --git a/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java b/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java index f14839236..3e2cd9f79 100644 --- a/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java +++ b/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java @@ -49,7 +49,7 @@ public class XMLDBMultiplierTest { private static final String MAX_XMI_ID_FIELD_NAME = "max_xmi_id"; private static final String SOFA_MAPPING_FIELD_NAME = "sofa_mapping"; private static final String SUBSET_TABLE = "test_subset"; - public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:11.12"); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; @BeforeAll @@ -59,7 +59,7 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); dbc.setActiveTableSchema("medline_2016_nozip"); - costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2016_nozip", 1, postgres); + costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2016_nozip", 2, postgres); new File(costosysConfig).deleteOnExit(); try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { // We create two tables. One is the XML table the multiplier reads from and maps the contents to the JCas. From f1f84d55b4dc08cf52475d42bfd3675a9378d8bb Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 1 Jul 2021 13:38:05 +0200 Subject: [PATCH 076/269] Added "cut away characters" to JCoReCondensedDocumentText. 
--- .../java/banner/tagging/pipe/LemmaPOS.java | 14 +- .../jcore/ae/jsbd/main/SentenceAnnotator.java | 136 ++++++++++-------- .../ae/jsbd/main/SentenceAnnotatorTest.java | 48 ++++--- .../src/test/resources/errordocs/README.md | 4 + .../utility/JCoReCondensedDocumentText.java | 41 ++++-- .../JCoReCondensedDocumentTextTest.java | 56 ++++++++ .../src/test/resources/PMC5478802.xmi | 5 + .../jcore/consumer/xmi/XmiDBWriterTest.java | 40 +++++- 8 files changed, 251 insertions(+), 93 deletions(-) create mode 100644 jcore-jsbd-ae/src/test/resources/errordocs/README.md create mode 100644 jcore-utilities/src/test/resources/PMC5478802.xmi diff --git a/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java b/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java index 1c28c28b0..8068cfa1b 100644 --- a/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java +++ b/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java @@ -43,14 +43,16 @@ public LemmaPOS(Lemmatiser lemmatiser, Tagger posTagger) { public void setLemmatiser(Lemmatiser lemmatiser) { initResourcesMap(); getResources().lemmatiser = lemmatiser; + System.out.println("Setting lemmatiser to " + Thread.currentThread()); } public void setPosTagger(Tagger posTagger) { initResourcesMap(); getResources().posTagger = posTagger; + System.out.println("Setting PoS Tagger to " + Thread.currentThread()); } - private void initResourcesMap() { + synchronized private void initResourcesMap() { if (resourcesByThread == null) resourcesByThread = new HashMap<>(); } @@ -67,7 +69,7 @@ private Resources getResources() { @Override public Instance pipe(Instance carrier) { if (expectLemmatiser != (getResources().lemmatiser != null)) - throw new IllegalStateException("Model was trained with lemmatiser; not present in current config"); + throw new IllegalStateException("Model was trained with lemmatiser; not present in current config; resource map: " + resourcesByThread + ", current thread: " + 
Thread.currentThread()); if (expectPOSTagger != (getResources().posTagger != null)) throw new IllegalStateException("Model was trained with POS tagger; not present in current config"); // TODO Add prefix ability @@ -112,5 +114,13 @@ public Instance pipe(Instance carrier) { private class Resources { public Lemmatiser lemmatiser; public Tagger posTagger; + + @Override + public String toString() { + return "Resources{" + + "lemmatiser=" + lemmatiser + + ", posTagger=" + posTagger + + '}'; + } } } diff --git a/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java b/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java index a27107477..c91869654 100644 --- a/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java +++ b/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java @@ -146,77 +146,91 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept * @throws AnalysisEngineProcessException */ public void process(JCas aJCas) throws AnalysisEngineProcessException { - if (StringUtils.isBlank(aJCas.getDocumentText())) { - final String docId = JCoReTools.getDocId(aJCas); - LOGGER.warn("The document text of document {} is empty.", docId); - return; - } - JCoReCondensedDocumentText documentText; try { - // If there are no cut-away types, the document text will remain unchanged. 
- documentText = new JCoReCondensedDocumentText(aJCas, cutAwayTypes); - } catch (ClassNotFoundException e1) { - throw new AnalysisEngineProcessException(e1); - } - - if (sentenceDelimiterTypes != null) { + if (StringUtils.isBlank(aJCas.getDocumentText())) { + final String docId = JCoReTools.getDocId(aJCas); + LOGGER.warn("The document text of document {} is empty.", docId); + return; + } + JCoReCondensedDocumentText documentText; try { - // the index merger gives us access to all delimiter type - // indexes in one - JCoReAnnotationIndexMerger indexMerger = new JCoReAnnotationIndexMerger(sentenceDelimiterTypes, false, - null, aJCas); + // If there are no cut-away types, the document text will remain unchanged. + documentText = new JCoReCondensedDocumentText(aJCas, cutAwayTypes); + } catch (ClassNotFoundException e1) { + LOGGER.error("Could not create the text without annotations to be cut away in document {}", JCoReTools.getDocId(aJCas), e1); + throw new AnalysisEngineProcessException(e1); + } - // the idea: collect all start and end offsets of sentence - // delimiter annotations (sections, titles, captions, ...) in a - // list and sort ascending; then, perform sentence segmentation - // between every two adjacent offsets. This way, no sentence can - // cross any delimiter annotation border - List borders = new ArrayList<>(); - borders.add(0); - borders.add(aJCas.getDocumentText().length()); - while (indexMerger.incrementAnnotation()) { - Annotation a = (Annotation) indexMerger.getAnnotation(); - // Here we convert the original offsets to the condensed offsets. If there are - // no cut-away types, the offsets will just remain unchanged. Otherwise we now - // have the borders of the condensed text passages associated with the sentence - // delimiter annotation. 
- borders.add(documentText.getCondensedOffsetForOriginalOffset(a.getBegin())); - borders.add(documentText.getCondensedOffsetForOriginalOffset(a.getEnd())); - } - borders.sort(null); + if (sentenceDelimiterTypes != null) { + try { + // the index merger gives us access to all delimiter type + // indexes in one + JCoReAnnotationIndexMerger indexMerger = new JCoReAnnotationIndexMerger(sentenceDelimiterTypes, false, + null, aJCas); - // now do sentence segmentation between annotation borders - for (int i = 1; i < borders.size(); ++i) { - int start = borders.get(i - 1); - int end = borders.get(i); + // the idea: collect all start and end offsets of sentence + // delimiter annotations (sections, titles, captions, ...) in a + // list and sort ascending; then, perform sentence segmentation + // between every two adjacent offsets. This way, no sentence can + // cross any delimiter annotation border + List borders = new ArrayList<>(); + borders.add(0); + borders.add(aJCas.getDocumentText().length()); + while (indexMerger.incrementAnnotation()) { + Annotation a = (Annotation) indexMerger.getAnnotation(); + System.out.println(a.getCoveredText()); + System.out.println("--"); + System.out.println(documentText.getCodensedText().substring(documentText.getCondensedOffsetForOriginalOffset(a.getBegin()), documentText.getOriginalOffsetForCondensedOffset(a.getEnd()))); + System.out.println(a.getBegin() + " - " + a.getEnd() + ", " + documentText.getCondensedOffsetForOriginalOffset(a.getBegin()) + " - " + documentText.getOriginalOffsetForCondensedOffset(a.getEnd())); + System.out.println(); + // Here we convert the original offsets to the condensed offsets. If there are + // no cut-away types, the offsets will just remain unchanged. Otherwise we now + // have the borders of the condensed text passages associated with the sentence + // delimiter annotation. 
+ borders.add(documentText.getCondensedOffsetForOriginalOffset(a.getBegin())); + assert borders.get(borders.size() - 1) < documentText.getCodensedText().length(); + borders.add(documentText.getCondensedOffsetForOriginalOffset(a.getEnd())); + assert borders.get(borders.size() - 1) < documentText.getCodensedText().length() : "Original offset "+a.getEnd()+" is mapped to condensed offset " + documentText.getCondensedOffsetForOriginalOffset(a.getEnd()); + } + borders.sort(null); - // skip leading whites spaces - while (start < end && Character.isWhitespace(aJCas.getDocumentText().charAt(start))) - ++start; + // now do sentence segmentation between annotation borders + for (int i = 1; i < borders.size(); ++i) { + int start = borders.get(i - 1); + int end = borders.get(i); - // get the string between the current annotation borders and recognize sentences - String textSpan = documentText.getCodensedText().substring(start, end); - if (!StringUtils.isBlank(textSpan)) - doSegmentation(documentText, textSpan, start); - } + // skip leading whites spaces + while (start < end && Character.isWhitespace(aJCas.getDocumentText().charAt(start))) + ++start; - } catch (ClassNotFoundException e) { - throw new AnalysisEngineProcessException(e); - } - } else { - // if no processingScope set -> use documentText - if (aJCas.getDocumentText() != null && aJCas.getDocumentText().length() > 0) { - doSegmentation(documentText, documentText.getCodensedText(), 0); - } else { - if (numEmptyCases.get() < 10) { - LOGGER.debug("document text empty. Skipping this document."); - numEmptyCases.incrementAndGet(); - } else if (numEmptyCases.get() == 10) { - LOGGER.warn("Encountered 10 documents with an empty text body. 
This message will not appear again " + - "to avoid scrolling in cases where this is expected."); + // get the string between the current annotation borders and recognized sentences + String textSpan = documentText.getCodensedText().substring(start, end); + if (!StringUtils.isBlank(textSpan)) + doSegmentation(documentText, textSpan, start); + } + + } catch (ClassNotFoundException e) { + throw new AnalysisEngineProcessException(e); } + } else { + // sentence delimiter types are not given + // if no processingScope set -> use documentText + if (aJCas.getDocumentText() != null && aJCas.getDocumentText().length() > 0) { + doSegmentation(documentText, documentText.getCodensedText(), 0); + } else { + if (numEmptyCases.get() < 10) { + LOGGER.debug("document text empty. Skipping this document."); + numEmptyCases.incrementAndGet(); + } else if (numEmptyCases.get() == 10) { + LOGGER.warn("Encountered 10 documents with an empty text body. This message will not appear again " + + "to avoid scrolling in cases where this is expected."); + } + } } + } catch (Throwable t) { + LOGGER.error("Could not perform sentence splitting of document {}", JCoReTools.getDocId(aJCas), t); + throw t; } } diff --git a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java index 0f0870ae8..1455b9339 100644 --- a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java +++ b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java @@ -25,6 +25,7 @@ import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.impl.XmiCasDeserializer; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; @@ -39,7 +40,9 @@ import org.slf4j.LoggerFactory; 
import java.io.File; +import java.io.FileInputStream; import java.io.IOException; +import java.nio.file.Path; import java.util.*; import java.util.stream.Collectors; @@ -284,26 +287,31 @@ public void testSplitAtNewlines() throws Exception { assertThat(sentences).containsExactly("line1", "line2", "line3"); } -// -// @Test -// public void testmuh() throws Exception { -// JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", -// "de.julielab.jcore.types.jcore-document-structure-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", -// "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types"); -// -// XmiCasDeserializer.deserialize(new FileInputStream("/Users/faessler/uima-pipelines/jedis-doc-to-xmi/data/output-xmi/4768370.xmi"), jCas.getCas()); -// JCasUtil.select(jCas, Sentence.class).forEach(Annotation::removeFromIndexes); -// AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, -// "/Users/faessler/Coding/git/jcore-projects/jcore-jsbd-ae-biomedical-english/src/main/resources/de/julielab/jcore/ae/jsbd/model/jsbd-biomed-oversampled-abstracts-split-at-punctuation.mod.gz", SentenceAnnotator.PARAM_MAX_SENTENCE_LENGTH, 1000); -// -// jsbd.process(jCas.getCas()); -// -// Set set = new TreeSet<>(); -// for (Sentence s : JCasUtil.select(jCas, Sentence.class)) { -// set.add(s.getEnd() - s.getBegin()); -// } -// XmiCasSerializer.serialize(jCas.getCas(), new FileOutputStream("smallSentences.xmi")); -// } + + @Test + public void testErrordoc() throws Exception { + // The XMI document uses here is from PMC and is an example of a source of error the previously occurred. 
+ JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", + "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types"); + + XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "errordocs", "PMC5478802.xmi").toFile()), jCas.getCas()); + JCasUtil.select(jCas, Sentence.class).forEach(Annotation::removeFromIndexes); + AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, + "/Users/faessler/Coding/git/jcore-projects/jcore-jsbd-ae-biomedical-english/src/main/resources/de/julielab/jcore/ae/jsbd/model/jsbd-biomed-oversampled-abstracts-split-at-punctuation.mod.gz", + SentenceAnnotator.PARAM_MAX_SENTENCE_LENGTH, 1000, + SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, new String[]{ + "de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"}, + SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{de.julielab.jcore.types.pubmed.InternalReference.class.getCanonicalName()} + ); + + jsbd.process(jCas.getCas()); + for (var s : JCasUtil.select(jCas, Sentence.class)) { + System.out.println(s.getCoveredText()); + System.out.println("--"); + } + + } } diff --git a/jcore-jsbd-ae/src/test/resources/errordocs/README.md b/jcore-jsbd-ae/src/test/resources/errordocs/README.md new file mode 100644 index 000000000..d2278611f --- /dev/null +++ b/jcore-jsbd-ae/src/test/resources/errordocs/README.md @@ -0,0 +1,4 @@ +# Errored Documents for Tests + +Documents in this directory were subject of sentence splitting errors. 
The errors are fixed +using the documents in a test. \ No newline at end of file diff --git a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java index a3e4bd532..76a8c5f45 100644 --- a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java +++ b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java @@ -22,10 +22,28 @@ public class JCoReCondensedDocumentText { private NavigableMap originalPos2SumCutMap; private String condensedText; private JCas cas; + private Set cutAwayFillCharacters; public JCas getCas() { return cas; } + /** + *

+ * Cuts away the covered text of annotations of a type in cutAwayTypes + * from the cas document text. If cutAwayTypes is null or + * empty, this class' methods will return the original CAS data. + *

+ * + * @param cas + * The CAS for which the document text should be cut. + * @param cutAwayTypes + * The types for cutting. May be null. + * @throws ClassNotFoundException + * If cutAwayTypes contains non-existing type names. + */ + public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes) throws ClassNotFoundException { + this(cas, cutAwayTypes, null); + } /** *

@@ -33,16 +51,21 @@ public JCas getCas() { * from the cas document text. If cutAwayTypes is null or * empty, this class' methods will return the original CAS data. *

+ *

The cutAwayFillCharacters set may provide characters that, when being the only character between + * two cut-away annotations, will add to the span of text being cut away. This way, enumerations of references + * (e.g. "4,6,8") can be completely removed, for example.

* * @param cas * The CAS for which the document text should be cut. * @param cutAwayTypes * The types for cutting. May be null. + * @param cutAwayFillCharacters Characters that, when being the only separator between two cut away annotations, are also cut away. * @throws ClassNotFoundException * If cutAwayTypes contains non-existing type names. */ - public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes) throws ClassNotFoundException { + public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes, Set cutAwayFillCharacters) throws ClassNotFoundException { this.cas = cas; + this.cutAwayFillCharacters = cutAwayFillCharacters; buildMap(cas, cutAwayTypes); } @@ -80,24 +103,26 @@ public void buildMap(JCas cas, Set cutAwayTypes) throws ClassNotFoundExc int lastBegin = 0; int lastEnd = -1; // For each ignored annotation, there could be following annotations overlapping - // with the first, effectively enlargeing the ignored span. Thus, we iterate - // until we find an ignored annotation the has a positive (not 0) distance to a + // with the first, effectively enlarging the ignored span. Thus, we iterate + // until we find an ignored annotation that has a positive (not 0) distance to a // previous one. Then, we store the length of the span of cut-away annotations // for the largest end of the previous annotations. 
while (merger.incrementAnnotation()) { int end = merger.getCurrentEnd(); int begin = merger.getCurrentBegin(); - if (lastEnd > 0 && begin > lastEnd) { + boolean moreThanOneCharacterDistance = begin - lastEnd > 2; + boolean previousCharacterIsCutAwayDelimiter = cutAwayFillCharacters == null || cutAwayFillCharacters.isEmpty() || (begin - lastEnd == 2 && cutAwayFillCharacters.contains(cas.getDocumentText().charAt(begin - 1))); + if (lastEnd > 0 && begin > lastEnd && (previousCharacterIsCutAwayDelimiter || moreThanOneCharacterDistance)) { cutSum += lastEnd - lastBegin; int condensedPosition = lastEnd - cutSum + 1; condensedPos2SumCutMap.put(condensedPosition, cutSum); originalPos2SumCutMap.put(lastEnd, cutSum); lastBegin = begin; - sb.append(cas.getDocumentText().substring(lastEnd, begin)); + sb.append(cas.getDocumentText(), lastEnd, begin); } else if (lastEnd < 0) { lastBegin = begin; - sb.append(cas.getDocumentText().substring(0, begin)); + sb.append(cas.getDocumentText(), 0, begin); } lastEnd = end; } @@ -110,11 +135,11 @@ public void buildMap(JCas cas, Set cutAwayTypes) throws ClassNotFoundExc condensedPos2SumCutMap.put(condensedPosition, cutSum); originalPos2SumCutMap.put(lastEnd, cutSum); } - // If lastEnd is still -1 one, we just did not find any of the cut away annotations. Thus, we just copy the whole text. + // If lastEnd is still -1, we just did not find any of the cut away annotations. Thus, we just copy the whole text. 
if (lastEnd == -1) lastEnd = 0; if (lastEnd < cas.getDocumentText().length()) - sb.append(cas.getDocumentText().substring(lastEnd, cas.getDocumentText().length())); + sb.append(cas.getDocumentText().substring(lastEnd)); condensedText = sb.toString(); } diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java index 12672e122..58fdcc137 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java @@ -1,12 +1,17 @@ package de.julielab.jcore.utility; import de.julielab.jcore.types.InternalReference; +import org.apache.uima.cas.impl.XmiCasDeserializer; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; import org.junit.jupiter.api.Test; +import java.io.FileInputStream; +import java.nio.file.Path; import java.util.Arrays; import java.util.HashSet; +import java.util.Set; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -59,4 +64,55 @@ public void testReduce2() throws Exception { assertEquals(28, condensedText.getCondensedOffsetForOriginalOffset(30)); assertEquals(29, condensedText.getCondensedOffsetForOriginalOffset(31)); } + + @Test + public void testReduce3() throws Exception { + // Here we also add commas as cut away characters, offering the possibility to remove enumerations of + // references completely. 
+ JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("This sentence has multiple references.2,5,42 This is a second sentence.7,8"); + InternalReference ref1 = new InternalReference(jcas, 38, 39); + ref1.addToIndexes(); + InternalReference ref2 = new InternalReference(jcas, 40, 41); + ref2.addToIndexes(); + InternalReference ref3 = new InternalReference(jcas, 42, 44); + ref3.addToIndexes(); + InternalReference ref4 = new InternalReference(jcas, 71, 72); + ref4.addToIndexes(); + InternalReference ref5 = new InternalReference(jcas, 73, 74); + ref5.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(InternalReference.class.getCanonicalName())), Set.of(',')); + assertEquals("This sentence has multiple references. This is a second sentence.", condensedText.getCodensedText()); + } + + @Test + public void testErrorDoc() throws Exception{ + // The XMI document uses here is from PMC and is an example of a source of error the previously occurred. 
+ JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", + "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types"); + + XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "PMC5478802.xmi").toFile()), jCas.getCas()); + JCoReCondensedDocumentText text = new JCoReCondensedDocumentText(jCas, Set.of(de.julielab.jcore.types.pubmed.InternalReference.class.getCanonicalName())); +// Set sentenceBoundaryTypes = Set.of("de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"); + Set sentenceBoundaryTypes = Set.of("de.julielab.jcore.types.Section"); + JCoReAnnotationIndexMerger indexMerger = new JCoReAnnotationIndexMerger(sentenceBoundaryTypes, false, + null, jCas); + + while (indexMerger.incrementAnnotation()) { + Annotation a = (Annotation) indexMerger.getAnnotation(); + System.out.println(a.getCoveredText()); + System.out.println("--"); + int condensedBegin = text.getCondensedOffsetForOriginalOffset(a.getBegin()); + int condensedEnd = text.getOriginalOffsetForCondensedOffset(a.getEnd()); + if (condensedEnd > text.getCodensedText().length()) + System.out.println(); + System.out.println(text.getCodensedText().substring(condensedBegin, condensedEnd)); + System.out.println(a.getBegin() + " - " + a.getEnd() + ", " + condensedBegin + " - " + condensedEnd); + System.out.println(); + } + } } diff --git a/jcore-utilities/src/test/resources/PMC5478802.xmi b/jcore-utilities/src/test/resources/PMC5478802.xmi new file mode 100644 index 000000000..c4d8ca95a --- 
/dev/null +++ b/jcore-utilities/src/test/resources/PMC5478802.xmi @@ -0,0 +1,5 @@ + +PMC5478802 \ No newline at end of file diff --git a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java index fbcb62164..68150ad75 100644 --- a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java +++ b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java @@ -19,6 +19,7 @@ import org.testcontainers.junit.jupiter.Testcontainers; import java.io.IOException; +import java.sql.ResultSet; import java.sql.SQLException; import java.util.List; import java.util.Map; @@ -33,7 +34,6 @@ public class XmiDBWriterTest { @Container public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; - private static String xmlSubsetTable; private static DataBaseConnector dbc; @BeforeAll @@ -41,8 +41,8 @@ public static void setup() throws SQLException, UIMAException, IOException, Conf dbc = DBTestUtils.getDataBaseConnector(postgres); dbc.reserveConnection(); costosysConfig = DBTestUtils.createTestCostosysConfig("medline_2017", 1, postgres); - xmlSubsetTable = DBTestUtils.setupDatabase(dbc, "src/test/resources/pubmedsample18n0001.xml.gz", "medline_2017", 177, postgres); dbc.releaseConnections(); + DBTestUtils.createAndSetHiddenConfig("src/test/resources/hiddenConfig.txt", postgres); } @AfterAll @@ -185,4 +185,40 @@ public void testXmiDBWriterSplitAnnotationsDefaultAnnotationSchemas() throws Exc assertThat(columnNames).contains(tokenColumn, sentenceColumn); } } + + @Test + public void testXmiSubtypeStorage() throws Exception { + + AnalysisEngine xmiWriter = AnalysisEngineFactory.createEngine("de.julielab.jcore.consumer.xmi.desc.jcore-xmi-db-writer", + XMIDBWriter.PARAM_ANNOS_TO_STORE, new String[]{Token.class.getCanonicalName(), 
Sentence.class.getCanonicalName()}, + XMIDBWriter.PARAM_COSTOSYS_CONFIG, costosysConfig, + XMIDBWriter.PARAM_STORE_ALL, false, + XMIDBWriter.PARAM_STORE_BASE_DOCUMENT, true, + XMIDBWriter.PARAM_TABLE_DOCUMENT, "_data.documents3", + XMIDBWriter.PARAM_DO_GZIP, false, + XMIDBWriter.PARAM_STORE_RECURSIVELY, true, + XMIDBWriter.PARAM_UPDATE_MODE, true, + XMIDBWriter.PARAM_BASE_DOCUMENT_ANNOTATION_TYPES, new String[]{InternalReference.class.getCanonicalName()} + ); + JCas jCas = getJCasWithRequiredTypes(); + final Header header = new Header(jCas); + header.setDocId("789"); + header.addToIndexes(); + jCas.setDocumentText("This is a sentence.1,2"); + new de.julielab.jcore.types.pubmed.InternalReference(jCas, 19, 20).addToIndexes(); + new de.julielab.jcore.types.pubmed.InternalReference(jCas, 21, 22).addToIndexes(); + assertThatCode(() -> xmiWriter.process(jCas)).doesNotThrowAnyException(); + jCas.reset(); + xmiWriter.collectionProcessComplete(); + + dbc = DBTestUtils.getDataBaseConnector(postgres); + try (CoStoSysConnection ignored = dbc.obtainOrReserveConnection()) { + assertThat(dbc.tableExists("_data.documents3")).isTrue(); + ResultSet rs = ignored.createStatement().executeQuery("SELECT " + XmiSplitConstants.BASE_DOC_COLUMN + " FROM " + "_data.documents3"); + assertThat(rs.next()).isTrue(); + String documentString = rs.getString(1); + System.out.println(documentString); + + } + } } From 7557e934527b16c0704d3551a4cd6303933e5aa7 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 1 Jul 2021 14:49:07 +0200 Subject: [PATCH 077/269] JSBD: Fixed a bug where the document length offset was not condensation-adjusted. Fixed #121 Also adding a comma to the cut away characters. 
--- .../jcore/ae/jsbd/main/SentenceAnnotator.java | 11 +- .../ae/jsbd/main/SentenceAnnotatorTest.java | 497 +++++++++--------- .../test/resources/errordocs/PMC5478802.xmi | 5 + .../JCoReCondensedDocumentTextTest.java | 18 +- 4 files changed, 264 insertions(+), 267 deletions(-) create mode 100644 jcore-jsbd-ae/src/test/resources/errordocs/PMC5478802.xmi diff --git a/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java b/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java index c91869654..fe5cbd833 100644 --- a/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java +++ b/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java @@ -155,7 +155,7 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { JCoReCondensedDocumentText documentText; try { // If there are no cut-away types, the document text will remain unchanged. - documentText = new JCoReCondensedDocumentText(aJCas, cutAwayTypes); + documentText = new JCoReCondensedDocumentText(aJCas, cutAwayTypes, Set.of(',')); } catch (ClassNotFoundException e1) { LOGGER.error("Could not create the text without annotations to be cut away in document {}", JCoReTools.getDocId(aJCas), e1); throw new AnalysisEngineProcessException(e1); @@ -175,22 +175,15 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { // cross any delimiter annotation border List borders = new ArrayList<>(); borders.add(0); - borders.add(aJCas.getDocumentText().length()); + borders.add(documentText.getCondensedOffsetForOriginalOffset(aJCas.getDocumentText().length())); while (indexMerger.incrementAnnotation()) { Annotation a = (Annotation) indexMerger.getAnnotation(); - System.out.println(a.getCoveredText()); - System.out.println("--"); - System.out.println(documentText.getCodensedText().substring(documentText.getCondensedOffsetForOriginalOffset(a.getBegin()), 
documentText.getOriginalOffsetForCondensedOffset(a.getEnd()))); - System.out.println(a.getBegin() + " - " + a.getEnd() + ", " + documentText.getCondensedOffsetForOriginalOffset(a.getBegin()) + " - " + documentText.getOriginalOffsetForCondensedOffset(a.getEnd())); - System.out.println(); // Here we convert the original offsets to the condensed offsets. If there are // no cut-away types, the offsets will just remain unchanged. Otherwise we now // have the borders of the condensed text passages associated with the sentence // delimiter annotation. borders.add(documentText.getCondensedOffsetForOriginalOffset(a.getBegin())); - assert borders.get(borders.size() - 1) < documentText.getCodensedText().length(); borders.add(documentText.getCondensedOffsetForOriginalOffset(a.getEnd())); - assert borders.get(borders.size() - 1) < documentText.getCodensedText().length() : "Original offset "+a.getEnd()+" is mapped to condensed offset " + documentText.getCondensedOffsetForOriginalOffset(a.getEnd()); } borders.sort(null); diff --git a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java index 1455b9339..22edbe983 100644 --- a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java +++ b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java @@ -1,17 +1,17 @@ -/** +/** * SentenceAnnotatorTest.java - * + *

* Copyright (c) 2015, JULIE Lab. - * All rights reserved. This program and the accompanying materials + * All rights reserved. This program and the accompanying materials * are made available under the terms of the BSD-2-Clause License - * + *

* Author: tomanek - * + *

* Current version: 2.2 * Since version: 1.0 - * - * Creation date: Nov 29, 2006 - * + *

+ * Creation date: Nov 29, 2006 + *

* This is a JUnit test for the SentenceAnnotator. **/ @@ -47,271 +47,268 @@ import java.util.stream.Collectors; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatCode; import static org.junit.jupiter.api.Assertions.*; + public class SentenceAnnotatorTest { - /** - * Logger for this class - */ - private static final Logger LOGGER = LoggerFactory.getLogger(SentenceAnnotatorTest.class); - - private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties"; - - // uncomment to test with/without scope - // private static final String DESCRIPTOR = - // "src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml"; - private static final String DESCRIPTOR = "src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml"; - - // last sentence has no EOS symbol to test that also this is handled - // correctly - private static final String[] TEST_TEXT = { "First sentence. Second \t sentence! \n Last sentence?", - "Hallo, jemand da? Nein, niemand.", "A test. It can't be just one sentence. 
Testing the test.", "" }; - - private static final String[] TEST_TEXT_OFFSETS = { "0-15;16-34;40-54", "0-17;18-32", "0-7;8-38;39-56", "" }; - - private static final int[] endOffsets = { 54, 32, 27, 0 }; - - /** - * Use the model in resources, split the text in TEST_TEXT and compare the - * split result against TEST_TEXT_OFFSETS - */ - @Test - public void testProcess() { - - boolean annotationsOK = true; - - XMLInputSource sentenceXML = null; - ResourceSpecifier sentenceSpec = null; - AnalysisEngine sentenceAnnotator = null; - - try { - sentenceXML = new XMLInputSource(DESCRIPTOR); - sentenceSpec = UIMAFramework.getXMLParser().parseResourceSpecifier(sentenceXML); - sentenceAnnotator = UIMAFramework.produceAnalysisEngine(sentenceSpec); - } catch (Exception e) { - LOGGER.error("testProcess()", e); - } - - for (int i = 0; i < TEST_TEXT.length; i++) { - - JCas jcas = null; - try { - jcas = sentenceAnnotator.newJCas(); - } catch (ResourceInitializationException e) { - LOGGER.error("testProcess()", e); - } - - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("testProcess() - testing text: " + TEST_TEXT[i]); - } - jcas.setDocumentText(TEST_TEXT[i]); - - // make one test scope ranging over complete document text - // annotations for the processing scope - TestScope scope1 = new TestScope(jcas, 0, endOffsets[i]); - scope1.addToIndexes(); - // TestScope scope2 = new TestScope(jcas,37,54); - - - try { - sentenceAnnotator.process(jcas, null); - } catch (Exception e) { - LOGGER.error("testProcess()", e); - } - - // get the offsets of the sentences - JFSIndexRepository indexes = jcas.getJFSIndexRepository(); - Iterator sentIter = indexes.getAnnotationIndex(Sentence.type).iterator(); - - String predictedOffsets = getPredictedOffsets(i, sentIter); - - // compare offsets - if (!predictedOffsets.equals(TEST_TEXT_OFFSETS[i])) { - annotationsOK = false; - continue; - } - } - assertTrue(annotationsOK); - } - - - private String getPredictedOffsets(int i, Iterator sentIter) { - String 
predictedOffsets = ""; - while (sentIter.hasNext()) { - Sentence s = (Sentence) sentIter.next(); - LOGGER.debug("sentence: " + s.getCoveredText() + ": " + s.getBegin() + " - " + s.getEnd()); - predictedOffsets += (predictedOffsets.length() > 0) ? ";" : ""; - predictedOffsets += s.getBegin() + "-" + s.getEnd(); - } - - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("testProcess() - predicted: " + predictedOffsets); - } - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("testProcess() - wanted: " + TEST_TEXT_OFFSETS[i]); - } - return predictedOffsets; - } - - @Test - public void testUimaFitIntegration() throws UIMAException, IOException { - AnalysisEngine sentenceAE = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, - SentenceAnnotator.PARAM_MODEL_FILE, "de/julielab/jcore/ae/jsbd/model/test-model.gz", - SentenceAnnotator.PARAM_POSTPROCESSING, "biomed"); - JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types"); - String abstractText = FileUtils.readFileToString(new File("src/test/resources/test-abstract.txt"), "UTF-8"); - cas.setDocumentText(abstractText); - sentenceAE.process(cas); - Collection sentences = JCasUtil.select(cas, Sentence.class); - for (Sentence sentence : sentences) { - System.out.println(sentence.getCoveredText()); - } - assertEquals(14, sentences.size()); - } - - @Test - public void testModelClassPathResource() throws Exception { - AnalysisEngine sentenceAE = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, - SentenceAnnotator.PARAM_MODEL_FILE, "de/julielab/jcore/ae/jsbd/model/test-model.gz", - SentenceAnnotator.PARAM_POSTPROCESSING, "biomed"); - JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types"); - String abstractText = FileUtils.readFileToString(new File("src/test/resources/test-abstract.txt"), "UTF-8"); - cas.setDocumentText(abstractText); - sentenceAE.process(cas); - Collection sentences = JCasUtil.select(cas, Sentence.class); - 
System.out.println(sentences.size()); - for (Sentence sentence : sentences) { - System.out.println(sentence.getCoveredText()); - } - assertEquals(14, sentences.size()); - } - - @Test - public void testSentenceDelimiterTypes() throws Exception { - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", - "de.julielab.jcore.types.jcore-document-structure-types"); - - jCas.setDocumentText("Introduction " + "We here show good results. This is a figure caption " - + "And this is a paragraph without a fullstop for some reason " + "Conclusion " - + "We are the greatest."); - Title t1 = new Title(jCas, 0, 12); - Caption c = new Caption(jCas, 40, 64); - Paragraph p = new Paragraph(jCas, 65, 123); - Title t2 = new Title(jCas, 124, 134); - t1.addToIndexes(); - c.addToIndexes(); - p.addToIndexes(); - t2.addToIndexes(); - assertEquals("Introduction", t1.getCoveredText()); - assertEquals("This is a figure caption", c.getCoveredText()); - assertEquals("And this is a paragraph without a fullstop for some reason", p.getCoveredText()); - assertEquals("Conclusion", t2.getCoveredText()); - - AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, - "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, - new LinkedHashSet( - Arrays.asList(Title.class.getName(), Caption.class.getName(), Paragraph.class.getName()))); - - jsbd.process(jCas.getCas()); - - Set> expectedSpans = new HashSet<>(); - expectedSpans.add(Range.between(0, 12)); - expectedSpans.add(Range.between(13, 39)); - expectedSpans.add(Range.between(40, 64)); - expectedSpans.add(Range.between(65, 123)); - expectedSpans.add(Range.between(124, 134)); - expectedSpans.add(Range.between(135, 155)); - - FSIterator it = jCas.getAnnotationIndex(Sentence.type).iterator(); - assertTrue(it.hasNext()); - while (it.hasNext()) { - Annotation sentence = it.next(); - Range sentenceRange = 
Range.between(sentence.getBegin(), sentence.getEnd()); - assertTrue(expectedSpans.remove(sentenceRange), "Range " + sentenceRange + " was not expected"); - } - assertTrue(expectedSpans.isEmpty()); - } - - @Test - public void testSentenceWhitespaces() throws Exception { - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", - "de.julielab.jcore.types.jcore-document-structure-types"); - - // This text is taken from pmid 23092121 - jCas.setDocumentText(" : We present a theoretical study of the electronic subband structure and collective electronic excitation associated with plasmon and surface plasmon modes in metal-based hollow nanosphere. The dependence of the electronic subband energy on the sample parameters of the hollow nanosphere is examined."); - - AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, - "de/julielab/jcore/ae/jsbd/model/test-model.gz"); - - jsbd.process(jCas.getCas()); + /** + * Logger for this class + */ + private static final Logger LOGGER = LoggerFactory.getLogger(SentenceAnnotatorTest.class); + + private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties"; + + // uncomment to test with/without scope + // private static final String DESCRIPTOR = + // "src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml"; + private static final String DESCRIPTOR = "src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml"; + + // last sentence has no EOS symbol to test that also this is handled + // correctly + private static final String[] TEST_TEXT = {"First sentence. Second \t sentence! \n Last sentence?", + "Hallo, jemand da? Nein, niemand.", "A test. It can't be just one sentence. 
Testing the test.", ""}; + + private static final String[] TEST_TEXT_OFFSETS = {"0-15;16-34;40-54", "0-17;18-32", "0-7;8-38;39-56", ""}; + + private static final int[] endOffsets = {54, 32, 27, 0}; + + /** + * Use the model in resources, split the text in TEST_TEXT and compare the + * split result against TEST_TEXT_OFFSETS + */ + @Test + public void testProcess() { + + boolean annotationsOK = true; + + XMLInputSource sentenceXML = null; + ResourceSpecifier sentenceSpec = null; + AnalysisEngine sentenceAnnotator = null; + + try { + sentenceXML = new XMLInputSource(DESCRIPTOR); + sentenceSpec = UIMAFramework.getXMLParser().parseResourceSpecifier(sentenceXML); + sentenceAnnotator = UIMAFramework.produceAnalysisEngine(sentenceSpec); + } catch (Exception e) { + LOGGER.error("testProcess()", e); + } + + for (int i = 0; i < TEST_TEXT.length; i++) { + + JCas jcas = null; + try { + jcas = sentenceAnnotator.newJCas(); + } catch (ResourceInitializationException e) { + LOGGER.error("testProcess()", e); + } + + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("testProcess() - testing text: " + TEST_TEXT[i]); + } + jcas.setDocumentText(TEST_TEXT[i]); + + // make one test scope ranging over complete document text + // annotations for the processing scope + TestScope scope1 = new TestScope(jcas, 0, endOffsets[i]); + scope1.addToIndexes(); + // TestScope scope2 = new TestScope(jcas,37,54); + + + try { + sentenceAnnotator.process(jcas, null); + } catch (Exception e) { + LOGGER.error("testProcess()", e); + } + + // get the offsets of the sentences + JFSIndexRepository indexes = jcas.getJFSIndexRepository(); + Iterator sentIter = indexes.getAnnotationIndex(Sentence.type).iterator(); + + String predictedOffsets = getPredictedOffsets(i, sentIter); + + // compare offsets + if (!predictedOffsets.equals(TEST_TEXT_OFFSETS[i])) { + annotationsOK = false; + continue; + } + } + assertTrue(annotationsOK); + } - Sentence sentence = JCasUtil.select(jCas, Sentence.class).iterator().next(); - 
assertFalse(sentence.getCoveredText().startsWith(" ")); + private String getPredictedOffsets(int i, Iterator sentIter) { + String predictedOffsets = ""; + while (sentIter.hasNext()) { + Sentence s = (Sentence) sentIter.next(); + LOGGER.debug("sentence: " + s.getCoveredText() + ": " + s.getBegin() + " - " + s.getEnd()); + predictedOffsets += (predictedOffsets.length() > 0) ? ";" : ""; + predictedOffsets += s.getBegin() + "-" + s.getEnd(); + } + + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("testProcess() - predicted: " + predictedOffsets); + } + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("testProcess() - wanted: " + TEST_TEXT_OFFSETS[i]); + } + return predictedOffsets; } - @Test - public void testTrailingNewline() throws Exception { - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", - "de.julielab.jcore.types.jcore-document-structure-types"); + @Test + public void testUimaFitIntegration() throws UIMAException, IOException { + AnalysisEngine sentenceAE = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, + SentenceAnnotator.PARAM_MODEL_FILE, "de/julielab/jcore/ae/jsbd/model/test-model.gz", + SentenceAnnotator.PARAM_POSTPROCESSING, "biomed"); + JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types"); + String abstractText = FileUtils.readFileToString(new File("src/test/resources/test-abstract.txt"), "UTF-8"); + cas.setDocumentText(abstractText); + sentenceAE.process(cas); + Collection sentences = JCasUtil.select(cas, Sentence.class); + for (Sentence sentence : sentences) { + System.out.println(sentence.getCoveredText()); + } + assertEquals(14, sentences.size()); + } - // This text is taken from PMC3408706. 
Note the "paragraph separator" at the end - jCas.setDocumentText("In1 the next step, we plan to use higher level QM/MM methods to calculate the energy barrier of the reaction catalyzed by endonuclease APE1, in compliance with the mechanism proposed, and to screen for effective inhibitors with the use of the constructed mechanistic full-atomic model of the enzyme. \u2029"); - new InternalReference(jCas, 2, 3).addToIndexes(); + @Test + public void testModelClassPathResource() throws Exception { + AnalysisEngine sentenceAE = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, + SentenceAnnotator.PARAM_MODEL_FILE, "de/julielab/jcore/ae/jsbd/model/test-model.gz", + SentenceAnnotator.PARAM_POSTPROCESSING, "biomed"); + JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types"); + String abstractText = FileUtils.readFileToString(new File("src/test/resources/test-abstract.txt"), "UTF-8"); + cas.setDocumentText(abstractText); + sentenceAE.process(cas); + Collection sentences = JCasUtil.select(cas, Sentence.class); + System.out.println(sentences.size()); + for (Sentence sentence : sentences) { + System.out.println(sentence.getCoveredText()); + } + assertEquals(14, sentences.size()); + } + + @Test + public void testSentenceDelimiterTypes() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + + jCas.setDocumentText("Introduction " + "We here show good results. 
This is a figure caption " + + "And this is a paragraph without a fullstop for some reason " + "Conclusion " + + "We are the greatest."); + Title t1 = new Title(jCas, 0, 12); + Caption c = new Caption(jCas, 40, 64); + Paragraph p = new Paragraph(jCas, 65, 123); + Title t2 = new Title(jCas, 124, 134); + t1.addToIndexes(); + c.addToIndexes(); + p.addToIndexes(); + t2.addToIndexes(); + assertEquals("Introduction", t1.getCoveredText()); + assertEquals("This is a figure caption", c.getCoveredText()); + assertEquals("And this is a paragraph without a fullstop for some reason", p.getCoveredText()); + assertEquals("Conclusion", t2.getCoveredText()); + + AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, + "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, + new LinkedHashSet( + Arrays.asList(Title.class.getName(), Caption.class.getName(), Paragraph.class.getName()))); + + jsbd.process(jCas.getCas()); + + Set> expectedSpans = new HashSet<>(); + expectedSpans.add(Range.between(0, 12)); + expectedSpans.add(Range.between(13, 39)); + expectedSpans.add(Range.between(40, 64)); + expectedSpans.add(Range.between(65, 123)); + expectedSpans.add(Range.between(124, 134)); + expectedSpans.add(Range.between(135, 155)); + + FSIterator it = jCas.getAnnotationIndex(Sentence.type).iterator(); + assertTrue(it.hasNext()); + while (it.hasNext()) { + Annotation sentence = it.next(); + Range sentenceRange = Range.between(sentence.getBegin(), sentence.getEnd()); + assertTrue(expectedSpans.remove(sentenceRange), "Range " + sentenceRange + " was not expected"); + } + assertTrue(expectedSpans.isEmpty()); + } + + @Test + public void testSentenceWhitespaces() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + + // This text is taken from pmid 23092121 + 
jCas.setDocumentText(" : We present a theoretical study of the electronic subband structure and collective electronic excitation associated with plasmon and surface plasmon modes in metal-based hollow nanosphere. The dependence of the electronic subband energy on the sample parameters of the hollow nanosphere is examined."); - AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, - "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{InternalReference.class.getCanonicalName()}); + AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, + "de/julielab/jcore/ae/jsbd/model/test-model.gz"); - jsbd.process(jCas.getCas()); + jsbd.process(jCas.getCas()); - Sentence sentence = JCasUtil.select(jCas, Sentence.class).iterator().next(); - assertFalse(sentence.getCoveredText().endsWith("\u2029")); - } + Sentence sentence = JCasUtil.select(jCas, Sentence.class).iterator().next(); + assertFalse(sentence.getCoveredText().startsWith(" ")); + } - @Test - public void testSplitAtNewlines() throws Exception { - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", - "de.julielab.jcore.types.jcore-document-structure-types"); + @Test + public void testTrailingNewline() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); - String ls = System.getProperty("line.separator"); - jCas.setDocumentText("line1"+ls+"line2"+ls+"line3"); + // This text is taken from PMC3408706. 
Note the "paragraph separator" at the end + jCas.setDocumentText("In1 the next step, we plan to use higher level QM/MM methods to calculate the energy barrier of the reaction catalyzed by endonuclease APE1, in compliance with the mechanism proposed, and to screen for effective inhibitors with the use of the constructed mechanistic full-atomic model of the enzyme. \u2029"); + new InternalReference(jCas, 2, 3).addToIndexes(); - AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, - "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_ALWAYS_SPLIT_NEWLINE, true); + AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, + "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{InternalReference.class.getCanonicalName()}); - jsbd.process(jCas.getCas()); + jsbd.process(jCas.getCas()); - Collection sentences = JCasUtil.select(jCas, Sentence.class).stream().map(Annotation::getCoveredText).collect(Collectors.toList()); - assertThat(sentences).containsExactly("line1", "line2", "line3"); - } + Sentence sentence = JCasUtil.select(jCas, Sentence.class).iterator().next(); + assertFalse(sentence.getCoveredText().endsWith("\u2029")); + } + @Test + public void testSplitAtNewlines() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); - @Test - public void testErrordoc() throws Exception { - // The XMI document uses here is from PMC and is an example of a source of error the previously occurred. 
- JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", - "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", - "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types"); + String ls = System.getProperty("line.separator"); + jCas.setDocumentText("line1" + ls + "line2" + ls + "line3"); - XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "errordocs", "PMC5478802.xmi").toFile()), jCas.getCas()); - JCasUtil.select(jCas, Sentence.class).forEach(Annotation::removeFromIndexes); - AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, - "/Users/faessler/Coding/git/jcore-projects/jcore-jsbd-ae-biomedical-english/src/main/resources/de/julielab/jcore/ae/jsbd/model/jsbd-biomed-oversampled-abstracts-split-at-punctuation.mod.gz", - SentenceAnnotator.PARAM_MAX_SENTENCE_LENGTH, 1000, - SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, new String[]{ - "de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"}, - SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{de.julielab.jcore.types.pubmed.InternalReference.class.getCanonicalName()} - ); + AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, + "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_ALWAYS_SPLIT_NEWLINE, true); - jsbd.process(jCas.getCas()); - for (var s : JCasUtil.select(jCas, Sentence.class)) { - System.out.println(s.getCoveredText()); - System.out.println("--"); - } + jsbd.process(jCas.getCas()); - } + 
+ Collection sentences = JCasUtil.select(jCas, Sentence.class).stream().map(Annotation::getCoveredText).collect(Collectors.toList()); + assertThat(sentences).containsExactly("line1", "line2", "line3"); + } + + + @Test + public void testErrordoc() throws Exception { + // The XMI document uses here is from PMC and is an example of a source of error the previously occurred. + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", + "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types"); + + XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "errordocs", "PMC5478802.xmi").toFile()), jCas.getCas()); + JCasUtil.select(jCas, Sentence.class).forEach(Annotation::removeFromIndexes); + AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, + "/Users/faessler/Coding/git/jcore-projects/jcore-jsbd-ae-biomedical-english/src/main/resources/de/julielab/jcore/ae/jsbd/model/jsbd-biomed-oversampled-abstracts-split-at-punctuation.mod.gz", + SentenceAnnotator.PARAM_MAX_SENTENCE_LENGTH, 1000, + SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, new String[]{ + "de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"}, + SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{de.julielab.jcore.types.pubmed.InternalReference.class.getCanonicalName()} + ); + + assertThatCode(() -> jsbd.process(jCas.getCas())).doesNotThrowAnyException(); + } } diff --git a/jcore-jsbd-ae/src/test/resources/errordocs/PMC5478802.xmi 
b/jcore-jsbd-ae/src/test/resources/errordocs/PMC5478802.xmi new file mode 100644 index 000000000..c4d8ca95a --- /dev/null +++ b/jcore-jsbd-ae/src/test/resources/errordocs/PMC5478802.xmi @@ -0,0 +1,5 @@ + +PMC5478802 \ No newline at end of file diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java index 58fdcc137..22758d549 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java @@ -71,7 +71,7 @@ public void testReduce3() throws Exception { // references completely. JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-document-structure-types"); - jcas.setDocumentText("This sentence has multiple references.2,5,42 This is a second sentence.7,8"); + jcas.setDocumentText("This sentence has multiple references.2,5;42 This is a second sentence.7,8"); InternalReference ref1 = new InternalReference(jcas, 38, 39); ref1.addToIndexes(); InternalReference ref2 = new InternalReference(jcas, 40, 41); @@ -84,7 +84,7 @@ public void testReduce3() throws Exception { ref5.addToIndexes(); JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, - new HashSet<>(Arrays.asList(InternalReference.class.getCanonicalName())), Set.of(',')); + new HashSet<>(Arrays.asList(InternalReference.class.getCanonicalName())), Set.of(',', ';')); assertEquals("This sentence has multiple references. 
This is a second sentence.", condensedText.getCodensedText()); } @@ -96,9 +96,9 @@ public void testErrorDoc() throws Exception{ "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types"); XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "PMC5478802.xmi").toFile()), jCas.getCas()); - JCoReCondensedDocumentText text = new JCoReCondensedDocumentText(jCas, Set.of(de.julielab.jcore.types.pubmed.InternalReference.class.getCanonicalName())); -// Set sentenceBoundaryTypes = Set.of("de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"); - Set sentenceBoundaryTypes = Set.of("de.julielab.jcore.types.Section"); + JCoReCondensedDocumentText text = new JCoReCondensedDocumentText(jCas, Set.of(de.julielab.jcore.types.pubmed.InternalReference.class.getCanonicalName()), Set.of(',')); + Set sentenceBoundaryTypes = Set.of("de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"); +// Set sentenceBoundaryTypes = Set.of("de.julielab.jcore.types.Section"); JCoReAnnotationIndexMerger indexMerger = new JCoReAnnotationIndexMerger(sentenceBoundaryTypes, false, null, jCas); @@ -106,12 +106,14 @@ public void testErrorDoc() throws Exception{ Annotation a = (Annotation) indexMerger.getAnnotation(); System.out.println(a.getCoveredText()); System.out.println("--"); - int condensedBegin = 
text.getCondensedOffsetForOriginalOffset(a.getBegin()); - int condensedEnd = text.getOriginalOffsetForCondensedOffset(a.getEnd()); + int begin = a.getBegin(); + int condensedBegin = text.getCondensedOffsetForOriginalOffset(begin); + int end = a.getEnd(); + int condensedEnd = text.getCondensedOffsetForOriginalOffset(end); if (condensedEnd > text.getCodensedText().length()) System.out.println(); System.out.println(text.getCodensedText().substring(condensedBegin, condensedEnd)); - System.out.println(a.getBegin() + " - " + a.getEnd() + ", " + condensedBegin + " - " + condensedEnd); + System.out.println(begin + " - " + end + ", " + condensedBegin + " - " + condensedEnd); System.out.println(); } } From 6e39d19caef183226a0fe55cbdad1e2bedc84748 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 1 Jul 2021 14:51:01 +0200 Subject: [PATCH 078/269] Removed a non-required test. --- .../JCoReCondensedDocumentTextTest.java | 34 ------------------- .../src/test/resources/PMC5478802.xmi | 5 --- 2 files changed, 39 deletions(-) delete mode 100644 jcore-utilities/src/test/resources/PMC5478802.xmi diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java index 22758d549..1c5597a3e 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java @@ -1,14 +1,10 @@ package de.julielab.jcore.utility; import de.julielab.jcore.types.InternalReference; -import org.apache.uima.cas.impl.XmiCasDeserializer; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.tcas.Annotation; import org.junit.jupiter.api.Test; -import java.io.FileInputStream; -import java.nio.file.Path; import java.util.Arrays; import java.util.HashSet; import java.util.Set; @@ -87,34 +83,4 @@ 
public void testReduce3() throws Exception { new HashSet<>(Arrays.asList(InternalReference.class.getCanonicalName())), Set.of(',', ';')); assertEquals("This sentence has multiple references. This is a second sentence.", condensedText.getCodensedText()); } - - @Test - public void testErrorDoc() throws Exception{ - // The XMI document uses here is from PMC and is an example of a source of error the previously occurred. - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", - "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", - "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types"); - - XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "PMC5478802.xmi").toFile()), jCas.getCas()); - JCoReCondensedDocumentText text = new JCoReCondensedDocumentText(jCas, Set.of(de.julielab.jcore.types.pubmed.InternalReference.class.getCanonicalName()), Set.of(',')); - Set sentenceBoundaryTypes = Set.of("de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"); -// Set sentenceBoundaryTypes = Set.of("de.julielab.jcore.types.Section"); - JCoReAnnotationIndexMerger indexMerger = new JCoReAnnotationIndexMerger(sentenceBoundaryTypes, false, - null, jCas); - - while (indexMerger.incrementAnnotation()) { - Annotation a = (Annotation) indexMerger.getAnnotation(); - System.out.println(a.getCoveredText()); - System.out.println("--"); - int begin = a.getBegin(); - int condensedBegin = text.getCondensedOffsetForOriginalOffset(begin); - int end = a.getEnd(); - int condensedEnd = text.getCondensedOffsetForOriginalOffset(end); - if 
(condensedEnd > text.getCodensedText().length()) - System.out.println(); - System.out.println(text.getCodensedText().substring(condensedBegin, condensedEnd)); - System.out.println(begin + " - " + end + ", " + condensedBegin + " - " + condensedEnd); - System.out.println(); - } - } } diff --git a/jcore-utilities/src/test/resources/PMC5478802.xmi b/jcore-utilities/src/test/resources/PMC5478802.xmi deleted file mode 100644 index c4d8ca95a..000000000 --- a/jcore-utilities/src/test/resources/PMC5478802.xmi +++ /dev/null @@ -1,5 +0,0 @@ - -PMC5478802 \ No newline at end of file From e038a6340a3cb91d4ee06388b94b07d71ba49553 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 7 Jul 2021 16:11:27 +0200 Subject: [PATCH 079/269] Fixing offset issues with JCoReCondensedDocumentText. Converting original offsets to condensed offsets when the original offset lay in a cut away area was not working correctly. In fact, the case was not even covered in the code. --- .../utility/JCoReCondensedDocumentText.java | 339 ++++++++++-------- .../JCoReCondensedDocumentTextTest.java | 89 +++++ jedis-parent/pom.xml | 2 +- 3 files changed, 273 insertions(+), 157 deletions(-) diff --git a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java index 76a8c5f45..7067539ad 100644 --- a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java +++ b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java @@ -13,169 +13,196 @@ * document text that results from cutting out said text passages. It offers a * method to return the actual text string and a method to map the character * offsets of the compacted string to the original CAS document text. 
- * - * @author faessler * + * @author faessler */ public class JCoReCondensedDocumentText { - private NavigableMap condensedPos2SumCutMap; - private NavigableMap originalPos2SumCutMap; - private String condensedText; - private JCas cas; - private Set cutAwayFillCharacters; + private NavigableMap condensedPos2SumCutMap; + private NavigableMap originalPos2SumCutMap; + private String condensedText; + private JCas cas; + private Set cutAwayFillCharacters; + + /** + *

+ * Cuts away the covered text of annotations of a type in cutAwayTypes + * from the cas document text. If cutAwayTypes is null or + * empty, this class' methods will return the original CAS data. + *

+ * + * @param cas The CAS for which the document text should be cut. + * @param cutAwayTypes The types for cutting. May be null. + * @throws ClassNotFoundException If cutAwayTypes contains non-existing type names. + */ + public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes) throws ClassNotFoundException { + this(cas, cutAwayTypes, null); + } + + /** + *

+ * Cuts away the covered text of annotations of a type in cutAwayTypes + * from the cas document text. If cutAwayTypes is null or + * empty, this class' methods will return the original CAS data. + *

+ *

The cutAwayFillCharacters set may provide characters that, when being the only character between + * two cut-away annotations, will add to the span of text being cut away. This way, enumerations of references + * (e.g. "4,6,8") can be completely removed, for example.

+ * + * @param cas The CAS for which the document text should be cut. + * @param cutAwayTypes The types for cutting. May be null. + * @param cutAwayFillCharacters Characters that, when being the only separator between two cut away annotations, are also cut away. + * @throws ClassNotFoundException If cutAwayTypes contains non-existing type names. + */ + public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes, Set cutAwayFillCharacters) throws ClassNotFoundException { + this.cas = cas; + this.cutAwayFillCharacters = cutAwayFillCharacters; + buildMap(cas, cutAwayTypes); + } + + public JCas getCas() { + return cas; + } - public JCas getCas() { - return cas; - } - /** - *

- * Cuts away the covered text of annotations of a type in cutAwayTypes - * from the cas document text. If cutAwayTypes is null or - * empty, this class' methods will return the original CAS data. - *

- * - * @param cas - * The CAS for which the document text should be cut. - * @param cutAwayTypes - * The types for cutting. May be null. - * @throws ClassNotFoundException - * If cutAwayTypes contains non-existing type names. - */ - public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes) throws ClassNotFoundException { - this(cas, cutAwayTypes, null); - } + /** + *

+ * Creates a map that maps those positions of the small-cut text that correspond + * to an intermediate next position after a cut-away annotation in the original + * text to the sum of ranges covered by cut-away annotations up to the original + * offset. + *

+ *

+ * If cutAwayTypes is empty, no work will be done and the methods of + * this class will return the original text and offsets of the CAS. + *

+ * + * @param cas The CAS for create a cut-away document text for. + * @param cutAwayTypes The qualified type names of the annotations whose covered text + * should be cut away. + * @throws ClassNotFoundException If cutAwayTypes contains type identifiers to + * non-existing types. + */ + public void buildMap(JCas cas, Set cutAwayTypes) throws ClassNotFoundException { + if (cutAwayTypes == null || cutAwayTypes.isEmpty()) + return; + StringBuilder sb = new StringBuilder(); + condensedPos2SumCutMap = new TreeMap<>(); + condensedPos2SumCutMap.put(0, 0); + originalPos2SumCutMap = new TreeMap<>(); + originalPos2SumCutMap.put(0, 0); + JCoReAnnotationIndexMerger merger = new JCoReAnnotationIndexMerger(cutAwayTypes, true, null, cas); + int cutSum = 0; + int lastBegin = 0; + int lastEnd = -1; + int lastCutSum = 0; + // For each ignored annotation, there could be following annotations overlapping + // with the first, effectively enlarging the ignored span. Thus, we iterate + // until we find an ignored annotation that has a positive (not 0) distance to a + // previous one. Then, we store the length of the span of cut-away annotations + // for the largest end of the previous annotations. + while (merger.incrementAnnotation()) { + int begin = merger.getCurrentBegin(); + int end = merger.getCurrentEnd(); - /** - *

- * Cuts away the covered text of annotations of a type in cutAwayTypes - * from the cas document text. If cutAwayTypes is null or - * empty, this class' methods will return the original CAS data. - *

- *

The cutAwayFillCharacters set may provide characters that, when being the only character between - * to cut-away annotations, will add to the span of text being cut away. This way, enumerations of references - * (e.g. "4,6,8") can be completely removed, for example.

- * - * @param cas - * The CAS for which the document text should be cut. - * @param cutAwayTypes - * The types for cutting. May be null. - * @param cutAwayFillCharacters Characters that, when being the only separator between two cut away annotations, are also cut away. - * @throws ClassNotFoundException - * If cutAwayTypes contains non-existing type names. - */ - public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes, Set cutAwayFillCharacters) throws ClassNotFoundException { - this.cas = cas; - this.cutAwayFillCharacters = cutAwayFillCharacters; - buildMap(cas, cutAwayTypes); - } + boolean moreThanOneCharacterDistance = begin - lastEnd > 2; + boolean previousCharacterIsCutAwayDelimiter = cutAwayFillCharacters == null || cutAwayFillCharacters.isEmpty() || (begin - lastEnd == 2 && cutAwayFillCharacters.contains(cas.getDocumentText().charAt(begin - 1))); + if (lastEnd > 0 && begin > lastEnd && (previousCharacterIsCutAwayDelimiter || moreThanOneCharacterDistance)) { + // Adapt offsets to remove superfluous white spaces from the condensed text + boolean precedingCharacterIsWS = lastBegin == 0 || Character.isWhitespace(cas.getDocumentText().charAt(lastBegin - 1)); + boolean succeedingCharacterIsWS = lastEnd < cas.getDocumentText().length() && Character.isWhitespace(cas.getDocumentText().charAt(lastEnd)); + if (precedingCharacterIsWS && succeedingCharacterIsWS) + ++lastEnd; + if (precedingCharacterIsWS && end >= cas.getDocumentText().length()) + --begin; + // The current cut away annotation begins after the previous cut away annotation, thus there is no + // overlap and we can add the current state to the maps. + cutSum += lastEnd - lastBegin; + int condensedPosition = lastEnd - cutSum + 1; + condensedPos2SumCutMap.put(condensedPosition, cutSum); + // For original offsets we need to be able to know where the begin and the end of + // the cut away annotation was. 
This is exploited in getCondensedOffsetForOriginalOffset() + originalPos2SumCutMap.put(lastBegin, lastCutSum); + originalPos2SumCutMap.put(lastEnd, cutSum); + lastBegin = begin; + lastCutSum = cutSum; + sb.append(cas.getDocumentText(), lastEnd, begin); + } else if (lastEnd < 0) { + // This is the first annotation + if (begin > 0 && end >= cas.getDocumentText().length() && Character.isWhitespace(cas.getDocumentText().charAt(begin - 1))) + // Case: A single cut away annotation right at the end of the document text. + // Then we want to extend the cut away area to the leading whitespace to remove that as well. + --begin; + lastBegin = begin; + sb.append(cas.getDocumentText(), 0, begin); + } + lastEnd = end; + } + // Since we iterate one annotation further than the annotation we store the span + // for, we need to take care of the very last ignored annotation after the loop + // - it has never been handled itself. + if (lastEnd > 0) { + // Adapt offsets to avoid unnecessary white spaces regarding the tail of the document text. + boolean precedingCharacterIsWS = lastBegin < 1 || Character.isWhitespace(cas.getDocumentText().charAt(lastBegin - 1)); + boolean succeedingCharacterIsWS = lastEnd < cas.getDocumentText().length() && Character.isWhitespace(cas.getDocumentText().charAt(lastEnd)); + if (precedingCharacterIsWS && (succeedingCharacterIsWS || lastEnd >= cas.getDocumentText().length())) + ++lastEnd; + cutSum += lastEnd - lastBegin; + int condensedPosition = lastEnd - cutSum + 1; + condensedPos2SumCutMap.put(condensedPosition, cutSum); + originalPos2SumCutMap.put(lastBegin, lastCutSum); + originalPos2SumCutMap.put(lastEnd, cutSum); + } + // If lastEnd is still -1, we just did not find any of the cut away annotations. Thus, we just copy the whole text. + if (lastEnd == -1) + lastEnd = 0; + if (lastEnd < cas.getDocumentText().length()) + sb.append(cas.getDocumentText().substring(lastEnd)); + condensedText = sb.toString(); + } - /** - *

- * Creates a map that maps those positions of the small-cut text that correspond - * to an intermediate next position after a cut-away annotation in the original - * text to the sum of ranges covered by cut-away annotations up to the original - * offset. - *

- *

- * If cutAwayTypes is empty, no work will be done and the methods of - * this class we return the original text and offets of the CAS. - *

- * - * @param cas - * The CAS for create a cut-away document text for. - * @param cutAwayTypes - * The qualified type names of the annotations whose covered text - * should be cut away. - * @throws ClassNotFoundException - * If cutAwayTypes contains type identifiers to - * non-existing types. - */ - public void buildMap(JCas cas, Set cutAwayTypes) throws ClassNotFoundException { - if (cutAwayTypes == null || cutAwayTypes.isEmpty()) - return; - StringBuilder sb = new StringBuilder(); - condensedPos2SumCutMap = new TreeMap<>(); - condensedPos2SumCutMap.put(0, 0); - originalPos2SumCutMap = new TreeMap<>(); - originalPos2SumCutMap.put(0, 0); - JCoReAnnotationIndexMerger merger = new JCoReAnnotationIndexMerger(cutAwayTypes, true, null, cas); - int cutSum = 0; - int lastBegin = 0; - int lastEnd = -1; - // For each ignored annotation, there could be following annotations overlapping - // with the first, effectively enlarging the ignored span. Thus, we iterate - // until we find an ignored annotation that has a positive (not 0) distance to a - // previous one. Then, we store the length of the span of cut-away annotations - // for the largest end of the previous annotations. 
- while (merger.incrementAnnotation()) { - int end = merger.getCurrentEnd(); - int begin = merger.getCurrentBegin(); - boolean moreThanOneCharacterDistance = begin - lastEnd > 2; - boolean previousCharacterIsCutAwayDelimiter = cutAwayFillCharacters == null || cutAwayFillCharacters.isEmpty() || (begin - lastEnd == 2 && cutAwayFillCharacters.contains(cas.getDocumentText().charAt(begin - 1))); - if (lastEnd > 0 && begin > lastEnd && (previousCharacterIsCutAwayDelimiter || moreThanOneCharacterDistance)) { - cutSum += lastEnd - lastBegin; - int condensedPosition = lastEnd - cutSum + 1; - condensedPos2SumCutMap.put(condensedPosition, cutSum); - originalPos2SumCutMap.put(lastEnd, cutSum); - lastBegin = begin; - sb.append(cas.getDocumentText(), lastEnd, begin); - } else if (lastEnd < 0) { - lastBegin = begin; - sb.append(cas.getDocumentText(), 0, begin); - } - lastEnd = end; - } - // Since we iterate one annotation further than the annotation we store the span - // for, we need to take care of the very last ignored annotation after the loop - // - it has never been handled itself. - if (lastEnd > 0) { - cutSum += lastEnd - lastBegin; - int condensedPosition = lastEnd - cutSum + 1; - condensedPos2SumCutMap.put(condensedPosition, cutSum); - originalPos2SumCutMap.put(lastEnd, cutSum); - } - // If lastEnd is still -1, we just did not find any of the cut away annotations. Thus, we just copy the whole text. - if (lastEnd == -1) - lastEnd = 0; - if (lastEnd < cas.getDocumentText().length()) - sb.append(cas.getDocumentText().substring(lastEnd)); - condensedText = sb.toString(); - } + /** + * Given a character offset relative to the condensed document text, this method + * returns the corresponding offset in the original CAS document text. + * + * @param condensedOffset The character offset in the condensed document text string. + * @return The character offset relative to the original CAS document text + * associated with condensedOffset. 
+ */ + public int getOriginalOffsetForCondensedOffset(int condensedOffset) { + if (condensedPos2SumCutMap == null) + return condensedOffset; + Entry floorEntry = condensedPos2SumCutMap.floorEntry(condensedOffset); + return condensedOffset + floorEntry.getValue(); + } - /** - * Given a character offset relative to the condensed document text, this method - * returns the corresponding offset in the original CAS document text. - * - * @param condensedOffset - * The character offset in the condensed document text string. - * @return The character offset relative to the original CAS document text - * associated with condensedOffset. - */ - public int getOriginalOffsetForCondensedOffset(int condensedOffset) { - if (condensedPos2SumCutMap == null) - return condensedOffset; - Entry floorEntry = condensedPos2SumCutMap.floorEntry(condensedOffset); - return condensedOffset + floorEntry.getValue(); - } - - /** - * Given a character offset relative to the original CAS document text, this method - * returns the corresponding offset in the condensed document text. - * - * @param originalOffset - * The character offset in the originalOffset document CAS text string. - * @return The character offset relative to the condensed document text - * associated with originalOffset. - */ - public int getCondensedOffsetForOriginalOffset(int originalOffset) { - if (originalPos2SumCutMap == null) - return originalOffset; - Entry floorEntry = originalPos2SumCutMap.floorEntry(originalOffset); - return originalOffset - floorEntry.getValue(); - } + /** + * Given a character offset relative to the original CAS document text, this method + * returns the corresponding offset in the condensed document text. + * + * @param originalOffset The character offset in the originalOffset document CAS text string. + * @return The character offset relative to the condensed document text + * associated with originalOffset. 
+ */ + public int getCondensedOffsetForOriginalOffset(int originalOffset) { + if (originalPos2SumCutMap == null) + return originalOffset; + Entry floorEntry = originalPos2SumCutMap.floorEntry(originalOffset); + Entry ceilingEntry = originalPos2SumCutMap.ceilingEntry(originalOffset); + // floor entry can never be null because the mapping 0=0 always exists + if (floorEntry != null && ceilingEntry != null) { + // Determine if the original offset is inside or outside of a cut away annotation. + // If the difference of key and value is the same for floor and ceiling, the originalOffset + // is within of a cut away annotation. Otherwise, it is outside a cut away annotation + int floorDiff = floorEntry.getKey() - floorEntry.getValue(); + int ceilingDiff = ceilingEntry.getKey() - ceilingEntry.getValue(); + boolean withinCutAway = floorDiff == ceilingDiff; + if (withinCutAway) + return originalOffset - ceilingEntry.getValue() + (ceilingEntry.getKey() - originalOffset); + } + return originalOffset - floorEntry.getValue(); + } - public String getCodensedText() { - return condensedText != null ? condensedText : cas.getDocumentText(); - } + public String getCodensedText() { + return condensedText != null ? 
condensedText : cas.getDocumentText(); + } } diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java index 1c5597a3e..86ef54bf9 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java @@ -1,5 +1,6 @@ package de.julielab.jcore.utility; +import de.julielab.jcore.types.Annotation; import de.julielab.jcore.types.InternalReference; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; @@ -83,4 +84,92 @@ public void testReduce3() throws Exception { new HashSet<>(Arrays.asList(InternalReference.class.getCanonicalName())), Set.of(',', ';')); assertEquals("This sentence has multiple references. This is a second sentence.", condensedText.getCodensedText()); } + + @Test + public void testCondensedOffsetsWithinCutawayAnnotations() throws Exception { + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("Not cut away 1. Cut away 1. Not cut away 2. Cut away 2. Not cut away 3."); + Annotation cutAwayAnnotation = new Annotation(jcas, 16, 27); + cutAwayAnnotation.addToIndexes(); + Annotation cutAwayAnnotation2 = new Annotation(jcas, 44, 55); + cutAwayAnnotation2.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(Annotation.class.getCanonicalName()))); + assertEquals("Not cut away 1. Not cut away 2. 
Not cut away 3.", condensedText.getCodensedText()); + assertEquals(10, condensedText.getCondensedOffsetForOriginalOffset(10)); + assertEquals(15, condensedText.getCondensedOffsetForOriginalOffset(15)); + assertEquals(16, condensedText.getCondensedOffsetForOriginalOffset(16)); + assertEquals(16, condensedText.getCondensedOffsetForOriginalOffset(17)); + assertEquals(16, condensedText.getCondensedOffsetForOriginalOffset(27)); + assertEquals(19, condensedText.getCondensedOffsetForOriginalOffset(31)); + } + + @Test + public void testCutAwayAtBeginning() throws Exception { + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("Cut away. Not cut away."); + Annotation cutAwayAnnotation = new Annotation(jcas, 0, 9); + cutAwayAnnotation.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(Annotation.class.getCanonicalName()))); + assertEquals("Not cut away.", condensedText.getCodensedText()); + assertEquals(0, condensedText.getCondensedOffsetForOriginalOffset(3)); + assertEquals(3, condensedText.getCondensedOffsetForOriginalOffset(13)); + } + + @Test + public void testCutAwayAtEnd() throws Exception { + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("Not cut away. 
Cut away."); + Annotation cutAwayAnnotation = new Annotation(jcas, 14, 23); + cutAwayAnnotation.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(Annotation.class.getCanonicalName()))); + assertEquals("Not cut away.", condensedText.getCodensedText()); + assertEquals(10, condensedText.getCondensedOffsetForOriginalOffset(10)); + assertEquals(13, condensedText.getCondensedOffsetForOriginalOffset(16)); + assertEquals(13, condensedText.getCondensedOffsetForOriginalOffset(23)); + } + + @Test + public void testEmbeddedCutAway() throws Exception { + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("Not cut away. Cut away. Not cut away."); + Annotation cutAwayAnnotation = new Annotation(jcas, 14, 23); + cutAwayAnnotation.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(Annotation.class.getCanonicalName()))); + assertEquals("Not cut away. Not cut away.", condensedText.getCodensedText()); + assertEquals(10, condensedText.getCondensedOffsetForOriginalOffset(10)); + assertEquals(14, condensedText.getCondensedOffsetForOriginalOffset(16)); + assertEquals(14, condensedText.getCondensedOffsetForOriginalOffset(23)); + assertEquals(15, condensedText.getCondensedOffsetForOriginalOffset(25)); + } + + @Test + public void testEnclosingCutAway() throws Exception { + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("Cut away. Not cut away. 
Cut away."); + Annotation cutAwayAnnotation = new Annotation(jcas, 0, 9); + cutAwayAnnotation.addToIndexes(); + Annotation cutAwayAnnotation2 = new Annotation(jcas, 24, 33); + cutAwayAnnotation2.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(Annotation.class.getCanonicalName()))); + assertEquals("Not cut away.", condensedText.getCodensedText()); + assertEquals(0, condensedText.getCondensedOffsetForOriginalOffset(10)); + assertEquals(3, condensedText.getCondensedOffsetForOriginalOffset(13)); + assertEquals(13, condensedText.getCondensedOffsetForOriginalOffset(27)); + assertEquals(13, condensedText.getCondensedOffsetForOriginalOffset(33)); + } } diff --git a/jedis-parent/pom.xml b/jedis-parent/pom.xml index b66c3be70..3daef871c 100644 --- a/jedis-parent/pom.xml +++ b/jedis-parent/pom.xml @@ -27,7 +27,7 @@ de.julielab jcore-xmi-splitter - 2.3.5 + 2.4.0-SNAPSHOT From b711381cbaead72a1c24ad182d4e198b406cc661 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 7 Jul 2021 16:26:55 +0200 Subject: [PATCH 080/269] Adding a test to assure that the in-cut away-offset issue is gone. 
--- .../ae/jsbd/main/SentenceAnnotatorTest.java | 22 +++++++++++++++++++ .../test/resources/errordocs/PMC8205280.xmi | 1 + 2 files changed, 23 insertions(+) create mode 100644 jcore-jsbd-ae/src/test/resources/errordocs/PMC8205280.xmi diff --git a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java index 22edbe983..5a5b23a47 100644 --- a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java +++ b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java @@ -310,5 +310,27 @@ public void testErrordoc() throws Exception { assertThatCode(() -> jsbd.process(jCas.getCas())).doesNotThrowAnyException(); } + @Test + public void testErrordoc2() throws Exception { + // This XMI file has larger cut away types where an original offset request actually lies inside of a + // cut away annotation. This case led to errors prior to a respective bug fix in the + // JCoReCondensedDocumentText + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", + "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types"); + + XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "errordocs", "PMC8205280.xmi").toFile()), jCas.getCas()); + JCasUtil.select(jCas, Sentence.class).forEach(Annotation::removeFromIndexes); + AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, + "/Users/faessler/Coding/git/jcore-projects/jcore-jsbd-ae-biomedical-english/src/main/resources/de/julielab/jcore/ae/jsbd/model/jsbd-biomed-oversampled-abstracts-split-at-punctuation.mod.gz", + SentenceAnnotator.PARAM_MAX_SENTENCE_LENGTH, 1000, + 
SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, new String[]{ + "de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"}, + SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{de.julielab.jcore.types.pubmed.InternalReference.class.getCanonicalName()} + ); + + assertThatCode(() -> jsbd.process(jCas.getCas())).doesNotThrowAnyException(); + } + } diff --git a/jcore-jsbd-ae/src/test/resources/errordocs/PMC8205280.xmi b/jcore-jsbd-ae/src/test/resources/errordocs/PMC8205280.xmi new file mode 100644 index 000000000..b2063eca5 --- /dev/null +++ b/jcore-jsbd-ae/src/test/resources/errordocs/PMC8205280.xmi @@ -0,0 +1 @@ + \ No newline at end of file From 56fdb2157e1dbcdb1032718b36a3b8e46cbb42fb Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 9 Jul 2021 15:34:18 +0200 Subject: [PATCH 081/269] JTBD: Fixed an issue where a regular expression match took days without finishing. The issue occurred with document PMC7575323. The respective regex, `(.*[\W].*){5,}`, has an unbounded number of matches allowed. In a first try, I introduced an upper bound of 50 but this did still take several minutes without finishing. Instead, the expression is now only applied for "superunits" (I'm actually not sure what those consist of) with a length of less than 200 characters. With that change, the document in question was finished within seconds. 
--- .../main/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipe.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipe.java b/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipe.java index c52e1ad12..833f97e8f 100755 --- a/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipe.java +++ b/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipe.java @@ -524,7 +524,7 @@ else if (superUnitRep.length() <= 8) // check whether superunit might be a chemical // therefor we check the number typical special characters contained - if ((superUnitRep.length() > 6) + if ((superUnitRep.length() > 6 && superUnitRep.length() < 200) && superUnitRep.matches("(.*[\\W].*){5,}") && !superUnitRep.contains("-->")) token.setFeatureValue("SU_isChemical", 1); From ff922166592ffcc5c7155809a742f4d89223ad7c Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 12 Jul 2021 14:04:29 +0200 Subject: [PATCH 082/269] Adding the first GitHub Actions workflow, see #122. 
--- .github/workflows/maven.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 .github/workflows/maven.yml diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml new file mode 100644 index 000000000..34492a386 --- /dev/null +++ b/.github/workflows/maven.yml @@ -0,0 +1,25 @@ +# This workflow will build a Java project with Maven +# For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven + +name: Java CI with Maven + +on: + push: + branches: [ master, v2.6 ] + pull_request: + branches: [ master, v2.6 ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up JDK 11 + uses: actions/setup-java@v2 + with: + java-version: '11' + distribution: 'adopt' + - name: Build with Maven + run: mvn -B package --file pom.xml From e5659b0ef63564dcceeda61e5d613e13ae5967a2 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 12 Jul 2021 14:11:16 +0200 Subject: [PATCH 083/269] Enhancing GitHubActions: Python & snapshot repository. 
--- .github/maven-settings.xml | 24 ++++++++++++++++++++++++ .github/workflows/maven.yml | 10 +++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 .github/maven-settings.xml diff --git a/.github/maven-settings.xml b/.github/maven-settings.xml new file mode 100644 index 000000000..9c8a6c405 --- /dev/null +++ b/.github/maven-settings.xml @@ -0,0 +1,24 @@ + + + + + sonatype-snapshots + + + sonatype-nexus-snapshots + Sonatype Nexus Snapshots + https://oss.sonatype.org/content/repositories/snapshots + + false + + + true + + + + + + + sonatype-snapshots + + \ No newline at end of file diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml index 34492a386..2c3de94b2 100644 --- a/.github/workflows/maven.yml +++ b/.github/workflows/maven.yml @@ -15,6 +15,14 @@ jobs: runs-on: ubuntu-latest steps: + - name: Set up Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: 3.7 + - name: Install python dependencies + run: | + python -m pip install --upgrade pip + pip install flair==0.6.1 - uses: actions/checkout@v2 - name: Set up JDK 11 uses: actions/setup-java@v2 @@ -22,4 +30,4 @@ jobs: java-version: '11' distribution: 'adopt' - name: Build with Maven - run: mvn -B package --file pom.xml + run: mvn -B package --file pom.xml --settings .github/maven-settings.xml From dbcc1bfc5adac2fd23659a06858916d7cb04e7b4 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 12 Jul 2021 14:37:42 +0200 Subject: [PATCH 084/269] Using https for the BioNLP Maven repository. It was just 'http' previously which is blocked in newer Maven versions. 
--- jcore-biolemmatizer-ae/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-biolemmatizer-ae/pom.xml b/jcore-biolemmatizer-ae/pom.xml index b5e089a8f..51834d192 100644 --- a/jcore-biolemmatizer-ae/pom.xml +++ b/jcore-biolemmatizer-ae/pom.xml @@ -47,7 +47,7 @@ BioNLP Repository - http://svn.code.sf.net/p/bionlp/code/repo + https://svn.code.sf.net/p/bionlp/code/repo From 27fbc4a34b090953decdc1b81130397103dfda7b Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 12 Jul 2021 14:45:50 +0200 Subject: [PATCH 085/269] Adding another repository with https protocol in context of BioLemmatizer. BioLemmatizer uses MorphAdorner and has a repository for it in its own pom.xml. However, it again has no https and is blocked by Maven. I try to add the repository in the jcore-biolemmatizer-ae pom.xml in the hope that this repository will then be used instead of the original one. --- jcore-biolemmatizer-ae/pom.xml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/jcore-biolemmatizer-ae/pom.xml b/jcore-biolemmatizer-ae/pom.xml index 51834d192..cc11be3b1 100644 --- a/jcore-biolemmatizer-ae/pom.xml +++ b/jcore-biolemmatizer-ae/pom.xml @@ -49,6 +49,12 @@ BioNLP Repository https://svn.code.sf.net/p/bionlp/code/repo + + + maven.aksw.internal + AKSW Internal Release Repository + https://maven.aksw.org/repository/internal + From 3c6571102dda3398cf9bbfee4c19d565565eb96e Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 27 Jul 2021 11:36:11 +0200 Subject: [PATCH 086/269] ES Consumer: Adding a Lucene index cache for the Map- and AddonProviders. When the mapping files get very large (in my case over 30M lines) loading all of it into memory can cause memory issues. Those can often be solved by string interning, where all Strings are stored uniquely in the String pool. However, this pool is also of a finite size (which can be set via the -XX:StringTableSize= JVM option).
In the case that most probably only a part of the whole map will ever be used, an alternative is to use the new abstract `PersistentIndexAddonTermsProvider` and `PersistentStringIndexMapProvider` classes. Each has one concrete subclass which uses a Lucene index to store the files in an indexed format. A Guava cache is used to load from this index and keep the actually required elements in memory. This reduced memory requirements a lot when dealing with very large resource files. Resolves #123. --- jcore-elasticsearch-consumer/README.md | 6 +- jcore-elasticsearch-consumer/pom.xml | 1 - .../consumer/es/AbstractFieldGenerator.java | 4 +- .../consumer/es/filter/AddonTermsFilter.java | 53 +++--- .../es/filter/SingleAddonTermsFilter.java | 35 ++++ .../sharedresources/AbstractMapProvider.java | 34 ++-- .../sharedresources/AddonTermsProvider.java | 25 ++- .../es/sharedresources/LuceneIndex.java | 160 +++++++++++++++++ .../MapDBReversedDoubleMapProvider.java | 70 -------- .../es/sharedresources/MapProvider.java | 14 ++ .../PersistentIndexAddonTermsProvider.java | 156 +++++++++++++++++ ...rsistentLuceneIndexAddonTermsProvider.java | 17 ++ .../PersistentLuceneStringMapProvider.java | 17 ++ .../PersistentStringIndexMapProvider.java | 163 ++++++++++++++++++ .../ReversedDoubleMapProvider.java | 10 ++ .../es/sharedresources/StringIndex.java | 25 +++ 16 files changed, 671 insertions(+), 119 deletions(-) create mode 100644 jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/SingleAddonTermsFilter.java create mode 100644 jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/LuceneIndex.java delete mode 100644 jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/MapDBReversedDoubleMapProvider.java create mode 100644 jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentIndexAddonTermsProvider.java create mode 100644
jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentLuceneIndexAddonTermsProvider.java create mode 100644 jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentLuceneStringMapProvider.java create mode 100644 jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java create mode 100644 jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/StringIndex.java diff --git a/jcore-elasticsearch-consumer/README.md b/jcore-elasticsearch-consumer/README.md index a034187c7..c8e69c1da 100644 --- a/jcore-elasticsearch-consumer/README.md +++ b/jcore-elasticsearch-consumer/README.md @@ -1,9 +1,9 @@ # JCoRe ElasticSearchConsumer -**Descriptor Path**: +**Descriptor Paths**: ``` -.jcore-elasticsearch-consumer.src.main.resources.de.julielab.jcore.consumer.es.desc.jcore-elasticsearch-consumer -.jcore-elasticsearch-consumer.src.main.resources.de.julielab.jcore.consumer.es.desc.jcore-json-consumer +de.julielab.jcore.consumer.es.desc.jcore-elasticsearch-consumer +de.julielab.jcore.consumer.es.desc.jcore-json-consumer ``` ### Objective diff --git a/jcore-elasticsearch-consumer/pom.xml b/jcore-elasticsearch-consumer/pom.xml index 57f9452c2..cd72f8d4f 100644 --- a/jcore-elasticsearch-consumer/pom.xml +++ b/jcore-elasticsearch-consumer/pom.xml @@ -76,7 +76,6 @@ org.mapdb mapdb 3.0.7 - provided org.testng diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/AbstractFieldGenerator.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/AbstractFieldGenerator.java index 54e1c91d7..287cd68ea 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/AbstractFieldGenerator.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/AbstractFieldGenerator.java @@ -499,8 +499,8 @@ else if 
(fieldValues.size() == 1) * featurePaths and reset once per featurePath. * * @param a - * @param featurePath - * @param f + * @param featurePaths + * @param filters * @return * @throws CASException */ diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/AddonTermsFilter.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/AddonTermsFilter.java index b37e52348..9114109be 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/AddonTermsFilter.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/AddonTermsFilter.java @@ -6,33 +6,36 @@ public class AddonTermsFilter extends AbstractFilter { - private Map addonTerms; + private Map addonTerms; - public AddonTermsFilter(Map addonTerms) { - this.addonTerms = addonTerms; - } + public AddonTermsFilter(Map addonTerms) { + this.addonTerms = addonTerms; + } - @Override - public List filter(String input) { - newOutput(); - if (null != input) { - output.add(input); - String[] hypernymArray = addonTerms.get(input); - if (null != hypernymArray) { - output = new ArrayList<>(hypernymArray.length + 1); - output.add(input); - for (int i = 0; i < hypernymArray.length; i++) { - String hypernym = hypernymArray[i]; - output.add(hypernym); - } - } - } - return output; - } + @Override + public List filter(String input) { + newOutput(); + if (null != input) { + output.add(input); + String[] addonArray = addonTerms.get(input); + if (null != addonArray) { + // Only create a new output array when the default ArrayList size can't hold all the elements + if (addonArray.length >= 10) { + output = new ArrayList<>(addonArray.length + 1); + output.add(input); + } + for (int i = 0; i < addonArray.length; i++) { + String addonTerm = addonArray[i]; + output.add(addonTerm); + } + } + } + return output; + } - @Override - public Filter copy() { - return new AddonTermsFilter(addonTerms); - } + @Override + public 
Filter copy() { + return new AddonTermsFilter(addonTerms); + } } diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/SingleAddonTermsFilter.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/SingleAddonTermsFilter.java new file mode 100644 index 000000000..1e83f2b9f --- /dev/null +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/filter/SingleAddonTermsFilter.java @@ -0,0 +1,35 @@ +package de.julielab.jcore.consumer.es.filter; + +import java.util.List; +import java.util.Map; + +/** + *

Like {@link AddonTermsFilter} but accepts single string values instead of string arrays.

+ */ +public class SingleAddonTermsFilter extends AbstractFilter { + + private Map addonTerms; + + public SingleAddonTermsFilter(Map addonTerms) { + this.addonTerms = addonTerms; + } + + @Override + public List filter(String input) { + newOutput(); + if (null != input) { + output.add(input); + String addonTerm = addonTerms.get(input); + if (null != addonTerm) { + output.add(addonTerm); + } + } + return output; + } + + @Override + public Filter copy() { + return new SingleAddonTermsFilter(addonTerms); + } + +} diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AbstractMapProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AbstractMapProvider.java index fdc15aaa1..7a181d55a 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AbstractMapProvider.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AbstractMapProvider.java @@ -4,7 +4,6 @@ import org.apache.uima.resource.DataResource; import org.apache.uima.resource.ResourceInitializationException; import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.BufferedReader; import java.io.IOException; @@ -12,11 +11,25 @@ import java.util.HashMap; import java.util.Map; - +/** + *

Base class for resources that map one term to another. Uses a HashMap. The trivial instantiable subclass is {@link MapProvider}.

+ *

This class is abstract because it is generic. To work with data types other than strings, subclasses override the {@link #getKey(String)} and {@link #getValue(String)} + * methods to deliver the correct data types from the string input.

+ *

Some subclasses handle maps whose keys and/or values are numbers rather than strings. Other subclasses handle + * strings but use a persistent data structure to cope with very large maps.

+ * + * @param + * @param + */ public abstract class AbstractMapProvider implements IMapProvider { - private final static Logger log = LoggerFactory.getLogger(AbstractMapProvider.class); + protected final Logger log; protected boolean reverse = false; - private HashMap map; + protected Map map; + + public AbstractMapProvider(Logger log) { + this.log = log; + map = new HashMap<>(); + } @Override public void load(DataResource aData) throws ResourceInitializationException { @@ -44,16 +57,11 @@ public void load(DataResource aData) throws ResourceInitializationException { throw new IllegalArgumentException("Format error in map file: Expected format is 'originalValuemappedValue' but the input line '" + line + "' has " + split.length + " columns."); if (reverse) - map.put(getKey(split[1]), getValue(split[0])); + put(getKey(split[1]), getValue(split[0])); else - map.put(getKey(split[0]), getValue(split[1])); + put(getKey(split[0]), getValue(split[1])); } - log.info("Finished reading resource {}", aData.getUri()); - log.info("Copying {} values into a fresh HashMap of the exact correct size", map.size()); - HashMap tmp = new HashMap<>(map.size(), 1f); - tmp.putAll(map); - map = tmp; - log.info("Done."); + log.info("Finished reading resource {} and got {} elements.", aData.getUri(), map.size()); } catch (IOException e) { throw new ResourceInitializationException(e); } finally { @@ -66,6 +74,8 @@ public void load(DataResource aData) throws ResourceInitializationException { } } + protected abstract void put(K key, V value); + protected abstract V getValue(String valueString); protected abstract K getKey(String keyString); diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AddonTermsProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AddonTermsProvider.java index 7b4adb2d0..5118d8be4 100644 --- 
a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AddonTermsProvider.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AddonTermsProvider.java @@ -6,24 +6,37 @@ import org.apache.uima.resource.DataResource; import org.apache.uima.resource.ResourceInitializationException; import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.InputStream; +import java.net.URI; import java.util.HashMap; import java.util.List; import java.util.Map; +/** + *

Base class for addon terms (i.e. terms to be added to some key term, like synonyms or hypernyms) that uses a HashMap.

+ *

Subclasses of this class use other data structures to store and retrieve the addon terms. Useful for large numbers of such terms.

+ */ public class AddonTermsProvider implements IAddonTermsProvider { - Logger log = LoggerFactory.getLogger(AddonTermsProvider.class); + protected final Logger log; - private Map addonTerms; + protected Map addonTerms; + + public AddonTermsProvider(Logger log) { + this.log = log; + addonTerms = new HashMap<>(); + } + + protected void put(String term, String[] addonArray) { + addonTerms.put(term, addonArray); + } @Override public void load(DataResource aData) throws ResourceInitializationException { try { - addonTerms = new HashMap<>(); - log.info("Loading addon terms from " + aData.getUri()); + URI uri = aData.getUri(); + log.info("Loading addon terms from " + uri); int addons = 0; InputStream inputStream; try { @@ -56,7 +69,7 @@ public void load(DataResource aData) throws ResourceInitializationException { addonArray[i] = trimmedAddon.intern(); addons++; } - addonTerms.put(term, addonArray); + put(term, addonArray); } log.info("Loaded {} addons for {} terms.", addons, addonTerms.size()); } catch (IOException e) { diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/LuceneIndex.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/LuceneIndex.java new file mode 100644 index 000000000..204f07abb --- /dev/null +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/LuceneIndex.java @@ -0,0 +1,160 @@ +package de.julielab.jcore.consumer.es.sharedresources; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StoredField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.*; +import org.apache.lucene.search.*; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.NIOFSDirectory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.Arrays; + 
+public class LuceneIndex implements StringIndex { + private final static Logger log = LoggerFactory.getLogger(LuceneIndex.class); + private IndexWriter iw; + private final FSDirectory directory; + private IndexSearcher searcher; + + public LuceneIndex(String indexDirectory) { + try { + Path lucene = Path.of(indexDirectory); + directory = NIOFSDirectory.open(lucene); + // Do not open a writer to an existing index. This causes locking issues when starting multiple + // pipelines in parallel. + // Of course, the first pipeline still needs to create the index, so this must be a one-time effort + // that has to be completed before the other pipelines are started. + if (!lucene.toFile().exists()) { + IndexWriterConfig iwc = new IndexWriterConfig(); + iw = new IndexWriter(directory, iwc); + } + } catch (IOException e) { + log.error("could not initialize Lucene index", e); + throw new IllegalStateException(e); + } + } + + @Override + public String get(String key) { + TermQuery tq = new TermQuery(new Term("key", key)); + BooleanQuery.Builder b = new BooleanQuery.Builder(); + b.add(tq, BooleanClause.Occur.FILTER); + BooleanQuery q = b.build(); + try { + TopDocs topDocs = searcher.search(q, 1); + if (topDocs.scoreDocs.length > 0) { + Document doc = searcher.getIndexReader().document(topDocs.scoreDocs[0].doc); + return doc.getField("value").stringValue(); + } + } catch (IOException e) { + log.error("Could not retrieve results for '{}' in Lucene index.", key, e); + throw new IllegalStateException(e); + } + return null; + } + + @Override + public String[] getArray(String key) { + TermQuery tq = new TermQuery(new Term("key", key)); + BooleanQuery.Builder b = new BooleanQuery.Builder(); + b.add(tq, BooleanClause.Occur.FILTER); + BooleanQuery q = b.build(); + try { + TopDocs topDocs = searcher.search(q, 1); + if (topDocs.scoreDocs.length > 0) { + Document doc = searcher.getIndexReader().document(topDocs.scoreDocs[0].doc); + return 
Arrays.stream(doc.getFields("value")).map(IndexableField::stringValue).toArray(String[]::new); + } + } catch (IOException e) { + log.error("Could not retrieve results for '{}' in Lucene index.", key, e); + throw new IllegalStateException(e); + } + return null; + } + + @Override + public void put(String key, String value) { + Field keyField = new StringField("key", key, Field.Store.NO); + Field valueField = new StoredField("value", value); + Document doc = new Document(); + doc.add(keyField); + doc.add(valueField); + try { + iw.addDocument(doc); + } catch (IOException e) { + log.error("Could not index key-value pair {}:{} with Lucene", key, value, e); + throw new IllegalStateException(e); + } + } + + @Override + public void put(String key, String[] value) { + Field keyField = new StringField("key", key, Field.Store.NO); + Document doc = new Document(); + doc.add(keyField); + for (var v : value) + doc.add(new StoredField("value", v)); + try { + iw.addDocument(doc); + } catch (IOException e) { + log.error("Could not index key-value pair {}:{} with Lucene", key, value, e); + throw new IllegalStateException(e); + } + } + + @Override + public void commit() { + try { + iw.commit(); + } catch (IOException e) { + log.error("Could not commit Lucene index", e); + throw new IllegalStateException(e); + } + } + + @Override + public boolean requiresExplicitCommit() { + return true; + } + + @Override + public void close() { + try { + if (searcher != null) { + searcher.getIndexReader().close(); + searcher = null; + } + if (iw != null) { + iw.close(); + iw = null; + } + } catch (IOException e) { + log.error("Could not close Lucene index reader.", e); + throw new IllegalStateException(e); + } + } + + @Override + public void open() { + try { + searcher = new IndexSearcher(DirectoryReader.open(directory)); + } catch (IOException e) { + log.error("Could not open Lucene index searcher.", e); + throw new IllegalStateException(e); + } + } + + @Override + public int size() { + if (iw != 
null && iw.isOpen()) + return iw.numDocs(); + else if (searcher != null) + return searcher.getIndexReader().numDocs(); + return 0; + } +} diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/MapDBReversedDoubleMapProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/MapDBReversedDoubleMapProvider.java deleted file mode 100644 index a12a082a5..000000000 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/MapDBReversedDoubleMapProvider.java +++ /dev/null @@ -1,70 +0,0 @@ -package de.julielab.jcore.consumer.es.sharedresources; - -import de.julielab.jcore.utility.JCoReTools; -import org.apache.uima.resource.DataResource; -import org.apache.uima.resource.ResourceInitializationException; -import org.mapdb.DB; -import org.mapdb.DBMaker; -import org.mapdb.Serializer; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.Map; - -public class MapDBReversedDoubleMapProvider implements IMapProvider { - - private Map map; - - @Override - public void load(DataResource aData) throws ResourceInitializationException { - BufferedReader br = null; - try { - final DB filedb = DBMaker.tempFileDB().fileMmapEnableIfSupported().cleanerHackEnable().closeOnJvmShutdownWeakReference().make(); - map = filedb.hashMap("JCoReElasticSearchReverseMapProvider"). - keySerializer(Serializer.STRING).valueSerializer(Serializer.DOUBLE). 
- create(); - InputStreamReader is; - try { - is = new InputStreamReader(JCoReTools.resolveExternalResourceGzipInputStream(aData)); - } catch (Exception e) { - throw new IOException("Resource " + aData.getUri() + " not found"); - } - br = new BufferedReader(is); - String line; - String splitExpression = "\t"; - while ((line = br.readLine()) != null) { - if (line.trim().length() == 0 || line.startsWith("#")) - continue; - String[] split = line.split(splitExpression); - if (split.length != 2) { - splitExpression = "\\s+"; - split = line.split(splitExpression); - } - if (split.length != 2) - throw new IllegalArgumentException("Format error in map file: Expected format is 'originalValuemappedValue' but the input line '" + line - + "' has " + split.length + " columns."); - map.put(split[1].trim(), Double.parseDouble(split[0].trim())); - } - } catch (IOException e) { - throw new ResourceInitializationException(e); - } finally { - try { - if (null != br) - br.close(); - } catch (IOException e) { - throw new ResourceInitializationException(e); - } - } - - } - - /** - * Returns the loaded map. All strings - keys and values - are internalized. 
- */ - @Override - public Map getMap() { - return map; - } - -} diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/MapProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/MapProvider.java index ebd90f8ed..0b8393ed7 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/MapProvider.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/MapProvider.java @@ -1,6 +1,20 @@ package de.julielab.jcore.consumer.es.sharedresources; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + public class MapProvider extends AbstractMapProvider { + private final static Logger log = LoggerFactory.getLogger(MapProvider.class); + + public MapProvider() { + super(log); + } + + @Override + protected void put(String key, String value) { + map.put(key, value); + } + @Override protected String getValue(String valueString) { return valueString.trim().intern(); diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentIndexAddonTermsProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentIndexAddonTermsProvider.java new file mode 100644 index 000000000..b20d466ef --- /dev/null +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentIndexAddonTermsProvider.java @@ -0,0 +1,156 @@ +package de.julielab.jcore.consumer.es.sharedresources; + +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; +import org.apache.commons.io.FilenameUtils; +import org.apache.commons.lang3.NotImplementedException; +import org.apache.uima.resource.DataResource; +import org.apache.uima.resource.ResourceInitializationException; +import org.jetbrains.annotations.NotNull; +import 
org.jetbrains.annotations.Nullable; +import org.slf4j.Logger; + +import java.io.File; +import java.net.MalformedURLException; +import java.net.URI; +import java.time.Duration; +import java.util.Collection; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.ExecutionException; + +/** + * Reads the original input file and converts it into a persistent index. This index is re-used in subsequent pipeline runs. + */ +abstract public class PersistentIndexAddonTermsProvider extends AddonTermsProvider { + public static final int MAXIMUM_MEMCACHE_SIZE = 10000; + private final LoadingCache> cache; + private StringIndex index; + + public PersistentIndexAddonTermsProvider(Logger log) { + super(log); + addonTerms = new Map<>() { + @Override + public int size() { + return index.size(); + } + + @Override + public boolean isEmpty() { + throw new NotImplementedException(); + } + + @Override + public boolean containsKey(Object key) { + throw new NotImplementedException(); + } + + @Override + public boolean containsValue(Object value) { + throw new NotImplementedException(); + } + + @Override + public String[] get(Object key) { + try { + return cache.get((String) key).orElse(null); + } catch (ExecutionException e) { + log.error("Could not retrieve value from the cache for key '{}'.", key); + throw new IllegalStateException(); + } + } + + @Nullable + @Override + public String[] put(String key, String[] value) { + throw new NotImplementedException(); + } + + @Override + public String[] remove(Object key) { + throw new NotImplementedException(); + } + + @Override + public void putAll(@NotNull Map m) { + throw new NotImplementedException(); + } + + @Override + public void clear() { + throw new NotImplementedException(); + } + + @NotNull + @Override + public Set keySet() { + throw new NotImplementedException(); + } + + @NotNull + @Override + public Collection values() { + throw new NotImplementedException(); + } + + @NotNull + @Override 
+ public Set> entrySet() { + throw new NotImplementedException(); + } + }; + cache = CacheBuilder.newBuilder().maximumSize(MAXIMUM_MEMCACHE_SIZE).expireAfterAccess(Duration.ofHours(1)).build(new CacheLoader<>() { + @Override + public Optional load(String s) { + return Optional.ofNullable(index.getArray(s)); + } + }); + } + + protected abstract StringIndex initializeIndex(String cachePath); + + @Override + public void load(DataResource aData) throws ResourceInitializationException { + // prepare the persistent index + URI uri = aData.getUri(); + File indexFile; + boolean loadData = true; + try { + File resourceFile = new File(uri); + String resourceFileName = FilenameUtils.getName(uri.toURL().getPath()); + indexFile = new File("es-consumer-cache", resourceFileName); + if (resourceFile.exists() && indexFile.exists() && resourceFile.lastModified() > indexFile.lastModified()) { + log.info("Resource file {} is newer than the existing cached index at {}. Creating new index.", resourceFile, indexFile); + indexFile.delete(); + } else { + boolean indexFileExisted = indexFile.exists(); + if (!indexFileExisted) { + log.info("Creating persistent cache for resource {} at {}.", uri, indexFile); + } + else { + log.info("Using existing persistent cache {} for resource {}.", indexFile, uri); + loadData = false; + } + } + index = initializeIndex(indexFile.getAbsolutePath()); + } catch (MalformedURLException e) { + log.error("Could obtain file name from resource URI '{}'", uri, e); + throw new IllegalStateException(e); + } + if (loadData) { + super.load(aData); + if (index.requiresExplicitCommit()) + index.commit(); + } + index.close(); + index.open(); + log.info("There are {} entries in the cache at {}.", index.size(), indexFile); + } + + @Override + protected void put(String term, String[] addonArray) { + index.put(term, addonArray); + } +} diff --git 
a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentLuceneIndexAddonTermsProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentLuceneIndexAddonTermsProvider.java new file mode 100644 index 000000000..40ac75e83 --- /dev/null +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentLuceneIndexAddonTermsProvider.java @@ -0,0 +1,17 @@ +package de.julielab.jcore.consumer.es.sharedresources; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class PersistentLuceneIndexAddonTermsProvider extends PersistentIndexAddonTermsProvider{ + private final static Logger log = LoggerFactory.getLogger(PersistentLuceneIndexAddonTermsProvider.class); + + public PersistentLuceneIndexAddonTermsProvider() { + super(log); + } + + @Override + protected StringIndex initializeIndex(String cachePath) { + return new LuceneIndex(cachePath); + } +} diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentLuceneStringMapProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentLuceneStringMapProvider.java new file mode 100644 index 000000000..c49ed7350 --- /dev/null +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentLuceneStringMapProvider.java @@ -0,0 +1,17 @@ +package de.julielab.jcore.consumer.es.sharedresources; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class PersistentLuceneStringMapProvider extends PersistentStringIndexMapProvider { + private final static Logger log = LoggerFactory.getLogger(PersistentLuceneStringMapProvider.class); + + public PersistentLuceneStringMapProvider() { + super(log); + } + + @Override + protected StringIndex initializeIndex(String cachePath) { + return new LuceneIndex(cachePath); + } +} diff --git 
a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java new file mode 100644 index 000000000..93dd296f2 --- /dev/null +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java @@ -0,0 +1,163 @@ +package de.julielab.jcore.consumer.es.sharedresources; + +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; +import org.apache.commons.io.FilenameUtils; +import org.apache.commons.lang3.NotImplementedException; +import org.apache.uima.resource.DataResource; +import org.apache.uima.resource.ResourceInitializationException; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; +import org.slf4j.Logger; + +import java.io.File; +import java.net.MalformedURLException; +import java.net.URI; +import java.time.Duration; +import java.util.Collection; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.ExecutionException; + +abstract public class PersistentStringIndexMapProvider extends AbstractMapProvider { + public static final int MAXIMUM_MEMCACHE_SIZE = 10000; + private final LoadingCache> cache; + private StringIndex index; + + public PersistentStringIndexMapProvider(Logger log) { + super(log); + map = new Map<>() { + @Override + public int size() { + return index.size(); + } + + @Override + public boolean isEmpty() { + throw new NotImplementedException(); + } + + @Override + public boolean containsKey(Object key) { + throw new NotImplementedException(); + } + + @Override + public boolean containsValue(Object value) { + throw new NotImplementedException(); + } + + @Override + public String get(Object key) { + try { + return 
cache.get((String) key).orElse(null); + } catch (ExecutionException e) { + log.error("Could not retrieve value from the cache for key '{}'.", key); + throw new IllegalStateException(); + } + } + + @Nullable + @Override + public String put(String key, String value) { + throw new NotImplementedException(); + } + + @Override + public String remove(Object key) { + throw new NotImplementedException(); + } + + @Override + public void putAll(@NotNull Map m) { + throw new NotImplementedException(); + } + + @Override + public void clear() { + throw new NotImplementedException(); + } + + @NotNull + @Override + public Set keySet() { + throw new NotImplementedException(); + } + + @NotNull + @Override + public Collection values() { + throw new NotImplementedException(); + } + + @NotNull + @Override + public Set> entrySet() { + throw new NotImplementedException(); + } + }; + cache = CacheBuilder.newBuilder().maximumSize(MAXIMUM_MEMCACHE_SIZE).expireAfterAccess(Duration.ofHours(1)).build(new CacheLoader<>() { + @Override + public Optional load(String s) { + return Optional.ofNullable(index.get(s)); + } + }); + } + + @Override + protected void put(String key, String value) { + index.put(key, value); + } + + protected abstract StringIndex initializeIndex(String cachePath); + + @Override + public void load(DataResource aData) throws ResourceInitializationException { + // prepare the persistent index + URI uri = aData.getUri(); + File indexFile; + boolean loadData = true; + try { + File resourceFile = new File(uri); + String resourceFileName = FilenameUtils.getName(uri.toURL().getPath()); + indexFile = new File("es-consumer-cache", resourceFileName); + if (resourceFile.exists() && indexFile.exists() && resourceFile.lastModified() > indexFile.lastModified()) { + log.info("Resource file {} is newer than the existing cached index at {}. 
Creating new index.", resourceFile, indexFile); + indexFile.delete(); + } else { + boolean indexFileExisted = indexFile.exists(); + if (!indexFileExisted) { + log.info("Creating persistent cache for resource {} at {}.", uri, indexFile); + } + else { + log.info("Using existing persistent cache {} for resource {}.", indexFile, uri); + loadData = false; + } + } + index = initializeIndex(indexFile.getAbsolutePath()); + } catch (MalformedURLException e) { + log.error("Could obtain file name from resource URI '{}'", uri, e); + throw new IllegalStateException(e); + } + if (loadData) { + super.load(aData); + if (index.requiresExplicitCommit()) + index.commit(); + } + index.close(); + index.open(); + log.info("There are {} entries in the cache at {}.", index.size(), indexFile); + } + + @Override + protected String getValue(String valueString) { + return valueString; + } + + @Override + protected String getKey(String keyString) { + return keyString; + } +} diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/ReversedDoubleMapProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/ReversedDoubleMapProvider.java index d9caa600a..fc1184319 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/ReversedDoubleMapProvider.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/ReversedDoubleMapProvider.java @@ -1,11 +1,21 @@ package de.julielab.jcore.consumer.es.sharedresources; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + public class ReversedDoubleMapProvider extends AbstractMapProvider { + private final static Logger log = LoggerFactory.getLogger(ReversedDoubleMapProvider.class); public ReversedDoubleMapProvider() { + super(log); this.reverse = true; } + @Override + protected void put(String key, Double value) { + map.put(key, value); + } + @Override protected Double 
getValue(String valueString) { return Double.parseDouble(valueString.trim()); diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/StringIndex.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/StringIndex.java new file mode 100644 index 000000000..733dcc213 --- /dev/null +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/StringIndex.java @@ -0,0 +1,25 @@ +package de.julielab.jcore.consumer.es.sharedresources; + +public interface StringIndex { + String get(String key); + + String[] getArray(String key); + + void put(String key, String value); + + void put(String key, String[] value); + + void commit(); + + boolean requiresExplicitCommit(); + + void close(); + + void open(); + + int size(); + + default String getName() { + return getClass().getSimpleName(); + } +} From efaa7e1d7c693a77344af0b797e00515a35cdead Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 28 Jul 2021 14:33:46 +0200 Subject: [PATCH 087/269] DocumentReleaseCheckpoint: Fixed a bug where documents were not marked as being processed. The issue was that we counted how often a document was released and compared that number to the number of registered components. However, when a component de-registered itself but had released its documents before, their count was actually too high. The remedy - without introducing other sources of error - was to not just count the number of releases per document but to explicitly track the componentIds that had released each document.
--- .../jcore/ae/checkpoint/DBCheckpointAE.java | 19 +++++++++++--- .../checkpoint/DocumentReleaseCheckpoint.java | 26 +++++++++++++------ .../es/sharedresources/LuceneIndex.java | 8 +++++- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DBCheckpointAE.java b/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DBCheckpointAE.java index 264c32999..cc9b29c8d 100644 --- a/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DBCheckpointAE.java +++ b/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DBCheckpointAE.java @@ -109,7 +109,7 @@ public void batchProcessComplete() throws AnalysisEngineProcessException { @Override public void collectionProcessComplete() throws AnalysisEngineProcessException { super.collectionProcessComplete(); - log.debug("BatchProcessComplete called, stashing {} documents to be ready for marked as being finished", docIds.size()); + log.debug("CollectionProcessComplete called, stashing {} documents to be ready for marked as being finished", docIds.size()); if (indicateFinished) docReleaseCheckpoint.release(jedisSyncKey, docIds.stream()); try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { @@ -121,6 +121,7 @@ public void collectionProcessComplete() throws AnalysisEngineProcessException { } private void customBatchProcessingComplete() throws AnalysisEngineProcessException { + log.debug("CustomBatchProcessComplete called, stashing {} documents to be ready for marked as being finished", docIds.size()); if (indicateFinished) docReleaseCheckpoint.release(jedisSyncKey, docIds.stream()); try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { @@ -199,13 +200,24 @@ private void setLastComponent(CoStoSysConnection conn, String sqlMarkIsProcessed = String.format("UPDATE %s SET %s='%s', %s=TRUE, %s=FALSE WHERE %s", subsetTableName, Constants.LAST_COMPONENT, componentDbName, Constants.IS_PROCESSED, 
Constants.IN_PROCESS, primaryKeyPsString); if (!documentIdsToSetLastComponent.isEmpty()) { - log.debug("Setting the last component to {} for {} documents", componentDbName, documentIdsToSetLastComponent.size()); + log.debug("Setting the last component to '{}' for {} documents", componentDbName, documentIdsToSetLastComponent.size()); updateSubsetTable(conn, documentIdsToSetLastComponent, sqlSetLastComponent); } if (markIsProcessed) { - log.debug("Marking {} documents to having been processed by component \"{}\".", documentIdsToSetLastComponent.size(), componentDbName); + log.debug("Marking {} documents to having been processed by component \"{}\".", processedDocumentIds.size(), componentDbName); + log.debug("SQL: {}", sqlMarkIsProcessed); updateSubsetTable(conn, processedDocumentIds, sqlMarkIsProcessed); } + try { + log.debug("Connection is auto commit: {}", conn.getAutoCommit()); + if (!conn.getAutoCommit()) { + log.debug("Committing changes"); + conn.commit(); + } + } catch (SQLException e) { + log.error("Could not commit the document processing status changes.", e); + throw new AnalysisEngineProcessException(e); + } } private void updateSubsetTable(CoStoSysConnection conn, Collection documentIdsToMark, String sql) throws AnalysisEngineProcessException { @@ -222,6 +234,7 @@ private void updateSubsetTable(CoStoSysConnection conn, Collection d ps.addBatch(); } try { + log.debug("Executing SQL command batch for being processed."); ps.executeBatch(); } catch (BatchUpdateException e) { if (e.getMessage().contains("deadlock detected")) { diff --git a/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DocumentReleaseCheckpoint.java b/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DocumentReleaseCheckpoint.java index cb94a8aa3..fd40fa5e1 100644 --- a/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DocumentReleaseCheckpoint.java +++ 
b/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DocumentReleaseCheckpoint.java @@ -1,11 +1,11 @@ package de.julielab.jcore.ae.checkpoint; -import com.google.common.collect.HashMultiset; -import com.google.common.collect.Multiset; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.HashMap; import java.util.HashSet; +import java.util.Map; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -33,18 +33,18 @@ public class DocumentReleaseCheckpoint { "This is useful when document data is sent batchwise to the database by multiple components: In the case of a crash or manual cancellation of a pipeline run without synchronization is might happen " + "that some components have sent their data and others haven't at the time of termination. To avoid an inconsistent database state," + "a document will only be marked as finished " + - "processed in the JeDIS subset table if all synchronied components in the pipeline have released the document. " + + "processed in the JeDIS subset table if all synchronized components in the pipeline have released the document. " + "This is done by the DBCheckpointAE which must be at the end of the pipeline and have the 'IndicateFinished' parameter set to 'true'. 
" + "Synchronized components are those that disclose this parameter and have a value set to it."; public static final String PARAM_JEDIS_SYNCHRONIZATION_KEY = "JedisSynchronizationKey"; private final static Logger log = LoggerFactory.getLogger(DocumentReleaseCheckpoint.class); private static DocumentReleaseCheckpoint checkpoint; - private Multiset releasedDocuments; + private Map> releasedDocuments; private Set registeredComponents; private long lastwarning = 1000; private DocumentReleaseCheckpoint() { - releasedDocuments = HashMultiset.create(); + releasedDocuments = new HashMap<>(); registeredComponents = new HashSet<>(); } @@ -83,7 +83,15 @@ public void release(String componentKey, Stream releasedDocumentIds) if (!registeredComponents.contains(componentKey)) throw new IllegalArgumentException("No component is registered for key " + componentKey); synchronized (releasedDocuments) { - releasedDocumentIds.forEach(d -> releasedDocuments.add(d)); + releasedDocumentIds.forEach(d -> releasedDocuments.compute(d, (k, v) -> { + if (v == null) { + Set ret = new HashSet<>(); + ret.add(componentKey); + return ret; + } + v.add(componentKey); + return v; + })); } } @@ -100,9 +108,11 @@ public Set getReleasedDocumentIds() { // Get all documents released by all components Set returnedIds; synchronized (releasedDocuments) { - returnedIds = this.releasedDocuments.elementSet().stream().filter(e -> this.releasedDocuments.count(e) == getNumberOfRegisteredComponents()).collect(Collectors.toSet()); + log.trace("The following {} components are registered for document release: {}", getNumberOfRegisteredComponents(), registeredComponents); + log.trace("Released document counts: {}", this.releasedDocuments); + returnedIds = this.releasedDocuments.keySet().stream().filter(k -> this.releasedDocuments.get(k).containsAll(this.registeredComponents)).collect(Collectors.toSet()); // Remove the completely released documents from the pool of potentially not yet completely released documents. 
- returnedIds.forEach(id -> this.releasedDocuments.remove(id, Integer.MAX_VALUE)); + returnedIds.forEach(id -> this.releasedDocuments.remove(id)); } log.debug("Returning {} documents released by all registered components. {} document IDs remain that have not yet been released by all registered components.", returnedIds.size(), this.releasedDocuments.size()); if (this.releasedDocuments.size() > lastwarning) { diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/LuceneIndex.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/LuceneIndex.java index 204f07abb..a28c0a5c1 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/LuceneIndex.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/LuceneIndex.java @@ -11,6 +11,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.File; import java.io.IOException; import java.nio.file.Path; import java.util.Arrays; @@ -24,14 +25,19 @@ public class LuceneIndex implements StringIndex { public LuceneIndex(String indexDirectory) { try { Path lucene = Path.of(indexDirectory); + File directoryFile = lucene.toFile(); + boolean indexExists = directoryFile.exists() && directoryFile.isDirectory() && directoryFile.list().length != 0; directory = NIOFSDirectory.open(lucene); // Do not open a writer to an existing index. This causes locking issues when starting multiple // pipelines in parallel. // Of course, the first pipeline still needs to create the index, so this must be a one-time effort // that has to be completed before the other pipelines are started. 
- if (!lucene.toFile().exists()) { + if (!indexExists) { + log.debug("Creating index writer for index directory {}.", indexDirectory); IndexWriterConfig iwc = new IndexWriterConfig(); iw = new IndexWriter(directory, iwc); + } else { + log.debug("Index directory {} already"); } } catch (IOException e) { log.error("could not initialize Lucene index", e); From 273c876671e831f431f83f11d61cf2d15f599b07 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 29 Jul 2021 16:56:17 +0200 Subject: [PATCH 088/269] XMIDbWriter: Fixed a bug where old annotation values were not set to `null` in the DB table. When in a former run an annotation value was created, e.g. because some named entities were found, and in the new run no entities were found, the old value was not removed. This is fixed now. --- .../jcore/reader/xmi/XmiDBMultiplier.java | 5 +++-- .../julielab/jcore/consumer/xmi/XMIDBWriter.java | 2 +- .../jcore/consumer/xmi/XmiDataInserter.java | 15 +++++++++------ 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java index a29dcb8dd..cb0306216 100644 --- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java +++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java @@ -90,8 +90,9 @@ public AbstractCas next() throws AnalysisEngineProcessException { populateCas(jCas); } } catch (Throwable throwable) { - log.error("Error while reading document from the database: ", throwable); - throw throwable; + log.error("Error while reading document from the database. Releasing the CAS. 
", throwable); + jCas.release(); + throw new AnalysisEngineProcessException(throwable); } return jCas; } diff --git a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java index 3596db300..b9594dda3 100644 --- a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java +++ b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java @@ -737,7 +737,7 @@ private void createAnnotationModules() throws AnalysisEngineProcessException { // adapt the map keys to table names (currently, the keys are the // Java type names) splitXmiData = convertModuleLabelsToColumnNames(splitXmiData); - + log.trace("The following columns have XMI data: {}", splitXmiData.keySet()); for (String columnName : splitXmiData.keySet()) { boolean isBaseDocumentColumn = columnName.equals(XmiSplitConstants.BASE_DOC_COLUMN); diff --git a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java index 080ffd613..1a75f474e 100644 --- a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java +++ b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java @@ -19,6 +19,7 @@ import java.util.*; import java.util.function.Function; import java.util.stream.Collectors; +import java.util.stream.Stream; public class XmiDataInserter { @@ -65,16 +66,16 @@ public XmiDataInserter(Set annotationModuleColumnNames, * @throws AnalysisEngineProcessException */ public void sendXmiDataToDatabase(String xmiTableName, List annotationModules, String subsetTableName, Boolean storeBaseDocument, Boolean deleteObsolete, Map shaMap) throws XmiDataInsertionException { - if (log.isTraceEnabled()) { - log.trace("Sending XMI data for {} tables to the database", annotationModules.size()); - log.trace("Sending 
{} XMI data items", annotationModules.size()); - } + log.trace("Sending {} XMI data items", annotationModules.size()); final Map> dataByDoc = annotationModules.stream().collect(Collectors.groupingBy(XmiData::getDocId)); // Collect all document IDs we want to add something for into the database. This can be annotations or the hash. - final Set documentIdsWithValues = shaMap != null ? Sets.union(dataByDoc.keySet(), shaMap.keySet()) : dataByDoc.keySet(); + final Set documentIdsWithData = shaMap != null ? Sets.union(dataByDoc.keySet(), shaMap.keySet()) : dataByDoc.keySet(); + log.trace("There are {} documents with values to be updated in the database.", documentIdsWithData.size()); class RowIterator implements Iterator> { - private Iterator docIdIterator = documentIdsWithValues.iterator(); + // Add documents that have been processed but no data. We need to do this to override potentially existing + // annotation values with null to remove them. + private Iterator docIdIterator = Stream.concat(documentIdsWithData.stream(), processedDocumentIds.stream()).distinct().iterator(); private FieldConfig fieldConfig = dbc.getFieldConfiguration(schemaDocument); private List> fields = fieldConfig.getFields(); @@ -141,7 +142,9 @@ public Map next() { // Set columns without a value to null to delete a potentially existing value. 
if (updateMode) { Set annotationColumnsWithValues = dataList.stream().map(XmiData::getColumnName).collect(Collectors.toSet()); + log.trace("Annotation columns with values: {}", annotationColumnsWithValues); final Sets.SetView columnsWithoutValues = Sets.difference(annotationModuleColumnNames, annotationColumnsWithValues); + log.trace("Annotation columns without values: {}", columnsWithoutValues); columnsWithoutValues.forEach(col -> { row.put(col, null); log.trace("{}=null", col); From d6dd6513cb4e99b727191cc2f74e2dc375876ae4 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 20 Sep 2021 13:30:26 +0200 Subject: [PATCH 089/269] Support feature values for annotation adder. Fixes #124 --- jcore-annotation-adder-ae/README.md | 2 +- .../AnnotationAdderHelper.java | 33 +++++++++++-- .../TextAnnotationListAdder.java | 3 +- .../annotationformat/AnnotationFormat.java | 2 + .../DocumentClassAnnotationFormat.java | 5 ++ .../SimpleTSVEntityAnnotationFormat.java | 21 +++++++- ...tyWithDocumentTextShaAnnotationFormat.java | 5 ++ .../ExternalTextAnnotation.java | 6 +++ .../FileAnnotationSource.java | 3 +- .../InMemoryFileTextAnnotationProvider.java | 3 ++ .../AnnotationAdderAnnotatorTest.java | 48 ++++++++++++++++++- .../AnnotationAdderHelperTest.java | 24 ++++++++++ .../SimpleTSVEntityAnnotationFormatTest.java | 27 +++++++++++ ...eannotations_character_offsets_payload.tsv | 4 ++ 14 files changed, 176 insertions(+), 10 deletions(-) create mode 100644 jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelperTest.java create mode 100644 jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormatTest.java create mode 100644 jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets_payload.tsv diff --git a/jcore-annotation-adder-ae/README.md b/jcore-annotation-adder-ae/README.md index bf3d32b2c..cf0a558ff 100644 --- a/jcore-annotation-adder-ae/README.md +++ 
b/jcore-annotation-adder-ae/README.md @@ -28,7 +28,7 @@ For document class annotations, no offset mode is required, obviously. Whether t **3. External Resource Dependencies** -This component requires an external resource given with the `AnnotationSource` key. This dependency definition is present in the provided default descriptor. +This component requires an external resource given with the `AnnotationSource` key. This dependency definition is pre-configured in the provided default descriptor and must be adapted to point to the correct annotation source. The external dependency may currently be a file which is read completely into an in-memory map by the `de.julielab.jcore.ae.annotationadder.annotationsources.InMemoryFileTextAnnotationProvider` class for textual annotations with offsets or by the `de.julielab.jcore.ae.annotationadder.annotationsources.InMemoryFileDocumentClassAnnotationProvider` class for document classes. Both provider classes implement the required external resource interface `de.julielab.jcore.ae.annotationadder.annotationsources.AnnotationProvider`. 
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelper.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelper.java index 97a2d8447..a3c87e749 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelper.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelper.java @@ -1,8 +1,10 @@ package de.julielab.jcore.ae.annotationadder; +import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation; import de.julielab.jcore.ae.annotationadder.annotationrepresentations.TextAnnotation; import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; +import org.apache.commons.lang3.StringUtils; import org.apache.uima.cas.CASException; import org.apache.uima.cas.FSIterator; import org.apache.uima.fit.util.JCasUtil; @@ -11,10 +13,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -29,7 +30,10 @@ public class AnnotationAdderHelper { private Map> tokensBySentences; private Matcher wsFinder = Pattern.compile("\\s").matcher(""); private Matcher nonWsMatcher = Pattern.compile("[^\\s]+").matcher(""); - + /** + * Caches methods for feature + */ + private Map featureSetters; public void setAnnotationOffsetsRelativeToDocument(Annotation annotation, TextAnnotation a, AnnotationAdderConfiguration configuration) throws CASException, AnnotationOffsetException { if (configuration.getOffsetMode() == AnnotationAdderAnnotator.OffsetMode.CHARACTER) { @@ -140,4 +144,23 @@ public List createTokenList(JCas jCas, 
AnnotationAdderConfiguration confi } return tokenList; } + + public void setAnnotationPayloadsToFeatures(Annotation annotation, ExternalTextAnnotation a) { + Collection keys = a.getPayloadKeys(); + if (!keys.isEmpty()) + featureSetters = new HashMap<>(); + try { + for (String key : keys) { + Object value = a.getPayload(key); + Method setter = featureSetters.get(key); + if (setter == null) { + setter = annotation.getClass().getMethod("set" + StringUtils.capitalize(key), value.getClass()); + featureSetters.put(key, setter); + } + setter.invoke(annotation, value); + } + } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) { + e.printStackTrace(); + } + } } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java index d249cf906..40436c2cb 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java @@ -43,7 +43,7 @@ public boolean addAnnotations(AnnotationData data, AnnotationAdderHelper helper, throw new IllegalArgumentException("The entity annotation type " + uimaType + " does not exist in the type system."); try { // The sha check is supposed to compare the document text on which the annotation was made with the - // document text the current CAS has. If the differ, the annotations will most likely have + // document text the current CAS has. If they differ, the annotations will most likely have // offset discrepancies which is why they won't be added and a warning will be issued. 
final String shaFromAnnotation = (String) a.getPayload("sha"); boolean shaMatches = true; @@ -60,6 +60,7 @@ public boolean addAnnotations(AnnotationData data, AnnotationAdderHelper helper, if (a.getStart() >= 0) { final Annotation annotation = JCoReAnnotationTools.getAnnotationByClassName(jCas, uimaType); helper.setAnnotationOffsetsRelativeToDocument(annotation, a, configuration); + helper.setAnnotationPayloadsToFeatures(annotation, a); annotation.addToIndexes(); } else { log.trace("ExternalAnnotation for document {} has no entity offsets or offsets < 0, not adding anything to the CAS.", a.getDocumentId()); diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/AnnotationFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/AnnotationFormat.java index cb28d7d9f..46d652dcf 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/AnnotationFormat.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/AnnotationFormat.java @@ -4,4 +4,6 @@ public interface AnnotationFormat { T parse(String data); + + void withHeader(boolean withHeader); } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/DocumentClassAnnotationFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/DocumentClassAnnotationFormat.java index 6376e803d..48f03d136 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/DocumentClassAnnotationFormat.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/DocumentClassAnnotationFormat.java @@ -17,4 +17,9 @@ public ExternalDocumentClassAnnotation parse(String data) { String type = null; return new ExternalDocumentClassAnnotation(docId, documentClass, 
confidence, componentId); } + + @Override + public void withHeader(boolean withHeader) { + // does nothing + } } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormat.java index b35e4f26c..1a71edfcc 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormat.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormat.java @@ -3,6 +3,9 @@ import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation; public class SimpleTSVEntityAnnotationFormat implements AnnotationFormat { + private String[] header; + private boolean withHeader; + @Override public ExternalTextAnnotation parse(String data) { if (data == null || data.startsWith("#")) @@ -10,12 +13,28 @@ public ExternalTextAnnotation parse(String data) { final String[] record = data.split("\t"); if (record.length < 3) throw new IllegalArgumentException("Expected a 3 or 4-column format providing document ID, begin, end and UIMA type (optional if the default type is set to the AnnotationAdderAnnotator) for the annotation but got " + record.length + " columns: " + data); + if (withHeader && header == null) { + header = record; + return null; + } String docId = record[0]; int begin = Integer.parseInt(record[1]); int end = Integer.parseInt(record[2]); String type = null; if (record.length > 3) type = record[3]; - return new ExternalTextAnnotation(docId, begin, end, type); + ExternalTextAnnotation externalTextAnnotation = new ExternalTextAnnotation(docId, begin, end, type); + if (record.length > 4) { + if (header == null) + throw new IllegalStateException("There are columns exceeding the default 4-column format but no header 
was given to deliver their names."); + for (int i = 4; i < record.length; i++) + externalTextAnnotation.addPayload(header[i], record[i]); + } + return externalTextAnnotation; + } + + @Override + public void withHeader(boolean withHeader) { + this.withHeader = withHeader; } } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityWithDocumentTextShaAnnotationFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityWithDocumentTextShaAnnotationFormat.java index f46893595..9332a9d93 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityWithDocumentTextShaAnnotationFormat.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityWithDocumentTextShaAnnotationFormat.java @@ -21,4 +21,9 @@ public ExternalTextAnnotation parse(String data) { externalTextAnnotation.addPayload("sha", sha); return externalTextAnnotation; } + + @Override + public void withHeader(boolean withHeader) { + // does nothing + } } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/ExternalTextAnnotation.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/ExternalTextAnnotation.java index bd1408f47..7c1dd7c03 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/ExternalTextAnnotation.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/ExternalTextAnnotation.java @@ -1,5 +1,7 @@ package de.julielab.jcore.ae.annotationadder.annotationrepresentations; +import java.util.Collection; +import java.util.Collections; import java.util.HashMap; import java.util.Map; @@ -59,4 +61,8 @@ public void 
addPayload(String key, Object value) { public Object getPayload(String key) { return payload != null ? payload.get(key) : null; } + + public Collection getPayloadKeys() { + return payload != null ? payload.keySet() : Collections.emptySet(); + } } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java index 4e6ba0a88..c2a4cb586 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java @@ -12,6 +12,7 @@ import java.io.File; import java.io.IOException; import java.util.Map; +import java.util.Objects; import java.util.stream.Collectors; public class FileAnnotationSource implements AnnotationSource> { @@ -25,7 +26,7 @@ public FileAnnotationSource(AnnotationFormat format) { public void loadAnnotations(File annotationfile) { try (BufferedReader br = FileUtilities.getReaderFromFile(annotationfile)) { - entitiesByDocId = br.lines().map(format::parse).collect(Collectors.groupingBy(AnnotationData::getDocumentId, Collectors.toCollection(AnnotationList::new))); + entitiesByDocId = br.lines().map(format::parse).filter(Objects::nonNull).collect(Collectors.groupingBy(AnnotationData::getDocumentId, Collectors.toCollection(AnnotationList::new))); } catch (IOException e) { e.printStackTrace(); } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java index 6de11f4d3..411223e98 100644 --- 
a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java @@ -15,6 +15,7 @@ public class InMemoryFileTextAnnotationProvider implements AnnotationProvider { public static final String PARAM_ANNOTATION_FORMAT = "AnnotationFormatClass"; + public static final String PARAM_WITH_HEADER = "WithHeader"; private final static Logger log = LoggerFactory.getLogger(InMemoryFileTextAnnotationProvider.class); private AnnotationSource annotationSource; @@ -27,9 +28,11 @@ public AnnotationList getAnnotations(String id) { public void load(DataResource dataResource) throws ResourceInitializationException { final ConfigurationParameterSettings parameterSettings = dataResource.getMetaData().getConfigurationParameterSettings(); final String formatClassName = (String) Optional.ofNullable(parameterSettings.getParameterValue(PARAM_ANNOTATION_FORMAT)).orElse(SimpleTSVEntityAnnotationFormat.class.getCanonicalName()); + final boolean withHeader = (boolean) Optional.ofNullable(parameterSettings.getParameterValue(PARAM_WITH_HEADER)).orElse(false); AnnotationFormat format; try { format = (AnnotationFormat) Class.forName(formatClassName).getDeclaredConstructor().newInstance(); + format.withHeader(withHeader); } catch (NoSuchMethodException | InvocationTargetException | InstantiationException | IllegalAccessException | ClassNotFoundException e) { log.error("Could not instantiate class {}", formatClassName); throw new ResourceInitializationException(e); diff --git a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java index a7f76f786..6aad5e94a 100644 --- 
a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java +++ b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java @@ -47,7 +47,52 @@ public void testCharacterOffsets() throws Exception { assertThat(genes.get(1).getBegin()).isEqualTo(5); assertThat(genes.get(1).getEnd()).isEqualTo(10); - // Test doc2 (no gene annotations) + // Test doc2 (no gene annotations, there will be a warning on DEBUG level) + jCas.reset(); + jCas.setDocumentText("There are no gene mentions in here"); + Header h2 = new Header(jCas); + h2.setDocId("doc2"); + h2.addToIndexes(); + engine.process(jCas); + assertThat(JCasUtil.exists(jCas, Gene.class)).isFalse(); + + // Test doc3 (one gene annotation) + jCas.reset(); + jCas.setDocumentText("PRKAVI does not exist, I think. But this is just a test so it doesn't matter."); + Header h3 = new Header(jCas); + h3.setDocId("doc3"); + h3.addToIndexes(); + engine.process(jCas); + final Gene gene = JCasUtil.selectSingle(jCas, Gene.class); + assertThat(gene.getBegin()).isEqualTo(0); + assertThat(gene.getEnd()).isEqualTo(6); + } + + @Test + public void testPayload() throws Exception { + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); + final ExternalResourceDescription externalResourceDescription = ExternalResourceFactory.createExternalResourceDescription(InMemoryFileTextAnnotationProvider.class, new File("src/test/resources/geneannotations_character_offsets_payload.tsv"), InMemoryFileTextAnnotationProvider.PARAM_WITH_HEADER, true); + final AnalysisEngine engine = AnalysisEngineFactory.createEngine(AnnotationAdderAnnotator.class, AnnotationAdderAnnotator.KEY_ANNOTATION_SOURCE, externalResourceDescription); + // Test doc1 (two gene annotations) + jCas.setDocumentText("BRCA PRKII are the genes of 
this sentence."); + final Header h = new Header(jCas); + h.setDocId("doc1"); + h.addToIndexes(); + + engine.process(jCas); + + final List genes = new ArrayList<>(JCasUtil.select(jCas, Gene.class)); + assertThat(genes).hasSize(2); + + assertThat(genes.get(0).getBegin()).isEqualTo(0); + assertThat(genes.get(0).getEnd()).isEqualTo(4); + assertThat(genes.get(0).getSpecificType()).isEqualTo("protein"); + + assertThat(genes.get(1).getBegin()).isEqualTo(5); + assertThat(genes.get(1).getEnd()).isEqualTo(10); + assertThat(genes.get(1).getSpecificType()).isEqualTo("dna"); + + // Test doc2 (no gene annotations, there will be a warning on DEBUG level) jCas.reset(); jCas.setDocumentText("There are no gene mentions in here"); Header h2 = new Header(jCas); @@ -66,6 +111,7 @@ public void testCharacterOffsets() throws Exception { final Gene gene = JCasUtil.selectSingle(jCas, Gene.class); assertThat(gene.getBegin()).isEqualTo(0); assertThat(gene.getEnd()).isEqualTo(6); + assertThat(gene.getComponentId()).isEqualTo("GoldData"); } @Test diff --git a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelperTest.java b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelperTest.java new file mode 100644 index 000000000..bcb96ec08 --- /dev/null +++ b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelperTest.java @@ -0,0 +1,24 @@ +package de.julielab.jcore.ae.annotationadder; + +import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation; +import de.julielab.jcore.types.Gene; +import org.apache.uima.UIMAException; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class AnnotationAdderHelperTest { + + @Test + void setAnnotationPayloadsToFeatures() throws UIMAException { + JCas jCas = 
JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types"); + Gene gene = new Gene(jCas); + ExternalTextAnnotation extAnnotation = new ExternalTextAnnotation("1", 0, 1, "dummy"); + extAnnotation.addPayload("specificType", "protein"); + AnnotationAdderHelper helper = new AnnotationAdderHelper(); + helper.setAnnotationPayloadsToFeatures(gene, extAnnotation); + assertEquals("protein", gene.getSpecificType()); + } +} \ No newline at end of file diff --git a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormatTest.java b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormatTest.java new file mode 100644 index 000000000..eb646e0e2 --- /dev/null +++ b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormatTest.java @@ -0,0 +1,27 @@ +package de.julielab.jcore.ae.annotationadder.annotationformat; + +import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; + +class SimpleTSVEntityAnnotationFormatTest { + + @Test + void parse() { + SimpleTSVEntityAnnotationFormat format = new SimpleTSVEntityAnnotationFormat(); + format.withHeader(true); + // should be ignored + assertNull(format.parse("# comment")); + // should be stored as header but not return something + assertNull(format.parse("docId\tbegin\tend\ttype\tspecificType\tcomponentId")); + ExternalTextAnnotation extAnnotation = format.parse("123\t0\t5\tde.julielab.jcore.types.Gene\tprotein\tGoldAnnotation"); + assertEquals("123", extAnnotation.getDocumentId()); + assertEquals(0, extAnnotation.getStart()); + assertEquals(5, extAnnotation.getEnd()); + assertEquals("de.julielab.jcore.types.Gene", 
extAnnotation.getUimaType()); + assertEquals("protein", extAnnotation.getPayload("specificType")); + assertEquals("GoldAnnotation", extAnnotation.getPayload("componentId")); + } +} \ No newline at end of file diff --git a/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets_payload.tsv b/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets_payload.tsv new file mode 100644 index 000000000..7606678d6 --- /dev/null +++ b/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets_payload.tsv @@ -0,0 +1,4 @@ +docId begin end uimaType specificType componentId +doc1 0 4 de.julielab.jcore.types.Gene protein GoldData +doc1 5 10 de.julielab.jcore.types.Gene dna GoldData +doc3 0 6 de.julielab.jcore.types.Gene gene GoldData \ No newline at end of file From 166e9d026c924fbc03ea091993a488b12b71f52a Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 20 Sep 2021 13:46:06 +0200 Subject: [PATCH 090/269] Add the `WithHeader` parameter to the descriptor. --- .../ae/annotationadder/desc/jcore-annotation-adder-ae.xml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml b/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml index 2a72b89f9..e30b428f9 100644 --- a/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml +++ b/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml @@ -79,6 +79,13 @@ false false + + WithHeader + Indicates whether the the input TSV file has a header line. + Boolean + false + false + From e58543db2db1e848a4b5924efeaaf431a9d95df5 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 23 Sep 2021 14:07:25 +0200 Subject: [PATCH 091/269] Allow the specification of column names in the descriptor. 
Thus, it is not required to add the source file to provide the TSV header. --- .../AnnotationAdderAnnotator.java | 2 + .../annotationformat/AnnotationFormat.java | 4 +- .../DocumentClassAnnotationFormat.java | 9 +++- .../SimpleTSVEntityAnnotationFormat.java | 17 ++++--- ...tyWithDocumentTextShaAnnotationFormat.java | 9 +++- .../InMemoryFileTextAnnotationProvider.java | 11 ++-- .../desc/jcore-annotation-adder-ae.xml | 9 +++- .../AnnotationAdderAnnotatorTest.java | 51 ++++++++++++++++++- .../SimpleTSVEntityAnnotationFormatTest.java | 2 +- .../geneannotations_character_offsets.tsv | 6 +-- 10 files changed, 99 insertions(+), 21 deletions(-) diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java index b31fc7d05..802206a63 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java @@ -39,6 +39,7 @@ public enum OffsetMode {CHARACTER, TOKEN} @ConfigurationParameter(name = PARAM_PREVENT_PROCESSED_MARK, mandatory = false, description = "This setting is only in effect if an input format is used that contains document text SHA256 digests while also writing the annotation results into a JeDIS database. If then a CAS document text, to which annotations should be added, does not match the digest given by an annotation, this CAS will not marked as being finished processing by DBCheckpointAE that may follow in the pipeline. The idea is that the mismatched documents require a reprocessing of the original annotation creation algorithm because their text has been changed relative to the annotation on file. 
By not setting the document as being finished processed, it is straightforward to process only those documents again that failed to add one or multiple annotations.") private boolean preventProcessedOnDigestMismatch; + private List annotationAdders = Arrays.asList(new TextAnnotationListAdder(), new DocumentClassAnnotationAdder()); /** @@ -49,6 +50,7 @@ public enum OffsetMode {CHARACTER, TOKEN} public void initialize(final UimaContext aContext) throws ResourceInitializationException { offsetMode = OffsetMode.valueOf(Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_OFFSET_MODE)).orElse(OffsetMode.CHARACTER.name())); defaultUimaType = (String) aContext.getConfigParameterValue(PARAM_DEFAULT_UIMA_TYPE); + preventProcessedOnDigestMismatch = Optional.ofNullable((Boolean) aContext.getConfigParameterValue(PARAM_PREVENT_PROCESSED_MARK)).orElse(false); try { annotationProvider = (AnnotationProvider) aContext.getResourceObject(KEY_ANNOTATION_SOURCE); } catch (ResourceAccessException e) { diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/AnnotationFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/AnnotationFormat.java index 46d652dcf..a0c31a52f 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/AnnotationFormat.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/AnnotationFormat.java @@ -5,5 +5,7 @@ public interface AnnotationFormat { T parse(String data); - void withHeader(boolean withHeader); + void hasHeader(boolean withHeader); + + void setColumnNames(String[] header); } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/DocumentClassAnnotationFormat.java 
b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/DocumentClassAnnotationFormat.java index 48f03d136..bc24816e3 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/DocumentClassAnnotationFormat.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/DocumentClassAnnotationFormat.java @@ -19,7 +19,12 @@ public ExternalDocumentClassAnnotation parse(String data) { } @Override - public void withHeader(boolean withHeader) { - // does nothing + public void hasHeader(boolean withHeader) { + // does nothing right now + } + + @Override + public void setColumnNames(String[] header) { + // does nothing right now } } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormat.java index 1a71edfcc..bee28da11 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormat.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormat.java @@ -8,7 +8,7 @@ public class SimpleTSVEntityAnnotationFormat implements AnnotationFormat 4) { - if (header == null) - throw new IllegalStateException("There are columns exceeding the default 4-column format but no header was given to deliver their names."); - for (int i = 4; i < record.length; i++) - externalTextAnnotation.addPayload(header[i], record[i]); + if (header != null) { + for (int i = 4; i < record.length; i++) + externalTextAnnotation.addPayload(header[i], record[i]); + } } return externalTextAnnotation; } @Override - public void withHeader(boolean withHeader) { + public void hasHeader(boolean withHeader) { this.withHeader = 
withHeader; } + + @Override + public void setColumnNames(String[] header) { + this.header = header; + } } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityWithDocumentTextShaAnnotationFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityWithDocumentTextShaAnnotationFormat.java index 9332a9d93..0c1c10824 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityWithDocumentTextShaAnnotationFormat.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityWithDocumentTextShaAnnotationFormat.java @@ -23,7 +23,12 @@ public ExternalTextAnnotation parse(String data) { } @Override - public void withHeader(boolean withHeader) { - // does nothing + public void hasHeader(boolean withHeader) { + // does nothing right now + } + + @Override + public void setColumnNames(String[] header) { + // does nothing right now } } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java index 411223e98..1f6914340 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java @@ -15,9 +15,11 @@ public class InMemoryFileTextAnnotationProvider implements AnnotationProvider { public static final String PARAM_ANNOTATION_FORMAT = "AnnotationFormatClass"; - public static final String PARAM_WITH_HEADER = "WithHeader"; + public static final String PARAM_INPUT_HAS_HEADER = "InputHasHeader"; 
+ public static final String PARAM_COLUMN_NAMES = "ColumnNames"; private final static Logger log = LoggerFactory.getLogger(InMemoryFileTextAnnotationProvider.class); private AnnotationSource annotationSource; + private AnnotationFormat format; @Override public AnnotationList getAnnotations(String id) { @@ -28,11 +30,12 @@ public AnnotationList getAnnotations(String id) { public void load(DataResource dataResource) throws ResourceInitializationException { final ConfigurationParameterSettings parameterSettings = dataResource.getMetaData().getConfigurationParameterSettings(); final String formatClassName = (String) Optional.ofNullable(parameterSettings.getParameterValue(PARAM_ANNOTATION_FORMAT)).orElse(SimpleTSVEntityAnnotationFormat.class.getCanonicalName()); - final boolean withHeader = (boolean) Optional.ofNullable(parameterSettings.getParameterValue(PARAM_WITH_HEADER)).orElse(false); - AnnotationFormat format; + final boolean hasHeader = (boolean) Optional.ofNullable(parameterSettings.getParameterValue(PARAM_INPUT_HAS_HEADER)).orElse(false); + final String[] columnNames = (String[])parameterSettings.getParameterValue(PARAM_COLUMN_NAMES); try { format = (AnnotationFormat) Class.forName(formatClassName).getDeclaredConstructor().newInstance(); - format.withHeader(withHeader); + format.hasHeader(hasHeader); + format.setColumnNames(columnNames); } catch (NoSuchMethodException | InvocationTargetException | InstantiationException | IllegalAccessException | ClassNotFoundException e) { log.error("Could not instantiate class {}", formatClassName); throw new ResourceInitializationException(e); diff --git a/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml b/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml index e30b428f9..71e138a6c 100644 --- 
a/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml +++ b/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml @@ -80,12 +80,19 @@ false - WithHeader + InputHasHeader Indicates whether the the input TSV file has a header line. Boolean false false + + ColumnNames + For column formats without a header. Required when the columns should be mapped to annotation type features. Then, the headers but correspond to the feature names and are case sensitive. When specified, the number of elements for this parameter must equal the number of columns in the input file. Then, the i-th parameter value will be set as the name of the i-th column. + String + true + false + diff --git a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java index 6aad5e94a..83f2aa54d 100644 --- a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java +++ b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java @@ -71,7 +71,7 @@ public void testCharacterOffsets() throws Exception { @Test public void testPayload() throws Exception { final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); - final ExternalResourceDescription externalResourceDescription = ExternalResourceFactory.createExternalResourceDescription(InMemoryFileTextAnnotationProvider.class, new File("src/test/resources/geneannotations_character_offsets_payload.tsv"), InMemoryFileTextAnnotationProvider.PARAM_WITH_HEADER, true); + final ExternalResourceDescription externalResourceDescription = 
ExternalResourceFactory.createExternalResourceDescription(InMemoryFileTextAnnotationProvider.class, new File("src/test/resources/geneannotations_character_offsets_payload.tsv"), InMemoryFileTextAnnotationProvider.PARAM_INPUT_HAS_HEADER, true); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(AnnotationAdderAnnotator.class, AnnotationAdderAnnotator.KEY_ANNOTATION_SOURCE, externalResourceDescription); // Test doc1 (two gene annotations) jCas.setDocumentText("BRCA PRKII are the genes of this sentence."); @@ -114,6 +114,55 @@ public void testPayload() throws Exception { assertThat(gene.getComponentId()).isEqualTo("GoldData"); } + @Test + public void testHeaderParameter() throws Exception { + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); + final ExternalResourceDescription externalResourceDescription = ExternalResourceFactory.createExternalResourceDescription(InMemoryFileTextAnnotationProvider.class, new File("src/test/resources/geneannotations_character_offsets.tsv"), InMemoryFileTextAnnotationProvider.PARAM_COLUMN_NAMES, new String[]{"docId", "begin", "end", "uimaType", "specificType", "componentId"}); + final AnalysisEngine engine = AnalysisEngineFactory.createEngine(AnnotationAdderAnnotator.class, AnnotationAdderAnnotator.KEY_ANNOTATION_SOURCE, externalResourceDescription); + // Test doc1 (two gene annotations) + jCas.setDocumentText("BRCA PRKII are the genes of this sentence."); + final Header h = new Header(jCas); + h.setDocId("doc1"); + h.addToIndexes(); + + engine.process(jCas); + + final List genes = new ArrayList<>(JCasUtil.select(jCas, Gene.class)); + assertThat(genes).hasSize(2); + + assertThat(genes.get(0).getBegin()).isEqualTo(0); + assertThat(genes.get(0).getEnd()).isEqualTo(4); + assertThat(genes.get(0).getSpecificType()).isEqualTo("additionalColumn1"); + 
assertThat(genes.get(0).getComponentId()).isEqualTo("additionalColumn2"); + + assertThat(genes.get(1).getBegin()).isEqualTo(5); + assertThat(genes.get(1).getEnd()).isEqualTo(10); + assertThat(genes.get(1).getSpecificType()).isEqualTo("additionalColumn1"); + assertThat(genes.get(1).getComponentId()).isEqualTo("additionalColumn2"); + + // Test doc2 (no gene annotations, there will be a warning on DEBUG level) + jCas.reset(); + jCas.setDocumentText("There are no gene mentions in here"); + Header h2 = new Header(jCas); + h2.setDocId("doc2"); + h2.addToIndexes(); + engine.process(jCas); + assertThat(JCasUtil.exists(jCas, Gene.class)).isFalse(); + + // Test doc3 (one gene annotation) + jCas.reset(); + jCas.setDocumentText("PRKAVI does not exist, I think. But this is just a test so it doesn't matter."); + Header h3 = new Header(jCas); + h3.setDocId("doc3"); + h3.addToIndexes(); + engine.process(jCas); + final Gene gene = JCasUtil.selectSingle(jCas, Gene.class); + assertThat(gene.getBegin()).isEqualTo(0); + assertThat(gene.getEnd()).isEqualTo(6); + assertThat(gene.getSpecificType()).isEqualTo("additionalColumn1"); + assertThat(gene.getComponentId()).isEqualTo("additionalColumn2"); + } + @Test public void testTokenOffsets() throws Exception { final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); diff --git a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormatTest.java b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormatTest.java index eb646e0e2..848526c03 100644 --- a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormatTest.java +++ 
b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormatTest.java @@ -11,7 +11,7 @@ class SimpleTSVEntityAnnotationFormatTest { @Test void parse() { SimpleTSVEntityAnnotationFormat format = new SimpleTSVEntityAnnotationFormat(); - format.withHeader(true); + format.hasHeader(true); // should be ignored assertNull(format.parse("# comment")); // should be stored as header but not return something diff --git a/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets.tsv b/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets.tsv index a3b4799ab..33babd2dc 100644 --- a/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets.tsv +++ b/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets.tsv @@ -1,3 +1,3 @@ -doc1 0 4 de.julielab.jcore.types.Gene -doc1 5 10 de.julielab.jcore.types.Gene -doc3 0 6 de.julielab.jcore.types.Gene \ No newline at end of file +doc1 0 4 de.julielab.jcore.types.Gene additionalColumn1 additionalColumn2 +doc1 5 10 de.julielab.jcore.types.Gene additionalColumn1 additionalColumn2 +doc3 0 6 de.julielab.jcore.types.Gene additionalColumn1 additionalColumn2 \ No newline at end of file From 0aa2bd92c1d5a16f6422998bdb5ea7fc0d936ea6 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 23 Sep 2021 18:49:28 +0200 Subject: [PATCH 092/269] Add logger messages in error cases. 
--- .../acronymtagger/main/AcronymAnnotator.java | 19 +++++++------------ .../consumer/acronyms/AcronymWriter.java | 8 +++++++- .../FileAnnotationSource.java | 10 +++++----- .../desc/jcore-annotation-adder-ae.xml | 2 +- 4 files changed, 20 insertions(+), 19 deletions(-) diff --git a/jcore-acronym-ae/src/main/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java b/jcore-acronym-ae/src/main/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java index ad7877e80..3bb8fff9e 100644 --- a/jcore-acronym-ae/src/main/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java +++ b/jcore-acronym-ae/src/main/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java @@ -158,12 +158,9 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept LOGGER.info(" done"); - } catch (AnnotatorContextException e) { - throw new ResourceInitializationException(); - } catch (AnnotatorConfigurationException e) { - throw new ResourceInitializationException(); - } catch (ResourceProcessException e) { - throw new ResourceInitializationException(); + } catch (AnnotatorContextException | AnnotatorConfigurationException| ResourceProcessException e) { + LOGGER.error("Could not initialize acronym annotator", e); + throw new ResourceInitializationException(e); } } @@ -242,14 +239,16 @@ public void process(JCas aJCas) { ConsistencyAnnotator ca = new ConsistencyAnnotator(); ca.consistencyAnnotate(aJCas); } - + if (postprocessing) { Postprocessing.doPostprocessing(aJCas); } - + } catch (StringIndexOutOfBoundsException e) { LOGGER.error("typical Error in AcronymAnnotator.process() : StringIndexOutOfBounds"); + } catch (Throwable t) { + LOGGER.error("Acronym resolution error: ", t); } } @@ -557,10 +556,6 @@ private int findFullformStart(String potFF, String acro) { /** * looks for the 'best' position in the sentence to start looking for a fullform * - * @param sentence - * @param acroStart - * @param maxTokens - * @return */ private int 
getPotFullformStart(String sentence, int acroStart, int acroLength) { diff --git a/jcore-acronym-writer/src/main/java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java b/jcore-acronym-writer/src/main/java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java index b1aabca29..a406021b9 100644 --- a/jcore-acronym-writer/src/main/java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java +++ b/jcore-acronym-writer/src/main/java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java @@ -15,6 +15,8 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; @@ -24,7 +26,7 @@ @ResourceMetaData(name = "JCoRe Acronym Writer", description = "Writes acronym annotation to a text file.") public class AcronymWriter extends JCasAnnotator_ImplBase { - +private final static Logger log = LoggerFactory.getLogger(AcronymWriter.class); public static final String PARAM_OUTPUTFILE = "OutputFile"; @ConfigurationParameter(name = PARAM_OUTPUTFILE) @@ -38,6 +40,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept try { os = FileUtilities.getOutputStreamToFile(new File(outputFile)); } catch (IOException e) { + log.error("Could not initialize acronym writer", e); throw new ResourceInitializationException(e); } } @@ -70,7 +73,10 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { ++abbrCount; } } catch (CASRuntimeException | IOException e) { + log.error("Exception while writing acronyms", e); throw new AnalysisEngineProcessException(e); + } catch (Throwable t) { + log.error("Exception while writing acronyms", t); } } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java 
b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java index c2a4cb586..69958d586 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java @@ -1,6 +1,6 @@ package de.julielab.jcore.ae.annotationadder.annotationsources; -import de.julielab.java.utilities.FileUtilities; +import de.julielab.java.utilities.UriUtilities; import de.julielab.jcore.ae.annotationadder.annotationformat.AnnotationFormat; import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationData; import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationList; @@ -9,8 +9,8 @@ import org.slf4j.LoggerFactory; import java.io.BufferedReader; -import java.io.File; import java.io.IOException; +import java.net.URI; import java.util.Map; import java.util.Objects; import java.util.stream.Collectors; @@ -24,8 +24,8 @@ public FileAnnotationSource(AnnotationFormat format) { this.format = format; } - public void loadAnnotations(File annotationfile) { - try (BufferedReader br = FileUtilities.getReaderFromFile(annotationfile)) { + private void loadAnnotations(URI annotationUri) { + try (BufferedReader br = UriUtilities.getReaderFromUri(annotationUri)) { entitiesByDocId = br.lines().map(format::parse).filter(Objects::nonNull).collect(Collectors.groupingBy(AnnotationData::getDocumentId, Collectors.toCollection(AnnotationList::new))); } catch (IOException e) { e.printStackTrace(); @@ -35,7 +35,7 @@ public void loadAnnotations(File annotationfile) { @Override public void initialize(DataResource dataResource) { log.info("Loading entity annotations from {}", dataResource.getUri()); - loadAnnotations(new File(dataResource.getUri())); + loadAnnotations(dataResource.getUri()); } @Override diff --git 
a/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml b/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml index 71e138a6c..20ea1f3d1 100644 --- a/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml +++ b/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml @@ -88,7 +88,7 @@ ColumnNames - For column formats without a header. Required when the columns should be mapped to annotation type features. Then, the headers but correspond to the feature names and are case sensitive. When specified, the number of elements for this parameter must equal the number of columns in the input file. Then, the i-th parameter value will be set as the name of the i-th column. + For column formats without a header. Required when the columns should be mapped to annotation type features. Then, the headers must correspond to the feature names and are case sensitive. When specified, the number of elements for this parameter must equal the number of columns in the input file. Then, the i-th parameter value will be set as the name of the i-th column. String true false From 88f20530cf5a152c8b4171cf4814b923a36078cf Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 24 Sep 2021 11:02:52 +0200 Subject: [PATCH 093/269] Add more logging in error cases. 
--- .../julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java | 2 +- .../java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/jcore-acronym-ae/src/main/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java b/jcore-acronym-ae/src/main/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java index 3bb8fff9e..a8e588af9 100644 --- a/jcore-acronym-ae/src/main/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java +++ b/jcore-acronym-ae/src/main/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java @@ -234,7 +234,7 @@ public void process(JCas aJCas) { annotate(sentenceText, aJCas, sentence.getBegin()); } - // if extra annotation is whished, do so :-) + // if extra annotation is wished, do so :-) if (consistencyAnno) { ConsistencyAnnotator ca = new ConsistencyAnnotator(); ca.consistencyAnnotate(aJCas); diff --git a/jcore-acronym-writer/src/main/java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java b/jcore-acronym-writer/src/main/java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java index a406021b9..ddc1ba416 100644 --- a/jcore-acronym-writer/src/main/java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java +++ b/jcore-acronym-writer/src/main/java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java @@ -43,10 +43,12 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept log.error("Could not initialize acronym writer", e); throw new ResourceInitializationException(e); } + log.trace("AcronymWriter successfully initialized."); } @Override public void process(JCas jcas) throws AnalysisEngineProcessException { + log.trace("Processing with AcronymWriter"); try { String pubmedId = JCoReTools.getDocId(jcas); FSIterator it = jcas.getAnnotationIndex(Abbreviation.type).iterator(); From e86a2ad1f6ce4d4efa20a765fad34a5be00314bc Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 24 Sep 2021 12:14:13 +0200 Subject: 
[PATCH 094/269] Make removal of file name extension for docId optional in FileReader. --- .../AnnotationAdderAnnotator.java | 38 +++++++++++-------- .../AnnotationAdderAnnotatorTest.java | 19 ++++++++++ .../jcore/reader/file/main/FileReader.java | 11 ++++-- 3 files changed, 49 insertions(+), 19 deletions(-) diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java index 802206a63..00245937d 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java @@ -6,6 +6,7 @@ import de.julielab.jcore.utility.JCoReTools; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.fit.descriptor.ResourceMetaData; @@ -67,23 +68,28 @@ public void initialize(final UimaContext aContext) throws ResourceInitialization * is where the actual work happens. */ @Override - public void process(final JCas aJCas) { - final String docId = JCoReTools.getDocId(aJCas); - if (docId == null) - log.error("The current document does not have a header. 
Cannot add external annotations."); - final AnnotationData annotations = annotationProvider.getAnnotations(docId); - final AnnotationAdderHelper helper = new AnnotationAdderHelper(); - if (annotations != null) { - boolean success = false; - int adderNum = 0; - // We are now iterating through the available annotation adders for the one that handles the obtained annotation data - while (adderNum < annotationAdders.size() && !(success = annotationAdders.get(adderNum).addAnnotations(annotations, helper, adderConfiguration, aJCas, preventProcessedOnDigestMismatch))) { - ++adderNum; + public void process(final JCas aJCas) throws AnalysisEngineProcessException { + try { + final String docId = JCoReTools.getDocId(aJCas); + if (docId == null) + log.error("The current document does not have a header. Cannot add external annotations."); + final AnnotationData annotations = annotationProvider.getAnnotations(docId); + final AnnotationAdderHelper helper = new AnnotationAdderHelper(); + if (annotations != null) { + boolean success = false; + int adderNum = 0; + // We are now iterating through the available annotation adders for the one that handles the obtained annotation data + while (adderNum < annotationAdders.size() && !(success = annotationAdders.get(adderNum).addAnnotations(annotations, helper, adderConfiguration, aJCas, preventProcessedOnDigestMismatch))) { + ++adderNum; + } + if (!success) + throw new IllegalArgumentException("There was no annotation adder to handle the annotation data of class " + annotations.getClass().getCanonicalName()); + } else { + log.debug("No external annotations were delivered for document ID {}", docId); } - if (!success) - throw new IllegalArgumentException("There was no annotation adder to handle the annotation data of class " + annotations.getClass().getCanonicalName()); - } else { - log.debug("No external annotations were delivered for document ID {}", docId); + } catch (Throwable t) { + log.error("Could not add annotations due to 
exception.", t); + throw new AnalysisEngineProcessException(t); } } diff --git a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java index 83f2aa54d..48ee699e7 100644 --- a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java +++ b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java @@ -68,6 +68,25 @@ public void testCharacterOffsets() throws Exception { assertThat(gene.getEnd()).isEqualTo(6); } + @Test + public void testCharacterOffsets2() throws Exception { + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); + final ExternalResourceDescription externalResourceDescription = ExternalResourceFactory.createExternalResourceDescription(InMemoryFileTextAnnotationProvider.class, new File("src/test/resources/test.txt")); + final AnalysisEngine engine = AnalysisEngineFactory.createEngine(AnnotationAdderAnnotator.class, AnnotationAdderAnnotator.KEY_ANNOTATION_SOURCE, externalResourceDescription); + // Test doc1 (two gene annotations) + jCas.setDocumentText("BRCA PRKII are the genes of this sentence."); + final Header h = new Header(jCas); + h.setDocId("10022127.txt"); + h.addToIndexes(); + + engine.process(jCas); + + final List genes = new ArrayList<>(JCasUtil.select(jCas, Gene.class)); + for (Gene g : genes) { + System.out.println(g); + } + } + @Test public void testPayload() throws Exception { final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); diff --git 
a/jcore-file-reader/src/main/java/de/julielab/jcore/reader/file/main/FileReader.java b/jcore-file-reader/src/main/java/de/julielab/jcore/reader/file/main/FileReader.java index dee16f1d7..3ea69e29e 100644 --- a/jcore-file-reader/src/main/java/de/julielab/jcore/reader/file/main/FileReader.java +++ b/jcore-file-reader/src/main/java/de/julielab/jcore/reader/file/main/FileReader.java @@ -86,6 +86,8 @@ public class FileReader extends CollectionReader_ImplBase { */ public static final String ORIG_FILES_EXT = "OriginalFileExt"; + public static final String REMOVE_FILE_NAME_EXTENSION_FOR_DOC_ID = "RemoveFileNameExtensionForDocId"; + private ArrayList files; private int fileIndex; @@ -110,6 +112,8 @@ public class FileReader extends CollectionReader_ImplBase { private File origFolder; @ConfigurationParameter(name = ORIG_FILES_EXT, mandatory = false) private String origFileExt; + @ConfigurationParameter(name = REMOVE_FILE_NAME_EXTENSION_FOR_DOC_ID, mandatory = false, defaultValue = "true") + private boolean removeFileNameExtensionForDocId; /** * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize() @@ -149,6 +153,7 @@ public void initialize() throws ResourceInitializationException { } else { useFilenameAsDocId = filenameAsDocId; } + removeFileNameExtensionForDocId = Optional.ofNullable((Boolean) getConfigParameterValue(REMOVE_FILE_NAME_EXTENSION_FOR_DOC_ID)).orElse(true); allowedExtensionsArray = (String[]) getConfigParameterValue(ALLOWED_FILE_EXTENSIONS); final Set allowedExtensions = new HashSet<>(); @@ -225,7 +230,7 @@ public void getNext(CAS aCAS) throws IOException, CollectionException { String origText = null; if (origFolder != null) { - File origFile = new File(origFolder, getFileName(file) + "." + origFileExt); + File origFile = new File(origFolder, getFileName(file, true) + "." 
+ origFileExt); origText = IOStreamUtilities.getStringFromInputStream(FileUtilities.getInputStreamFromFile(origFile)); } @@ -329,7 +334,7 @@ public void getNext(CAS aCAS) throws IOException, CollectionException { if (useFilenameAsDocId) { - String filename = getFileName(file); + String filename = getFileName(file, removeFileNameExtensionForDocId); Header header = new Header(jcas); @@ -415,7 +420,7 @@ private void createFileListByType(File inputDirectory, final Set allowed .forEach(files::add); } - private String getFileName(File fi) { + private String getFileName(File fi, boolean removeExtension) { String filename = fi.getName(); int extDotIndex = filename.lastIndexOf('.'); if (extDotIndex > 0) { From 65317026804e73a80ebb6180d2aff6a168aa9e7f Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 27 Sep 2021 09:44:12 +0200 Subject: [PATCH 095/269] Add the RemoveFileNameExtensionForDocId parameter to the FileReader descriptor. --- .../jcore/reader/file/desc/jcore-file-reader.xml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/jcore-file-reader/src/main/resources/de/julielab/jcore/reader/file/desc/jcore-file-reader.xml b/jcore-file-reader/src/main/resources/de/julielab/jcore/reader/file/desc/jcore-file-reader.xml index f5b30ff00..bda1bb0e5 100644 --- a/jcore-file-reader/src/main/resources/de/julielab/jcore/reader/file/desc/jcore-file-reader.xml +++ b/jcore-file-reader/src/main/resources/de/julielab/jcore/reader/file/desc/jcore-file-reader.xml @@ -74,6 +74,12 @@ false false + + RemoveFileNameExtensionForDocId + Boolean + false + false + @@ -118,11 +124,17 @@ txt + + RemoveFileNameExtensionForDocId + + true + + - + From 52c2aa3e01cc9501b7259e8d24515ff50676aa45 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 27 Sep 2021 10:57:11 +0200 Subject: [PATCH 096/269] Set XMIWriter log message "Wrote file ..." to debug level. 
--- .../java/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-xmi-writer/src/main/java/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.java b/jcore-xmi-writer/src/main/java/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.java index 6a33348dd..4762f809e 100644 --- a/jcore-xmi-writer/src/main/java/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.java +++ b/jcore-xmi-writer/src/main/java/de/julielab/jcore/consumer/xmi/CasToXmiConsumer.java @@ -295,7 +295,7 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { String fileName = outFileName.toString(); try { writeXmi(jcas.getCas(), fileName); - LOGGER.info(" Wrote file " + fileName); + LOGGER.debug(" Wrote file " + fileName); } catch (IOException e) { try { throw new ResourceProcessException(e); From c92516f7258dc607a45fe0cf7069c7c9f5a8fa48 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 27 Sep 2021 10:59:08 +0200 Subject: [PATCH 097/269] Make actual use of the "removeExtension" parameter in FileReader. 
--- .../jcore/reader/file/main/FileReader.java | 248 +++++++++--------- 1 file changed, 129 insertions(+), 119 deletions(-) diff --git a/jcore-file-reader/src/main/java/de/julielab/jcore/reader/file/main/FileReader.java b/jcore-file-reader/src/main/java/de/julielab/jcore/reader/file/main/FileReader.java index 3ea69e29e..564ec30f0 100644 --- a/jcore-file-reader/src/main/java/de/julielab/jcore/reader/file/main/FileReader.java +++ b/jcore-file-reader/src/main/java/de/julielab/jcore/reader/file/main/FileReader.java @@ -25,7 +25,6 @@ import de.julielab.jcore.types.pubmed.Header; import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException; import org.apache.uima.cas.CAS; -import org.apache.uima.cas.CASException; import org.apache.uima.collection.CollectionException; import org.apache.uima.collection.CollectionReader_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; @@ -33,6 +32,8 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Progress; import org.apache.uima.util.ProgressImpl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.*; import java.nio.file.FileVisitOption; @@ -44,7 +45,6 @@ import java.util.stream.Stream; public class FileReader extends CollectionReader_ImplBase { - /** * */ @@ -85,9 +85,8 @@ public class FileReader extends CollectionReader_ImplBase { * */ public static final String ORIG_FILES_EXT = "OriginalFileExt"; - public static final String REMOVE_FILE_NAME_EXTENSION_FOR_DOC_ID = "RemoveFileNameExtensionForDocId"; - + private final static Logger log = LoggerFactory.getLogger(FileReader.class); private ArrayList files; private int fileIndex; @@ -213,138 +212,143 @@ public boolean hasNext() { * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS) */ @Override - public void getNext(CAS aCAS) throws IOException, CollectionException { - JCas jcas; + public void getNext(CAS aCAS) throws CollectionException { + 
log.trace("Reading next file, if present"); + File file = null; try { - jcas = aCAS.getJCas(); - } catch (CASException e) { - throw new CollectionException(e); - } + JCas jcas = aCAS.getJCas(); - // open input stream to file - File file = files.get(fileIndex++); + // open input stream to file + file = files.get(fileIndex++); + log.trace("Got next file: {}", file); - String text = IOStreamUtilities.getStringFromInputStream(FileUtilities.getInputStreamFromFile(file)); + String text = IOStreamUtilities.getStringFromInputStream(FileUtilities.getInputStreamFromFile(file)); - Pattern nws = Pattern.compile("[^\\s]+", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); + Pattern nws = Pattern.compile("[^\\s]+", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); - String origText = null; - if (origFolder != null) { - File origFile = new File(origFolder, getFileName(file, true) + "." + origFileExt); - origText = IOStreamUtilities.getStringFromInputStream(FileUtilities.getInputStreamFromFile(origFile)); - } + String origText = null; + if (origFolder != null) { + File origFile = new File(origFolder, getFileName(file, true) + "." 
+ origFileExt); + origText = IOStreamUtilities.getStringFromInputStream(FileUtilities.getInputStreamFromFile(origFile)); + } - // sentence per line mode - if (sentencePerLine) { - BufferedReader rdr = new BufferedReader(new StringReader(text)); - List lines = new ArrayList(); - List start = new ArrayList(); - List end = new ArrayList(); - Integer tmp = 0; - String line; - while ((line = rdr.readLine()) != null) { - if (!Pattern.matches("\\s*", line)) { - lines.add(line); - start.add(tmp); - end.add(tmp + line.length()); + // sentence per line mode + if (sentencePerLine) { + log.trace("Reading input file as one sentence per line."); + BufferedReader rdr = new BufferedReader(new StringReader(text)); + List lines = new ArrayList(); + List start = new ArrayList(); + List end = new ArrayList(); + Integer tmp = 0; + String line; + while ((line = rdr.readLine()) != null) { + if (!Pattern.matches("\\s*", line)) { + lines.add(line); + start.add(tmp); + end.add(tmp + line.length()); + } + tmp += (line.length() + 1); } - tmp += (line.length() + 1); - } - rdr.close(); - - int index_tmp = 0; - Optional newLine; - for (Integer i = 0; i < lines.size(); i++) { - boolean addSent2index = true; - Sentence sent = new Sentence(jcas); - if (origText != null) { - newLine = Stream - .of(lines.get(i).split("\\s+")) - .map(x -> Pattern.quote(x)) - .reduce((x, y) -> x + "\\s*" + y); - Pattern p = Pattern.compile(newLine.get(), Pattern.UNICODE_CHARACTER_CLASS); - Matcher m = p.matcher(origText); - if (m.find(index_tmp)) { - int newStart = m.start(); - int newEnd = m.end(); - index_tmp = m.end() + 1; - sent.setBegin(newStart); - sent.setEnd(newEnd); + rdr.close(); + + int index_tmp = 0; + Optional newLine; + for (Integer i = 0; i < lines.size(); i++) { + boolean addSent2index = true; + Sentence sent = new Sentence(jcas); + if (origText != null) { + newLine = Stream + .of(lines.get(i).split("\\s+")) + .map(x -> Pattern.quote(x)) + .reduce((x, y) -> x + "\\s*" + y); + Pattern p = 
Pattern.compile(newLine.get(), Pattern.UNICODE_CHARACTER_CLASS); + Matcher m = p.matcher(origText); + if (m.find(index_tmp)) { + int newStart = m.start(); + int newEnd = m.end(); + index_tmp = m.end() + 1; + sent.setBegin(newStart); + sent.setEnd(newEnd); + } else { + addSent2index = false; + } } else { - addSent2index = false; + sent.setBegin(start.get(i)); + sent.setEnd(end.get(i)); + } + sent.setComponentId(this.getClass().getName() + " : Sentence per Line Mode"); + if (addSent2index) { + sent.addToIndexes(); } - } else { - sent.setBegin(start.get(i)); - sent.setEnd(end.get(i)); - } - sent.setComponentId(this.getClass().getName() + " : Sentence per Line Mode"); - if (addSent2index) { - sent.addToIndexes(); } } - } - //token by token mode - if (tokenByToken) { - List tokensList = new ArrayList<>(); - List tokStart = new ArrayList<>(); - List tokEnd = new ArrayList<>(); - - - Integer numberOfTokens = 0; - Matcher m = nws.matcher(text); - while (m.find()) { - String token = m.group(); - int start = m.start(); - int end = m.end(); - tokensList.add(token); - tokStart.add(start); - tokEnd.add(end); - numberOfTokens++; - } + //token by token mode + if (tokenByToken) { + log.trace("Reading input file as tokenized text with whitespace as token separator."); + List tokensList = new ArrayList<>(); + List tokStart = new ArrayList<>(); + List tokEnd = new ArrayList<>(); + + + Integer numberOfTokens = 0; + Matcher m = nws.matcher(text); + while (m.find()) { + String token = m.group(); + int start = m.start(); + int end = m.end(); + tokensList.add(token); + tokStart.add(start); + tokEnd.add(end); + numberOfTokens++; + } - int index_tmp = 0; - for (Integer j = 0; j < tokensList.size(); j++) { - boolean addToken2index = true; - Token token = new Token(jcas); - if (origText != null) { - String tok = tokensList.get(j); - int newStart = origText.indexOf(tok, index_tmp); - int newEnd = newStart + tok.length(); - index_tmp = newEnd; - token.setBegin(newStart); - token.setEnd(newEnd); 
- } else { - token.setBegin(tokStart.get(j)); - token.setEnd(tokEnd.get(j)); - } - token.setComponentId(this.getClass().getName() + " : Tokenized Mode"); - if (addToken2index) { - token.addToIndexes(); + int index_tmp = 0; + for (Integer j = 0; j < tokensList.size(); j++) { + boolean addToken2index = true; + Token token = new Token(jcas); + if (origText != null) { + String tok = tokensList.get(j); + int newStart = origText.indexOf(tok, index_tmp); + int newEnd = newStart + tok.length(); + index_tmp = newEnd; + token.setBegin(newStart); + token.setEnd(newEnd); + } else { + token.setBegin(tokStart.get(j)); + token.setEnd(tokEnd.get(j)); + } + token.setComponentId(this.getClass().getName() + " : Tokenized Mode"); + if (addToken2index) { + token.addToIndexes(); + } } } - } - // put document in CAS - if (origText != null) { - jcas.setDocumentText(origText); - } else { - jcas.setDocumentText(text); - } - - if (useFilenameAsDocId) { + // put document in CAS + if (origText != null) { + jcas.setDocumentText(origText); + } else { + jcas.setDocumentText(text); + } - String filename = getFileName(file, removeFileNameExtensionForDocId); + if (useFilenameAsDocId) { + String filename = getFileName(file, removeFileNameExtensionForDocId); + log.trace("Setting the file name {} as docId to a new Header annotation.", filename); - Header header = new Header(jcas); + Header header = new Header(jcas); - // set ID - header.setDocId(filename); + // set ID + header.setDocId(filename); - // set publication date - addDateForID(header, jcas, filename); + // set publication date + addDateForID(header, jcas, filename); - header.addToIndexes(); + header.addToIndexes(); + } + } catch (Throwable t) { + log.error("Could not read file {}", file, t); + throw new CollectionException(t); } } @@ -414,7 +418,11 @@ public Progress[] getProgress() { private void createFileListByType(File inputDirectory, final Set allowedExtensions) throws IOException { Files.walk(inputDirectory.toPath(), useSubDirs ? 
Integer.MAX_VALUE : 1, FileVisitOption.FOLLOW_LINKS) - .filter(p -> { if (allowedExtensions.isEmpty()) return true; for (String ext : allowedExtensions) if (p.toString().endsWith(ext)) return true; return false;}) + .filter(p -> { + if (allowedExtensions.isEmpty()) return true; + for (String ext : allowedExtensions) if (p.toString().endsWith(ext)) return true; + return false; + }) .map(Path::toFile) .filter(File::isFile) .forEach(files::add); @@ -422,9 +430,11 @@ private void createFileListByType(File inputDirectory, final Set allowed private String getFileName(File fi, boolean removeExtension) { String filename = fi.getName(); - int extDotIndex = filename.lastIndexOf('.'); - if (extDotIndex > 0) { - filename = filename.substring(0, extDotIndex); + if (removeExtension) { + int extDotIndex = filename.lastIndexOf('.'); + if (extDotIndex > 0) { + filename = filename.substring(0, extDotIndex); + } } if (fileNameSplitUnderscore) { int extUnderScoreIndex = filename.lastIndexOf('_'); From 4ac6bc49af2b361e7f2836f43282046aa4ce422c Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 27 Sep 2021 12:04:36 +0200 Subject: [PATCH 098/269] Ignore the UIMA type in column 4 if it is not found in the type system for AnnotationAdder. 
--- .../TextAnnotationListAdder.java | 8 ++++++-- .../AnnotationAdderAnnotatorTest.java | 19 ------------------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java index 40436c2cb..8ae202449 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java @@ -36,8 +36,12 @@ public boolean addAnnotations(AnnotationData data, AnnotationAdderHelper helper, String jCasDocTextSha = null; boolean shaMismatchWasReported = false; for (ExternalTextAnnotation a : annotationList) { - String uimaType = a.getUimaType() == null ? configuration.getDefaultUimaType() : a.getUimaType(); - if (uimaType == null) + String uimaType; + if (a.getUimaType() != null && jCas.getTypeSystem().getType(a.getUimaType()) != null) + uimaType = a.getUimaType(); + else if (configuration.getDefaultUimaType() != null) + uimaType = configuration.getDefaultUimaType(); + else throw new IllegalArgumentException("Missing annotation type: Neither the annotation of document " + a.getDocumentId() + " with offsets " + a.getStart() + "-" + a.getEnd() + " provides a type nor is the default type set."); if (jCas.getTypeSystem().getType(uimaType) == null) throw new IllegalArgumentException("The entity annotation type " + uimaType + " does not exist in the type system."); diff --git a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java index 48ee699e7..83f2aa54d 100644 --- 
a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java +++ b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java @@ -68,25 +68,6 @@ public void testCharacterOffsets() throws Exception { assertThat(gene.getEnd()).isEqualTo(6); } - @Test - public void testCharacterOffsets2() throws Exception { - final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); - final ExternalResourceDescription externalResourceDescription = ExternalResourceFactory.createExternalResourceDescription(InMemoryFileTextAnnotationProvider.class, new File("src/test/resources/test.txt")); - final AnalysisEngine engine = AnalysisEngineFactory.createEngine(AnnotationAdderAnnotator.class, AnnotationAdderAnnotator.KEY_ANNOTATION_SOURCE, externalResourceDescription); - // Test doc1 (two gene annotations) - jCas.setDocumentText("BRCA PRKII are the genes of this sentence."); - final Header h = new Header(jCas); - h.setDocId("10022127.txt"); - h.addToIndexes(); - - engine.process(jCas); - - final List genes = new ArrayList<>(JCasUtil.select(jCas, Gene.class)); - for (Gene g : genes) { - System.out.println(g); - } - } - @Test public void testPayload() throws Exception { final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); From 574001d570d09f5a3b9e33f616282e963642d7a5 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 27 Sep 2021 13:11:39 +0200 Subject: [PATCH 099/269] Add more logging to the classes of the AnnotationAdder. 
--- .../AnnotationAdderAnnotator.java | 1 + .../TextAnnotationListAdder.java | 5 ++-- .../AnnotationList.java | 29 ++++++++++++++++++- .../annotationsources/AnnotationSource.java | 7 ++++- .../FileAnnotationSource.java | 9 +++--- ...ryFileDocumentClassAnnotationProvider.java | 8 ++++- .../InMemoryFileTextAnnotationProvider.java | 7 ++++- 7 files changed, 55 insertions(+), 11 deletions(-) diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java index 00245937d..ceaac7535 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java @@ -76,6 +76,7 @@ public void process(final JCas aJCas) throws AnalysisEngineProcessException { final AnnotationData annotations = annotationProvider.getAnnotations(docId); final AnnotationAdderHelper helper = new AnnotationAdderHelper(); if (annotations != null) { + log.trace("Found annotations for document ID {}.", docId); boolean success = false; int adderNum = 0; // We are now iterating through the available annotation adders for the one that handles the obtained annotation data diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java index 8ae202449..e6c433ce6 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java @@ -5,7 +5,6 @@ import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation; import 
de.julielab.jcore.types.ext.DBProcessingMetaData; import de.julielab.jcore.utility.JCoReAnnotationTools; -import de.julielab.jcore.utility.JCoReTools; import org.apache.commons.codec.binary.Base64; import org.apache.commons.codec.digest.DigestUtils; import org.apache.uima.cas.CASException; @@ -65,14 +64,14 @@ else if (configuration.getDefaultUimaType() != null) final Annotation annotation = JCoReAnnotationTools.getAnnotationByClassName(jCas, uimaType); helper.setAnnotationOffsetsRelativeToDocument(annotation, a, configuration); helper.setAnnotationPayloadsToFeatures(annotation, a); + log.trace("Adding annotation of type {} with offsets {}-{} to document with ID {}", uimaType, annotation.getBegin(), annotation.getEnd(), annotationList.getDocId()); annotation.addToIndexes(); } else { log.trace("ExternalAnnotation for document {} has no entity offsets or offsets < 0, not adding anything to the CAS.", a.getDocumentId()); } } else { if (!shaMismatchWasReported) { - final String docId = JCoReTools.getDocId(jCas); - log.warn("The document with ID '{}' has a differing document text hash from a given annotation. The annotation will not be added to the document. Annotation hash: {}, current document text hash: {}", docId, shaFromAnnotation, jCasDocTextSha); + log.warn("The document with ID '{}' has a differing document text hash from a given annotation. The annotation will not be added to the document. 
Annotation hash: {}, current document text hash: {}", annotationList.getDocId(), shaFromAnnotation, jCasDocTextSha); shaMismatchWasReported = true; if (preventProcessedOnDigestMismatch) { try { diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/AnnotationList.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/AnnotationList.java index afa5e074d..44da0c57c 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/AnnotationList.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/AnnotationList.java @@ -1,8 +1,34 @@ package de.julielab.jcore.ae.annotationadder.annotationrepresentations; import java.util.ArrayList; +import java.util.Collection; public class AnnotationList extends ArrayList implements AnnotationData { + @Override + public boolean add(T t) { + setDocId(t.getDocumentId()); + return super.add(t); + } + + @Override + public void add(int index, T element) { + setDocId(element.getDocumentId()); + super.add(index, element); + } + + @Override + public boolean addAll(Collection c) { + if (c != null) + c.stream().findAny().ifPresent(annotation -> setDocId(annotation.getDocumentId())); + return super.addAll(c); + } + + @Override + public boolean addAll(int index, Collection c) { + if (c != null) + c.stream().findAny().ifPresent(annotation -> setDocId(annotation.getDocumentId())); + return super.addAll(index, c); + } private String docId; @@ -11,11 +37,12 @@ public String getDocId() { } public void setDocId(String docId) { + if (docId != null && this.docId != null && !docId.equals(this.docId)) + throw new IllegalArgumentException("This annotation list already contains annotations for document with ID " + this.docId + " but the document ID should now be set to " + docId + "."); this.docId = docId; } @Override - public String 
getDocumentId() { return docId; } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/AnnotationSource.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/AnnotationSource.java index d7a1daad9..5a18be30e 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/AnnotationSource.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/AnnotationSource.java @@ -3,7 +3,12 @@ import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationData; import org.apache.uima.resource.DataResource; +import java.io.IOException; +import java.net.URI; + public interface AnnotationSource { - void initialize(DataResource dataResource); + void loadAnnotations(URI annotationUri) throws IOException; + + void initialize(DataResource dataResource) throws IOException; T getAnnotations(String id); } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java index 69958d586..845c42c95 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java @@ -24,16 +24,17 @@ public FileAnnotationSource(AnnotationFormat format) { this.format = format; } - private void loadAnnotations(URI annotationUri) { + @Override + public void loadAnnotations(URI annotationUri) throws IOException { try (BufferedReader br = UriUtilities.getReaderFromUri(annotationUri)) { entitiesByDocId = br.lines().map(format::parse).filter(Objects::nonNull).collect(Collectors.groupingBy(AnnotationData::getDocumentId, 
Collectors.toCollection(AnnotationList::new))); - } catch (IOException e) { - e.printStackTrace(); } + if (log.isTraceEnabled()) + log.trace("Loaded {} entity annotations for {} document IDs.", entitiesByDocId.values().stream().flatMap(AnnotationList::stream).count(), entitiesByDocId.size()); } @Override - public void initialize(DataResource dataResource) { + public void initialize(DataResource dataResource) throws IOException { log.info("Loading entity annotations from {}", dataResource.getUri()); loadAnnotations(dataResource.getUri()); } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileDocumentClassAnnotationProvider.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileDocumentClassAnnotationProvider.java index ab95d5759..731f114ce 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileDocumentClassAnnotationProvider.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileDocumentClassAnnotationProvider.java @@ -6,6 +6,8 @@ import org.apache.uima.resource.DataResource; import org.apache.uima.resource.ResourceInitializationException; +import java.io.IOException; + public class InMemoryFileDocumentClassAnnotationProvider implements AnnotationProvider { private AnnotationSource> annotationSource; @@ -18,7 +20,11 @@ public AnnotationList getAnnotations(String id) public void load(DataResource dataResource) throws ResourceInitializationException { // This logic could be made configurable if required so in the future. 
annotationSource = new FileAnnotationSource(new DocumentClassAnnotationFormat()); - annotationSource.initialize(dataResource); + try { + annotationSource.initialize(dataResource); + } catch (IOException e) { + throw new ResourceInitializationException(e); + } } diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java index 1f6914340..ac89d5b1e 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java @@ -10,6 +10,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.lang.reflect.InvocationTargetException; import java.util.Optional; @@ -41,7 +42,11 @@ public void load(DataResource dataResource) throws ResourceInitializationExcepti throw new ResourceInitializationException(e); } annotationSource = new FileAnnotationSource(format); - annotationSource.initialize(dataResource); + try { + annotationSource.initialize(dataResource); + } catch (IOException e) { + throw new ResourceInitializationException(e); + } } From d8fb38c379feebbf95d903d88512bc7edb8d9445 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 27 Sep 2021 13:20:29 +0200 Subject: [PATCH 100/269] Set the component ID for annotations created by the AnnotationAdder. 
--- .../jcore/ae/annotationadder/TextAnnotationListAdder.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java index e6c433ce6..7626dce18 100644 --- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java +++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java @@ -62,6 +62,8 @@ else if (configuration.getDefaultUimaType() != null) // that the SHA was the same as it was at time of the original entity tagging. if (a.getStart() >= 0) { final Annotation annotation = JCoReAnnotationTools.getAnnotationByClassName(jCas, uimaType); + if (annotation instanceof de.julielab.jcore.types.Annotation) + ((de.julielab.jcore.types.Annotation)annotation).setComponentId(AnnotationAdderAnnotator.class.getSimpleName()); helper.setAnnotationOffsetsRelativeToDocument(annotation, a, configuration); helper.setAnnotationPayloadsToFeatures(annotation, a); log.trace("Adding annotation of type {} with offsets {}-{} to document with ID {}", uimaType, annotation.getBegin(), annotation.getEnd(), annotationList.getDocId()); From beb319b6f0d6bd204a11a4682de9e726d633c917 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 28 Sep 2021 08:20:28 +0200 Subject: [PATCH 101/269] Add support for JATS 1.3 tagset for PMC reader. Following the 1.2 to 1.3 change description at https://jats.nlm.nih.gov/publishing/tag-library/1.3/chapter/version-1.3-chg.html this doesn't seem to concern us. The 1.3 tagset is backward compatible to previous JATS versions and does not seem to bring extensions that we would actually use right now. 
--- .../jcore/multiplier/pmc/PMCMultiplier.java | 5 +++- .../jcore/reader/pmc/CasPopulator.java | 10 ++++++-- .../reader/pmc/NoDataAvailableException.java | 23 +++++++++++++++++++ .../julielab/jcore/reader/pmc/PMCReader.java | 4 +++- .../jcore/reader/pmc/parser/FrontParser.java | 2 +- .../reader/pmc/parser/NxmlDocumentParser.java | 8 +++++++ 6 files changed, 47 insertions(+), 5 deletions(-) create mode 100644 jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NoDataAvailableException.java diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java index b723f6215..38d52f4b8 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java @@ -1,6 +1,7 @@ package de.julielab.jcore.multiplier.pmc; import de.julielab.jcore.reader.pmc.CasPopulator; +import de.julielab.jcore.reader.pmc.NoDataAvailableException; import de.julielab.jcore.reader.pmc.parser.ElementParsingException; import de.julielab.jcore.types.casmultiplier.JCoReURI; import org.apache.uima.analysis_component.JCasMultiplier_ImplBase; @@ -37,7 +38,7 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { try { casPopulator = new CasPopulator(currentUriBatch); } catch (IOException e) { - log.error("Exception occurred when trying to inizialize the NXML parser", e); + log.error("Exception occurred when trying to initialize the NXML parser", e); throw new AnalysisEngineProcessException(e); } } @@ -60,6 +61,8 @@ public AbstractCas next() throws AnalysisEngineProcessException { return cas; } catch (ElementParsingException e) { log.error("Exception occurred why trying to parse {}", next, e); + } catch (NoDataAvailableException e) { + log.error("Could not populate the CAS due to preceding error. 
Returning null."); } } return null; diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java index ff3a1e0f0..61e2851a5 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java @@ -21,7 +21,7 @@ public CasPopulator(Iterator nxmlIterator) throws IOException { nxmlDocumentParser.loadElementPropertyFile("/de/julielab/jcore/reader/pmc/resources/elementproperties.yml"); } - public void populateCas(URI nxmlUri, JCas cas) throws ElementParsingException { + public void populateCas(URI nxmlUri, JCas cas) throws ElementParsingException, NoDataAvailableException { ElementParsingResult result = null; URI currentUri = nxmlUri; while (currentUri != null && result == null) { @@ -30,7 +30,13 @@ public void populateCas(URI nxmlUri, JCas cas) throws ElementParsingException { result = nxmlDocumentParser.parse(); } catch (DocumentParsingException e) { log.warn("Error occurred when trying to read from URI {} (ASCII string: {}): {}. Skipping document.", currentUri, currentUri.toASCIIString(), e.getMessage()); - currentUri = nxmlIterator.next(); + if (nxmlIterator.hasNext()) { + currentUri = nxmlIterator.next(); + } else { + String msg = "Cannot just skip the errored document because there is no next document currently available. 
Returning without adding any data to the CAS."; + log.warn(msg); + throw new NoDataAvailableException(msg); + } } } StringBuilder sb = populateCas(result, new StringBuilder()); diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NoDataAvailableException.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NoDataAvailableException.java new file mode 100644 index 000000000..41a611d26 --- /dev/null +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NoDataAvailableException.java @@ -0,0 +1,23 @@ +package de.julielab.jcore.reader.pmc; + +public class NoDataAvailableException extends Exception { + + public NoDataAvailableException() { + } + + public NoDataAvailableException(String message) { + super(message); + } + + public NoDataAvailableException(String message, Throwable cause) { + super(message, cause); + } + + public NoDataAvailableException(Throwable cause) { + super(cause); + } + + public NoDataAvailableException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReader.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReader.java index d58f3f939..921fc10b5 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReader.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReader.java @@ -54,10 +54,12 @@ public void getNext(JCas cas) throws CollectionException { next = pmcFiles.next(); casPopulator.populateCas(next, cas); if (extractIdFromFilename) - ((Header)cas.getAnnotationIndex(Header.type).iterator().next()).setDocId(getIdFromFilename(next)); + ((Header) cas.getAnnotationIndex(Header.type).iterator().next()).setDocId(getIdFromFilename(next)); } catch (ElementParsingException e) { log.error("Exception occurred when trying to parse {}", next, e); throw new 
CollectionException(e); + } catch (NoDataAvailableException e) { + log.error("Could not populate CAS due to preceding error."); } completed++; } diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java index b21a66aec..e1272094d 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java @@ -84,7 +84,7 @@ else if (xPathExists(String.format(pubDateFmt, "pmc-release"))) : getXPathValue("/article/front/journal-meta/journal-title-group/journal-title"); // there actually might be several abbreviated titles but here, we // only use the first; our type system currently cannot represent - // more anyway. One could try decide for an preferred one since the + // more anyway. One could try to decide for a preferred one since the // abbrev-type attribute disposes the source of the abbreviated // title (e.g. publisher or nlm-ta). 
Optional abbrevJournalTitle = nxmlDocumentParser.getTagset() == Tagset.NLM_2_3 diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java index 069d038f1..c6a0e837b 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java @@ -94,6 +94,8 @@ private void setTagset() throws NavException, DocTypeNotFoundException, DocTypeN tagset = Tagset.JATS_1_0; else if (docType.contains("JATS-archivearticle1-mathml3.dtd")) tagset = Tagset.JATS_1_2_MATH_ML_3; + else if (docType.contains("JATS-archivearticle1-3-mathml3.dtd")) + tagset = Tagset.JATS_1_3; else if (docType.contains("journalpublishing.dtd") || docType.contains("archivearticle.dtd")) tagset = Tagset.NLM_2_3; else if (docType.contains("journalpublishing3.dtd") || docType.contains("archivearticle3.dtd")) @@ -210,6 +212,12 @@ public enum Tagset { * @see https://jats.nlm.nih.gov/publishing/tag-library/1.2/index.html */ JATS_1_2_MATH_ML_3, + /** + * NISO JATS Version 1.3 (ANSI/NISO Z39.96-2021) + * + * @see https://jats.nlm.nih.gov/publishing/tag-library/1.3/index.html + */ + JATS_1_3, /** * NLM Journal Publishing DTD v. 2.3 * From 53ae91dece4389120e6e5b8829f9deadc23b168a Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 28 Sep 2021 08:26:52 +0200 Subject: [PATCH 102/269] Add fallbacks in case of unknown XML tag sets for PMC reader. Newer JATS versions are - until now - backward compatible with their previous version. Since we don't use any of the special capabilities of any JATS format, we just check for JATS in general and assign the latest version if we haven't handled the exact version. We do the same for the NLM tagsets. 
--- .../de/julielab/jcore/reader/pmc/parser/FrontParser.java | 4 ++-- .../jcore/reader/pmc/parser/NxmlDocumentParser.java | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java index e1272094d..af4a2b944 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java @@ -79,7 +79,7 @@ else if (xPathExists(String.format(pubDateFmt, "pmc-release"))) Optional year = getXPathValue(String.format("/article/front/article-meta/pub-date[@pub-type='%s']/year", pubType)); Optional month = getXPathValue(String.format("/article/front/article-meta/pub-date[@pub-type='%s']/month", pubType)); Optional day = getXPathValue(String.format("/article/front/article-meta/pub-date[@pub-type='%s']/day", pubType)); - Optional journalTitle = nxmlDocumentParser.getTagset() == Tagset.NLM_2_3 + Optional journalTitle = nxmlDocumentParser.getTagset() == Tagset.NLM_2_3 || nxmlDocumentParser.getTagset() == Tagset.NLM_3_0 ? getXPathValue("/article/front/journal-meta/journal-title") : getXPathValue("/article/front/journal-meta/journal-title-group/journal-title"); // there actually might be several abbreviated titles but here, we @@ -87,7 +87,7 @@ else if (xPathExists(String.format(pubDateFmt, "pmc-release"))) // more anyway. One could try to decide for a preferred one since the // abbrev-type attribute disposes the source of the abbreviated // title (e.g. publisher or nlm-ta). - Optional abbrevJournalTitle = nxmlDocumentParser.getTagset() == Tagset.NLM_2_3 + Optional abbrevJournalTitle = nxmlDocumentParser.getTagset() == Tagset.NLM_2_3 || nxmlDocumentParser.getTagset() == Tagset.NLM_3_0 ? 
getXPathValue("/article/front/journal-meta/abbrev-journal-title") : getXPathValue("/article/front/journal-meta/journal-title-group/abbrev-journal-title"); Optional volume = getXPathValue("/article/front/article-meta/volume"); diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java index c6a0e837b..2042b258c 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java @@ -100,8 +100,13 @@ else if (docType.contains("journalpublishing.dtd") || docType.contains("archivea tagset = Tagset.NLM_2_3; else if (docType.contains("journalpublishing3.dtd") || docType.contains("archivearticle3.dtd")) tagset = Tagset.NLM_3_0; - else - throw new DocTypeNotSupportedException("Unsupported document type: " + docType); + else if (docType.contains("JATS")) { + log.warn("Unknown document type: {}. Assigning the latest JATS tagset in assumption of backward compatibility.", docType); + tagset = Tagset.JATS_1_3; + } else if (docType.contains("journalpublishing") || docType.contains("archivearticle")) { + log.warn("Unknown document type: {}. Assigning the latest NLM tagset in assumption of backward compatibility.", docType); + tagset = Tagset.NLM_3_0; + } return; } } From 69899e5e51e81c9b53a695adffb1714d28d4b689 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 6 Oct 2021 07:35:06 +0200 Subject: [PATCH 103/269] Log the doc ID for documents with long sentences in TokenAnnotator. 
--- jcore-jtbd-ae/pom.xml | 5 +++++ .../de/julielab/jcore/ae/jtbd/main/TokenAnnotator.java | 9 +++++++-- .../chunking/ConfigurableChunkerProviderImplAlt.java | 4 +++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/jcore-jtbd-ae/pom.xml b/jcore-jtbd-ae/pom.xml index c773cf55d..d4a7430a4 100644 --- a/jcore-jtbd-ae/pom.xml +++ b/jcore-jtbd-ae/pom.xml @@ -86,6 +86,11 @@ jcore-types ${jcore-types-version}
+ + de.julielab + jcore-utilities + ${project.parent.version} + cc.mallet mallet diff --git a/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotator.java b/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotator.java index 1ddd664f7..c073983a2 100644 --- a/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotator.java +++ b/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotator.java @@ -26,6 +26,7 @@ import de.julielab.jcore.ae.jtbd.Unit; import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; +import de.julielab.jcore.utility.JCoReTools; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; @@ -153,8 +154,12 @@ public void process(final JCas aJCas) throws AnalysisEngineProcessException { int length = sentence.getEnd() - sentence .getBegin(); LOGGER.debug("going to next sentence having length: " + length); - if (length > 1000) - LOGGER.warn("Current sentence has length {}.", length); + if (length > 1000) { + if (LOGGER.isWarnEnabled()) { + String docId = JCoReTools.getDocId(aJCas); + LOGGER.warn("Current sentence has length {} (document ID {}).", length, docId); + } + } final String text = sentence.getCoveredText(); writeTokensToCAS(text, sentence.getBegin(), aJCas); } diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.java index aa1c07623..8d9e63b44 100644 --- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.java +++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.java @@ -333,7 +333,7 @@ 
public boolean getCaseSensitive() { } - private InputStream readStreamFromFileSystemOrClassPath(String filePath) { + private InputStream readStreamFromFileSystemOrClassPath(String filePath) throws FileNotFoundException { InputStream is = null; File file = new File(filePath); if (file.exists()) { @@ -351,6 +351,8 @@ private InputStream readStreamFromFileSystemOrClassPath(String filePath) { } catch (IOException e) { e.printStackTrace(); } + if (is == null) + throw new FileNotFoundException("Could not read contents from " + filePath); return is; } } From 133c0de8ccdd4ac3d78be339e0bc84f030949a5e Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 29 Nov 2021 17:45:13 +0100 Subject: [PATCH 104/269] Activate GNormPlus features and re-add Maven pom section to copy dependencies to target/lib. --- jcore-banner-ae/pom.xml | 20 ++++++++++++++++++- .../main/java/banner/tagging/FeatureSet.java | 6 +++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/jcore-banner-ae/pom.xml b/jcore-banner-ae/pom.xml index 6235ec58d..ec5a25e53 100644 --- a/jcore-banner-ae/pom.xml +++ b/jcore-banner-ae/pom.xml @@ -71,7 +71,7 @@ de.julielab jcore-base 2.6.0-SNAPSHOT - .. + ../pom.xml @@ -79,4 +79,22 @@ https://opensource.org/licenses/BSD-2-Clause + + + + maven-dependency-plugin + + + prepare-package + + copy-dependencies + + + ${project.build.directory}/lib + + + + + + diff --git a/jcore-banner-ae/src/main/java/banner/tagging/FeatureSet.java b/jcore-banner-ae/src/main/java/banner/tagging/FeatureSet.java index df6548577..009154e3c 100644 --- a/jcore-banner-ae/src/main/java/banner/tagging/FeatureSet.java +++ b/jcore-banner-ae/src/main/java/banner/tagging/FeatureSet.java @@ -111,9 +111,9 @@ private SerialPipes createPipe(TagFormat format, Lemmatiser lemmatiser, dragon.n //siddhartha added these; pipes.add(simFindFilename == null ? 
new Noop() : new SimFind(simFindFilename)); -// pipes.add(new ChemicalSuffix("CHEM_SUFF=")); -// pipes.add(new MentionTypeHint("MENTION_TYPE=")); -// pipes.add(new ProteinSymbols("PROT_SYM=")); + pipes.add(new ChemicalSuffix("CHEM_SUFF=")); + pipes.add(new MentionTypeHint("MENTION_TYPE=")); + pipes.add(new ProteinSymbols("PROT_SYM=")); pipes.add(new OffsetConjunctions(new int[][] { { -2 }, { -1 }, { 1 }, { 2 } })); pipes.add(new TokenSequence2FeatureVectorSequence(true, true)); From 40f6ff6f2128d23dcbbb6cb88cb7999ecf1cfd0f Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 14 Jan 2022 11:33:50 +0100 Subject: [PATCH 105/269] Use the recursive deletion of the index file for PersistentStringIndexMapProvider. File.delete() only delete files and as such, Lucene indexes were not deleted albeit they need an update. --- .../jcore/consumer/es/sharedresources/LuceneIndex.java | 2 +- .../PersistentStringIndexMapProvider.java | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/LuceneIndex.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/LuceneIndex.java index a28c0a5c1..907c333a6 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/LuceneIndex.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/LuceneIndex.java @@ -37,7 +37,7 @@ public LuceneIndex(String indexDirectory) { IndexWriterConfig iwc = new IndexWriterConfig(); iw = new IndexWriter(directory, iwc); } else { - log.debug("Index directory {} already"); + log.debug("Index directory {} already exists.", indexDirectory); } } catch (IOException e) { log.error("could not initialize Lucene index", e); diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java 
b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java index 93dd296f2..3a9334cb9 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java @@ -3,6 +3,7 @@ import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; +import org.apache.commons.io.FileUtils; import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang3.NotImplementedException; import org.apache.uima.resource.DataResource; @@ -125,13 +126,15 @@ public void load(DataResource aData) throws ResourceInitializationException { indexFile = new File("es-consumer-cache", resourceFileName); if (resourceFile.exists() && indexFile.exists() && resourceFile.lastModified() > indexFile.lastModified()) { log.info("Resource file {} is newer than the existing cached index at {}. Creating new index.", resourceFile, indexFile); - indexFile.delete(); + if (indexFile.isDirectory()) + FileUtils.deleteQuietly(indexFile); + else + indexFile.delete(); } else { boolean indexFileExisted = indexFile.exists(); if (!indexFileExisted) { log.info("Creating persistent cache for resource {} at {}.", uri, indexFile); - } - else { + } else { log.info("Using existing persistent cache {} for resource {}.", indexFile, uri); loadData = false; } From bc4c0bb0ac01d9f9f2e7eedd66e25aa9ce76ad02 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 17 Jan 2022 09:59:03 +0100 Subject: [PATCH 106/269] Fix #125. The bug actually was in JCoReCondensedDocumentText rather than JSBD itself. New tests have been added to ensure the correct behavior. 
--- .../jcore/ae/jsbd/main/SentenceAnnotator.java | 2 +- .../utility/JCoReCondensedDocumentText.java | 21 +++++++-- .../JCoReCondensedDocumentTextTest.java | 47 ++++++++++++++++++- 3 files changed, 62 insertions(+), 8 deletions(-) diff --git a/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java b/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java index fe5cbd833..220eea9bb 100644 --- a/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java +++ b/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java @@ -193,7 +193,7 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { int end = borders.get(i); // skip leading whites spaces - while (start < end && Character.isWhitespace(aJCas.getDocumentText().charAt(start))) + while (start < end && (Character.isWhitespace(documentText.getCodensedText().charAt(start)))) ++start; // get the string between the current annotation borders and recognized sentences diff --git a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java index 7067539ad..34b0e1f93 100644 --- a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java +++ b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java @@ -109,21 +109,26 @@ public void buildMap(JCas cas, Set cutAwayTypes) throws ClassNotFoundExc // Adapt offsets to remove superfluous white spaces from the condensed text boolean precedingCharacterIsWS = lastBegin == 0 || Character.isWhitespace(cas.getDocumentText().charAt(lastBegin - 1)); boolean succeedingCharacterIsWS = lastEnd < cas.getDocumentText().length() && Character.isWhitespace(cas.getDocumentText().charAt(lastEnd)); - if (precedingCharacterIsWS && succeedingCharacterIsWS) + boolean extendLastEnd = precedingCharacterIsWS && 
succeedingCharacterIsWS; + if (extendLastEnd) ++lastEnd; if (precedingCharacterIsWS && end >= cas.getDocumentText().length()) --begin; // The current cut away annotation begins after the previous cut away annotation, thus there is no // overlap and we can add the current state to the maps. cutSum += lastEnd - lastBegin; - int condensedPosition = lastEnd - cutSum + 1; - condensedPos2SumCutMap.put(condensedPosition, cutSum); + int condensedPosition = lastEnd - cutSum; + if (condensedPosition == lastBegin && !extendLastEnd) + ++condensedPosition; // For original offsets we need to be able to know where the begin and the end of // the cut away annotation was. This is exploited in getCondensedOffsetForOriginalOffset() originalPos2SumCutMap.put(lastBegin, lastCutSum); originalPos2SumCutMap.put(lastEnd, cutSum); lastBegin = begin; lastCutSum = cutSum; + if (condensedPosition + cutSum >= cas.getDocumentText().length()) + cutSum = cas.getDocumentText().length() -1 - condensedPosition; + condensedPos2SumCutMap.put(condensedPosition, cutSum); sb.append(cas.getDocumentText(), lastEnd, begin); } else if (lastEnd < 0) { // This is the first annotation @@ -146,10 +151,16 @@ public void buildMap(JCas cas, Set cutAwayTypes) throws ClassNotFoundExc if (precedingCharacterIsWS && (succeedingCharacterIsWS || lastEnd >= cas.getDocumentText().length())) ++lastEnd; cutSum += lastEnd - lastBegin; - int condensedPosition = lastEnd - cutSum + 1; - condensedPos2SumCutMap.put(condensedPosition, cutSum); + int condensedPosition = lastEnd - cutSum; originalPos2SumCutMap.put(lastBegin, lastCutSum); originalPos2SumCutMap.put(lastEnd, cutSum); + // Avoid the situation where the computed original position includes the last cut away annotation. + // This can happen when a cut away annotation appears at the very end of the text. Then, the cutSum + // accounts for this last annotation at the end of the condensed text which would result in an original + // position _after_ the cut away annotation. 
+ if (condensedPosition + cutSum >= cas.getDocumentText().length()) + cutSum = cas.getDocumentText().length() -1 - condensedPosition; + condensedPos2SumCutMap.put(condensedPosition, cutSum); } // If lastEnd is still -1, we just did not find any of the cut away annotations. Thus, we just copy the whole text. if (lastEnd == -1) diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java index 86ef54bf9..470baa250 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java @@ -30,7 +30,7 @@ public void testReduce() throws Exception { assertEquals(13, condensedText.getOriginalOffsetForCondensedOffset(13)); assertEquals(15, condensedText.getOriginalOffsetForCondensedOffset(14)); assertEquals(30, condensedText.getOriginalOffsetForCondensedOffset(29)); - + assertEquals(0, condensedText.getCondensedOffsetForOriginalOffset(0)); assertEquals(13, condensedText.getCondensedOffsetForOriginalOffset(13)); assertEquals(14, condensedText.getCondensedOffsetForOriginalOffset(15)); @@ -54,7 +54,7 @@ public void testReduce2() throws Exception { assertEquals(13, condensedText.getOriginalOffsetForCondensedOffset(13)); assertEquals(15, condensedText.getOriginalOffsetForCondensedOffset(14)); assertEquals(31, condensedText.getOriginalOffsetForCondensedOffset(29)); - + assertEquals(0, condensedText.getCondensedOffsetForOriginalOffset(0)); assertEquals(13, condensedText.getCondensedOffsetForOriginalOffset(13)); assertEquals(14, condensedText.getCondensedOffsetForOriginalOffset(15)); @@ -85,6 +85,49 @@ public void testReduce3() throws Exception { assertEquals("This sentence has multiple references. 
This is a second sentence.", condensedText.getCodensedText()); } + @Test + public void testReduce4() throws Exception { + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("This sentence\n1\nhas references."); + InternalReference ref1 = new InternalReference(jcas, 14, 15); + ref1.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(InternalReference.class.getCanonicalName()))); + assertEquals("This sentence\nhas references.", condensedText.getCodensedText()); + assertEquals(0, condensedText.getOriginalOffsetForCondensedOffset(0)); + assertEquals(16, condensedText.getOriginalOffsetForCondensedOffset(14)); + } + + @Test + public void testReduce5() throws Exception { + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("Leptin is an adipose-derived protein secreted by adipocytes and is expressed in adipose tissue.\n" + + "1\n" + + "It has the role of being a key regulator of several physiological pathways including body weight and regulation of food intake, inflammation, endocrine function, energy homeostasis, bone metabolism and immunity.\n" + + "2\n" + + "3\n" + + "Results from various studies indicate that leptin may play a significant role in bone physiology, independent of the central nervous system.\n"); + InternalReference ref1 = new InternalReference(jcas, 96, 97); + ref1.addToIndexes(); + InternalReference ref2 = new InternalReference(jcas, 310, 311); + ref2.addToIndexes(); + InternalReference ref3 = new InternalReference(jcas, 312, 313); + ref3.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(InternalReference.class.getCanonicalName()))); + 
System.out.println(condensedText.getCodensedText()); + assertEquals("Leptin is an adipose-derived protein secreted by adipocytes and is expressed in adipose tissue.\n" + + "It has the role of being a key regulator of several physiological pathways including body weight and regulation of food intake, inflammation, endocrine function, energy homeostasis, bone metabolism and immunity.\n" + + "Results from various studies indicate that leptin may play a significant role in bone physiology, independent of the central nervous system.\n", condensedText.getCodensedText()); + assertEquals(98, condensedText.getOriginalOffsetForCondensedOffset(96)); + assertEquals(314, condensedText.getOriginalOffsetForCondensedOffset(308)); + } + + @Test public void testCondensedOffsetsWithinCutawayAnnotations() throws Exception { JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", From cdff1d5c7f742b71e0dc37e9ec3ed86e12c4fc88 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 17 Jan 2022 10:01:20 +0100 Subject: [PATCH 107/269] Fixes #126 (ESConsumer cache index updates). 
--- .../es/sharedresources/AbstractMapProvider.java | 6 ++++-- .../PersistentIndexAddonTermsProvider.java | 15 +++++++++++++-- .../PersistentStringIndexMapProvider.java | 15 +++++++++++---- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AbstractMapProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AbstractMapProvider.java index 7a181d55a..a02b81797 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AbstractMapProvider.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/AbstractMapProvider.java @@ -42,12 +42,14 @@ public void load(DataResource aData) throws ResourceInitializationException { throw new IOException("Resource " + aData.getUri() + " not found"); } br = new BufferedReader(is); - map = new HashMap<>(); +// map = new HashMap<>(); String line; String splitExpression = "\t"; + int numEntries = 0; while ((line = br.readLine()) != null) { if (line.trim().length() == 0 || line.startsWith("#")) continue; + ++numEntries; String[] split = line.split(splitExpression); if (split.length != 2) { splitExpression = "\\s+"; @@ -61,7 +63,7 @@ public void load(DataResource aData) throws ResourceInitializationException { else put(getKey(split[0]), getValue(split[1])); } - log.info("Finished reading resource {} and got {} elements.", aData.getUri(), map.size()); + log.info("Finished reading resource {} and got {} entries.", aData.getUri(), numEntries); } catch (IOException e) { throw new ResourceInitializationException(e); } finally { diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentIndexAddonTermsProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentIndexAddonTermsProvider.java index 
b20d466ef..b98514ee3 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentIndexAddonTermsProvider.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentIndexAddonTermsProvider.java @@ -3,6 +3,7 @@ import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; +import org.apache.commons.io.FileUtils; import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang3.NotImplementedException; import org.apache.uima.resource.DataResource; @@ -12,6 +13,7 @@ import org.slf4j.Logger; import java.io.File; +import java.io.IOException; import java.net.MalformedURLException; import java.net.URI; import java.time.Duration; @@ -115,7 +117,7 @@ public Optional load(String s) { public void load(DataResource aData) throws ResourceInitializationException { // prepare the persistent index URI uri = aData.getUri(); - File indexFile; + File indexFile = null; boolean loadData = true; try { File resourceFile = new File(uri); @@ -123,7 +125,13 @@ public void load(DataResource aData) throws ResourceInitializationException { indexFile = new File("es-consumer-cache", resourceFileName); if (resourceFile.exists() && indexFile.exists() && resourceFile.lastModified() > indexFile.lastModified()) { log.info("Resource file {} is newer than the existing cached index at {}. 
Creating new index.", resourceFile, indexFile); - indexFile.delete(); + if (indexFile.isDirectory()) { + log.info("Deleting index directory {}", indexFile); + FileUtils.deleteDirectory(indexFile); + } else { + log.info("Deleting index file {}", indexFile); + indexFile.delete(); + } } else { boolean indexFileExisted = indexFile.exists(); if (!indexFileExisted) { @@ -138,6 +146,9 @@ public void load(DataResource aData) throws ResourceInitializationException { } catch (MalformedURLException e) { log.error("Could obtain file name from resource URI '{}'", uri, e); throw new IllegalStateException(e); + } catch (IOException e) { + log.error("Could not delete index file {}", indexFile, e); + throw new ResourceInitializationException(e); } if (loadData) { super.load(aData); diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java index 3a9334cb9..2551cedea 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java @@ -13,6 +13,7 @@ import org.slf4j.Logger; import java.io.File; +import java.io.IOException; import java.net.MalformedURLException; import java.net.URI; import java.time.Duration; @@ -118,7 +119,7 @@ protected void put(String key, String value) { public void load(DataResource aData) throws ResourceInitializationException { // prepare the persistent index URI uri = aData.getUri(); - File indexFile; + File indexFile = null; boolean loadData = true; try { File resourceFile = new File(uri); @@ -126,10 +127,13 @@ public void load(DataResource aData) throws ResourceInitializationException { indexFile = new File("es-consumer-cache", resourceFileName); 
if (resourceFile.exists() && indexFile.exists() && resourceFile.lastModified() > indexFile.lastModified()) { log.info("Resource file {} is newer than the existing cached index at {}. Creating new index.", resourceFile, indexFile); - if (indexFile.isDirectory()) - FileUtils.deleteQuietly(indexFile); - else + if (indexFile.isDirectory()) { + log.info("Deleting index directory {}", indexFile); + FileUtils.deleteDirectory(indexFile); + } else { + log.info("Deleting index file {}", indexFile); indexFile.delete(); + } } else { boolean indexFileExisted = indexFile.exists(); if (!indexFileExisted) { @@ -143,6 +147,9 @@ public void load(DataResource aData) throws ResourceInitializationException { } catch (MalformedURLException e) { log.error("Could obtain file name from resource URI '{}'", uri, e); throw new IllegalStateException(e); + } catch (IOException e) { + log.error("Could not delete index file {}", indexFile, e); + throw new ResourceInitializationException(e); } if (loadData) { super.load(aData); From 2570a478d4e982e7fd16f7ea6c58e796aa4154ce Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 17 Jan 2022 10:14:47 +0100 Subject: [PATCH 108/269] Update to CoStoSys 1.6.1-SNAPSHOT. --- jedis-parent/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jedis-parent/pom.xml b/jedis-parent/pom.xml index 3daef871c..0b8807ef9 100644 --- a/jedis-parent/pom.xml +++ b/jedis-parent/pom.xml @@ -17,7 +17,7 @@ de.julielab costosys - 1.6.0-SNAPSHOT + 1.6.1-SNAPSHOT de.julielab From 3529c58a9f9739d4b76a8c6b852033402c8a76d4 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 17 Jan 2022 10:33:36 +0100 Subject: [PATCH 109/269] Use test model instead of local model for JSBD tests. The local model was used to create the exact same environment that was present when some error occurred. This is not necessary anymore. 
--- .../de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java index 5a5b23a47..c3d0aa8a9 100644 --- a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java +++ b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java @@ -300,7 +300,7 @@ public void testErrordoc() throws Exception { XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "errordocs", "PMC5478802.xmi").toFile()), jCas.getCas()); JCasUtil.select(jCas, Sentence.class).forEach(Annotation::removeFromIndexes); AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, - "/Users/faessler/Coding/git/jcore-projects/jcore-jsbd-ae-biomedical-english/src/main/resources/de/julielab/jcore/ae/jsbd/model/jsbd-biomed-oversampled-abstracts-split-at-punctuation.mod.gz", + "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_MAX_SENTENCE_LENGTH, 1000, SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, new String[]{ "de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"}, @@ -322,7 +322,7 @@ public void testErrordoc2() throws Exception { XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "errordocs", "PMC8205280.xmi").toFile()), jCas.getCas()); JCasUtil.select(jCas, Sentence.class).forEach(Annotation::removeFromIndexes); AnalysisEngine jsbd = 
AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, - "/Users/faessler/Coding/git/jcore-projects/jcore-jsbd-ae-biomedical-english/src/main/resources/de/julielab/jcore/ae/jsbd/model/jsbd-biomed-oversampled-abstracts-split-at-punctuation.mod.gz", + "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_MAX_SENTENCE_LENGTH, 1000, SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, new String[]{ "de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"}, From 8c74873cdde6bd19d916cc14b90d273667364dab Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 20 Jan 2022 09:42:23 +0100 Subject: [PATCH 110/269] Add an option to omit internal references with letters in them from document text condensation. References like "Figure 2 shows..." are embedded in the text and should thus not be removed. 
--- .../jcore/ae/jsbd/main/SentenceAnnotator.java | 2 +- .../ae/jsbd/main/SentenceAnnotatorTest.java | 28 +++++++++ .../test/resources/errordocs/PMC5070457.xmi | 5 ++ .../utility/JCoReCondensedDocumentText.java | 60 ++++++++++++++++++- .../JCoReCondensedDocumentTextTest.java | 25 +++++++- 5 files changed, 116 insertions(+), 4 deletions(-) create mode 100644 jcore-jsbd-ae/src/test/resources/errordocs/PMC5070457.xmi diff --git a/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java b/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java index 220eea9bb..d89ca98b7 100644 --- a/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java +++ b/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java @@ -155,7 +155,7 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { JCoReCondensedDocumentText documentText; try { // If there are no cut-away types, the document text will remain unchanged. 
- documentText = new JCoReCondensedDocumentText(aJCas, cutAwayTypes, Set.of(',')); + documentText = new JCoReCondensedDocumentText(aJCas, cutAwayTypes, Set.of(','), true); } catch (ClassNotFoundException e1) { LOGGER.error("Could not create the text without annotations to be cut away in document {}", JCoReTools.getDocId(aJCas), e1); throw new AnalysisEngineProcessException(e1); diff --git a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java index c3d0aa8a9..1e820d945 100644 --- a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java +++ b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java @@ -332,5 +332,33 @@ public void testErrordoc2() throws Exception { assertThatCode(() -> jsbd.process(jCas.getCas())).doesNotThrowAnyException(); } + @Test + public void testErrordoc3() throws Exception { + // This document has multiple sentences that begin with a Figure reference mention ("Figure 2 shows..."). + // By cutting away all the internal reference annotation spans for sentence tagging, the "Figure 2" was + // ultimately appended to the previous sentence, causing errors. Thus, the option to omit internal references + // with letters was added to the condensed document text. This is a test that everything is working as intended. 
+ JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", + "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.extensions.jcore-semantics-mention-extension-types"); + + XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "errordocs", "PMC5070457.xmi").toFile()), jCas.getCas()); + JCasUtil.select(jCas, Sentence.class).forEach(Annotation::removeFromIndexes); + AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE, + "de/julielab/jcore/ae/jsbd/model/test-model.gz", + SentenceAnnotator.PARAM_MAX_SENTENCE_LENGTH, 1000, + SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, new String[]{ + "de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"}, + SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{de.julielab.jcore.types.pubmed.InternalReference.class.getCanonicalName()} + ); + assertThatCode(() -> jsbd.process(jCas.getCas())).doesNotThrowAnyException(); + Collection sentences = JCasUtil.select(jCas, Sentence.class); + for (var s : sentences) { + String coveredText = s.getCoveredText(); + if (coveredText.contains("They concluded")) + assertThat(coveredText).endsWith("filament19."); + } + } + } diff --git a/jcore-jsbd-ae/src/test/resources/errordocs/PMC5070457.xmi b/jcore-jsbd-ae/src/test/resources/errordocs/PMC5070457.xmi new file mode 100644 index 000000000..dd0c227ca --- /dev/null +++ 
b/jcore-jsbd-ae/src/test/resources/errordocs/PMC5070457.xmi @@ -0,0 +1,5 @@ + +PMC5070457 \ No newline at end of file diff --git a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java index 34b0e1f93..9ceb5f84e 100644 --- a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java +++ b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReCondensedDocumentText.java @@ -1,11 +1,15 @@ package de.julielab.jcore.utility; +import de.julielab.jcore.types.InternalReference; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; import java.util.Map.Entry; import java.util.NavigableMap; import java.util.Set; import java.util.TreeMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * This class is helpful when some parts of the CAS document text should be cut @@ -22,6 +26,11 @@ public class JCoReCondensedDocumentText { private String condensedText; private JCas cas; private Set cutAwayFillCharacters; + private boolean skipInternalReferencesWithLetters; + + public boolean isSkipInternalReferencesWithLetters() { + return skipInternalReferencesWithLetters; + } /** *

@@ -35,7 +44,41 @@ public class JCoReCondensedDocumentText { * @throws ClassNotFoundException If cutAwayTypes contains non-existing type names. */ public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes) throws ClassNotFoundException { - this(cas, cutAwayTypes, null); + this(cas, cutAwayTypes, false); + } + + /** + *

+ * Cuts away the covered text of annotations of a type in cutAwayTypes + * from the cas document text. If cutAwayTypes is null or + * empty, this class' methods will return the original CAS data. + *

+ * + * @param cas The CAS for which the document text should be cut. + * @param cutAwayTypes The types for cutting. May be null. + * @throws ClassNotFoundException If cutAwayTypes contains non-existing type names. + */ + public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes, boolean skipInternalReferencesWithLetters) throws ClassNotFoundException { + this(cas, cutAwayTypes, null, skipInternalReferencesWithLetters); + } + + /** + *

+ * Cuts away the covered text of annotations of a type in cutAwayTypes + * from the cas document text. If cutAwayTypes is null or + * empty, this class' methods will return the original CAS data. + *

+ *

The cutAwayFillCharacters set may provide characters that, when being the only character between + * two cut-away annotations, will add to the span of text being cut away. This way, enumerations of references + * (e.g. "4,6,8") can be completely removed, for example.

+ * + * @param cas The CAS for which the document text should be cut. + * @param cutAwayTypes The types for cutting. May be null. + * @param cutAwayFillCharacters Characters that, when being the only separator between two cut away annotations, are also cut away. + * @throws ClassNotFoundException If cutAwayTypes contains non-existing type names. + */ + public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes, Set cutAwayFillCharacters) throws ClassNotFoundException{ + this(cas, cutAwayTypes, cutAwayFillCharacters, false); } /** @@ -53,9 +96,10 @@ public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes) throws Cla * @param cutAwayFillCharacters Characters that, when being the only separator between two cut away annotations, are also cut away. * @throws ClassNotFoundException If cutAwayTypes contains non-existing type names. */ - public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes, Set cutAwayFillCharacters) throws ClassNotFoundException { + public JCoReCondensedDocumentText(JCas cas, Set cutAwayTypes, Set cutAwayFillCharacters, boolean skipInternalReferencesWithLetters) throws ClassNotFoundException { this.cas = cas; this.cutAwayFillCharacters = cutAwayFillCharacters; + this.skipInternalReferencesWithLetters = skipInternalReferencesWithLetters; buildMap(cas, cutAwayTypes); } @@ -84,6 +128,7 @@ public JCas getCas() { public void buildMap(JCas cas, Set cutAwayTypes) throws ClassNotFoundException { if (cutAwayTypes == null || cutAwayTypes.isEmpty()) return; + Pattern letterP = Pattern.compile("[a-zA-Z]"); StringBuilder sb = new StringBuilder(); condensedPos2SumCutMap = new TreeMap<>(); condensedPos2SumCutMap.put(0, 0); @@ -103,6 +148,17 @@ public void buildMap(JCas cas, Set cutAwayTypes) throws ClassNotFoundExc int begin = merger.getCurrentBegin(); int end = merger.getCurrentEnd(); + // Only remove InternalReferences without letters. Those are just numbers in + // PMC and often lead to errors because they are not really part of the sentence. 
Table and figure + // references, on the other hand, are embedded in the text. Rule of thumb: Remove references + // that don't have a letter. + if (skipInternalReferencesWithLetters && (merger.getAnnotation() instanceof InternalReference || merger.getAnnotation() instanceof de.julielab.jcore.types.pubmed.InternalReference)) { + String coveredText = ((Annotation)merger.getAnnotation()).getCoveredText(); + Matcher letterM = letterP.matcher(coveredText); + if (letterM.find()) + continue; + } + boolean moreThanOneCharacterDistance = begin - lastEnd > 2; boolean previousCharacterIsCutAwayDelimiter = cutAwayFillCharacters == null || cutAwayFillCharacters.isEmpty() || (begin - lastEnd == 2 && cutAwayFillCharacters.contains(cas.getDocumentText().charAt(begin - 1))); if (lastEnd > 0 && begin > lastEnd && (previousCharacterIsCutAwayDelimiter || moreThanOneCharacterDistance)) { diff --git a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java index 470baa250..da51e1c59 100644 --- a/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java +++ b/jcore-utilities/src/test/java/de/julielab/jcore/utility/JCoReCondensedDocumentTextTest.java @@ -119,7 +119,6 @@ public void testReduce5() throws Exception { JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, new HashSet<>(Arrays.asList(InternalReference.class.getCanonicalName()))); - System.out.println(condensedText.getCodensedText()); assertEquals("Leptin is an adipose-derived protein secreted by adipocytes and is expressed in adipose tissue.\n" + "It has the role of being a key regulator of several physiological pathways including body weight and regulation of food intake, inflammation, endocrine function, energy homeostasis, bone metabolism and immunity.\n" + "Results from various studies indicate that leptin may play a significant role in 
bone physiology, independent of the central nervous system.\n", condensedText.getCodensedText()); @@ -127,6 +126,30 @@ public void testReduce5() throws Exception { assertEquals(314, condensedText.getOriginalOffsetForCondensedOffset(308)); } + @Test + public void testReduce6() throws Exception { + // Test the option to skip internal references that have letters from omission from the condensed text. + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-structure-types"); + jcas.setDocumentText("Andreeva et al.19 and Xiao et al.20 studied the way of binding of a myosin head to an actin filament by using tryptic digestion of myofibrils and measuring optical polarization and dichroism. They concluded that in the rigor rabbit psoas muscle each myosin head binds to two actin monomers in a thin filament20, suggesting the possibility that the myosin head may first bind to one and then to two monomers in the actin filament19.\n" + + "Figure 2 shows an example of possible mechanism of how such binding change occurs."); + InternalReference ref1 = new InternalReference(jcas, 15, 17); + ref1.addToIndexes(); + InternalReference ref2 = new InternalReference(jcas, 33, 35); + ref2.addToIndexes(); + InternalReference ref3 = new InternalReference(jcas, 308, 310); + ref3.addToIndexes(); + InternalReference ref4 = new InternalReference(jcas, 428, 430); + ref4.addToIndexes(); + InternalReference ref5 = new InternalReference(jcas, 432, 440); + ref5.addToIndexes(); + + JCoReCondensedDocumentText condensedText = new JCoReCondensedDocumentText(jcas, + new HashSet<>(Arrays.asList(InternalReference.class.getCanonicalName())), true); + assertEquals("Andreeva et al. and Xiao et al. studied the way of binding of a myosin head to an actin filament by using tryptic digestion of myofibrils and measuring optical polarization and dichroism. 
They concluded that in the rigor rabbit psoas muscle each myosin head binds to two actin monomers in a thin filament, suggesting the possibility that the myosin head may first bind to one and then to two monomers in the actin filament.\n" + + "Figure 2 shows an example of possible mechanism of how such binding change occurs.", condensedText.getCodensedText()); + } + @Test public void testCondensedOffsetsWithinCutawayAnnotations() throws Exception { From 20c7a4569c5777f2e77cd8a6308cce8ed666c153 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 20 Jan 2022 10:00:01 +0100 Subject: [PATCH 111/269] Remove internal reference spans included in BANNER gene mentions. --- .../jcore/ae/banner/BANNERAnnotator.java | 33 +++++++++++++++- .../jcore/ae/banner/desc/jcore-banner-ae.xml | 1 + .../jcore/ae/banner/BANNERAnnotatorTest.java | 38 ++++++++++++++++++- 3 files changed, 69 insertions(+), 3 deletions(-) diff --git a/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java b/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java index 1f6077e17..0d8837ff6 100644 --- a/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java +++ b/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java @@ -9,8 +9,10 @@ import banner.types.Mention; import banner.types.Sentence; import de.julielab.jcore.types.EntityMention; +import de.julielab.jcore.types.pubmed.InternalReference; import de.julielab.jcore.utility.JCoReAnnotationTools; import de.julielab.jcore.utility.JCoReTools; +import de.julielab.jcore.utility.index.JCoReOverlapAnnotationIndex; import dragon.nlp.tool.Tagger; import dragon.nlp.tool.lemmatiser.EngLemmatiser; import org.apache.commons.configuration.ConfigurationException; @@ -34,6 +36,7 @@ import java.io.IOException; import java.io.InputStream; import java.util.Arrays; +import java.util.List; import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; @@ -145,6 
+148,7 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { String docId = ""; try { docId = JCoReTools.getDocId(jcas); + JCoReOverlapAnnotationIndex intRefIndex = new JCoReOverlapAnnotationIndex<>(jcas, InternalReference.type); FSIterator sentIt = jcas.getAnnotationIndex(de.julielab.jcore.types.Sentence.type).iterator(); int geneCount = 0; int sentCount = 0; @@ -164,8 +168,15 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { String typeName = typeMap.getOrDefault(entityType.getText(), EntityMention.class.getCanonicalName()); Annotation a = JCoReAnnotationTools.getAnnotationByClassName(jcas, typeName); - a.setBegin(sentenceBegin + mention.getStartChar()); - a.setEnd(sentenceBegin + mention.getEndChar()); + int originalBegin = sentenceBegin + mention.getStartChar(); + int originalEnd = sentenceBegin + mention.getEndChar(); + a.setBegin(originalBegin); + a.setEnd(originalEnd); + excludeReferenceAnnotationSpans(a, intRefIndex); + if (a.getEnd() <= a.getBegin()) { + log.error("After removing internal reference spans from the gene, it has no positive span any more. The original text was {} with offsets {}-{}. The new offsets are {}-{}.", jcas.getDocumentText().substring(originalBegin, originalEnd), originalBegin, originalEnd, a.getBegin(), a.getEnd()); + continue; + } if (a instanceof de.julielab.jcore.types.Annotation) { de.julielab.jcore.types.Annotation jcoreA = (de.julielab.jcore.types.Annotation) a; jcoreA.setId("BANNER, " + docId + ": " + geneCount++); @@ -184,4 +195,22 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { throw new AnalysisEngineProcessException(e); } } + + /** + * Internal references can actually look like a part of a gene, e.g. "filament19" where "19" is a reference. + * Exclude those spans from the gene mentions. + * @param a The gene annotation. + * @param intRefIndex The reference index. 
+ */ + private void excludeReferenceAnnotationSpans(Annotation a, JCoReOverlapAnnotationIndex intRefIndex) { + List annotationsInGene = intRefIndex.search(a); + for (Annotation overlappingAnnotation : annotationsInGene) { + if (overlappingAnnotation.getBegin() == a.getBegin()) { + a.setBegin(overlappingAnnotation.getEnd()); + } + if (overlappingAnnotation.getEnd() == a.getEnd()) { + a.setEnd(overlappingAnnotation.getBegin()); + } + } + } } diff --git a/jcore-banner-ae/src/main/resources/de/julielab/jcore/ae/banner/desc/jcore-banner-ae.xml b/jcore-banner-ae/src/main/resources/de/julielab/jcore/ae/banner/desc/jcore-banner-ae.xml index b98b5f42f..6eddce439 100644 --- a/jcore-banner-ae/src/main/resources/de/julielab/jcore/ae/banner/desc/jcore-banner-ae.xml +++ b/jcore-banner-ae/src/main/resources/de/julielab/jcore/ae/banner/desc/jcore-banner-ae.xml @@ -29,6 +29,7 @@ + diff --git a/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java b/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java index 489ecd37d..61f748892 100644 --- a/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java +++ b/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java @@ -12,6 +12,7 @@ import de.julielab.jcore.types.Gene; import de.julielab.jcore.types.Sentence; +import de.julielab.jcore.types.pubmed.InternalReference; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.fit.factory.AnalysisEngineFactory; @@ -34,7 +35,8 @@ public void testProcess() throws Exception { // just tag a single sentence with a test model that actually used that sentence as training data. 
JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-document-meta-types", - "de.julielab.jcore.types.jcore-semantics-biology-types"); + "de.julielab.jcore.types.jcore-semantics-biology-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); // this is sentence P00055040A0000 from the test BC2GM train data jcas.setDocumentText( "Ten out-patients with pustulosis palmaris et plantaris were examined with direct immunofluorescence (IF) technique for deposition of fibrinogen, fibrin or its degradation products (FR-antigen) in affected and unaffected skin, together with heparin-precipitable fraction (HPF), cryoglobulin and total plasma fibrinogen in the blood."); @@ -59,6 +61,40 @@ public void testProcess() throws Exception { assertEquals("fibrinogen", geneList.get(4).getCoveredText()); } + @Test + public void testInternalReferenceExclusion() throws Exception { + // Internal references in papers, e.g. for bibliography, often appear as numbers. If such a number is + // directly appended to a gene name, it is mostly included into the gene name by BANNER. + // Thus, such reference spans are removed afterwards in the annotator and this test is checking that it works. + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", + "de.julielab.jcore.types.jcore-document-meta-types", + "de.julielab.jcore.types.jcore-semantics-biology-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); + // this is sentence P00055040A0000 from the test BC2GM train data EXCEPT the '19' following 'fibrinogen' which + // is our internal reference for this test. 
+ jcas.setDocumentText( + "Ten out-patients with pustulosis palmaris et plantaris were examined with direct immunofluorescence (IF) technique for deposition of fibrinogen19, fibrin or its degradation products (FR-antigen) in affected and unaffected skin, together with heparin-precipitable fraction (HPF), cryoglobulin and total plasma fibrinogen in the blood."); + new Sentence(jcas, 0, jcas.getDocumentText().length()).addToIndexes(); + new InternalReference(jcas, 143, 145).addToIndexes(); + AnalysisEngine bannerAe = AnalysisEngineFactory.createEngine(BANNERAnnotator.class, + BANNERAnnotator.PARAM_CONFIG_FILE, "src/test/resources/banner_ae_test.xml", BANNERAnnotator.PARAM_TYPE_MAPPING, new String[] {"GENE=de.julielab.jcore.types.Gene"}); + bannerAe.process(jcas); + + // expected result from the GENE.eval.small file: + // P00055040A0000|116 125|fibrinogen + // P00055040A0000|127 132|fibrin + // P00055040A0000|158 167|FR-antigen + // P00055040A0000|243 254|cryoglobulin + // P00055040A0000|269 278|fibrinogen + // However, we ignore the offsets because the eval offsets ignore white spaces + List geneList = new ArrayList(JCasUtil.select(jcas, Gene.class)); + assertEquals("fibrinogen", geneList.get(0).getCoveredText()); + assertEquals("fibrin", geneList.get(1).getCoveredText()); + assertEquals("FR-antigen", geneList.get(2).getCoveredText()); + assertEquals("cryoglobulin", geneList.get(3).getCoveredText()); + assertEquals("fibrinogen", geneList.get(4).getCoveredText()); + } + @Test public void testMultithreading() throws Exception { List ts = new ArrayList<>(); From 94f8e7d7a535df08d1c6eaf5509d0eac23c698c3 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 20 Jan 2022 10:31:10 +0100 Subject: [PATCH 112/269] Remove internal reference spans included in flair gene mentions. Flair exhibits the same behaviour as BANNER, to no surprise. Numbers appended to gene names are just included into the recognized gene name. Thus, explicit exclusion of references is here also necessary. 
--- .../jcore/ae/banner/BANNERAnnotatorTest.java | 3 +- .../jcore/ae/flairner/FlairNerAnnotator.java | 23 ++++++++++ .../ae/flairner/FlairNerAnnotatorTest.java | 44 ++++++++++++++++--- 3 files changed, 62 insertions(+), 8 deletions(-) diff --git a/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java b/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java index 61f748892..ed1ce4cee 100644 --- a/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java +++ b/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java @@ -113,7 +113,8 @@ private void tagalot() throws UIMAException { // just tag a single sentence with a test model that actually used that sentence as training data. JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-document-meta-types", - "de.julielab.jcore.types.jcore-semantics-biology-types"); + "de.julielab.jcore.types.jcore-semantics-biology-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); // this is sentence P00055040A0000 from the test BC2GM train data jcas.setDocumentText( "Maintenance of skeletal muscle mass is regulated by the balance between anabolic and catabolic processes. Mammalian target of rapamycin (mTOR) is an evolutionarily conserved serine/threonine kinase, and is known to play vital roles in protein synthesis. Recent findings have continued to refine our understanding of the function of mTOR in maintaining skeletal muscle mass. mTOR controls the anabolic and catabolic signaling of skeletal muscle mass, resulting in the modulation of muscle hypertrophy and muscle wastage. This review will highlight the fundamental role of mTOR in skeletal muscle growth by summarizing the phenotype of skeletal-specific mTOR deficiency. 
In addition, the evidence that mTOR is a dual regulator of anabolism and catabolism in skeletal muscle mass will be discussed. A full understanding of mTOR signaling in the maintenance of skeletal muscle mass could help to develop mTOR-targeted therapeutics to prevent muscle wasting."); diff --git a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java index 215b07718..8154b0f5c 100644 --- a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java +++ b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java @@ -9,9 +9,11 @@ import de.julielab.jcore.types.EntityMention; import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; +import de.julielab.jcore.types.pubmed.InternalReference; import de.julielab.jcore.utility.JCoReAnnotationTools; import de.julielab.jcore.utility.JCoReTools; import de.julielab.jcore.utility.index.Comparators; +import de.julielab.jcore.utility.index.JCoReOverlapAnnotationIndex; import de.julielab.jcore.utility.index.JCoReTreeMapAnnotationIndex; import de.julielab.jcore.utility.index.TermGenerators; import org.apache.uima.UimaContext; @@ -24,6 +26,7 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.DoubleArray; +import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -169,6 +172,7 @@ public void process(final JCas aJCas) throws AnalysisEngineProcessException { log.debug("Document {} does not have any tokens", JCoReTools.getDocId(aJCas)); } try { + JCoReOverlapAnnotationIndex intRefIndex = new JCoReOverlapAnnotationIndex<>(aJCas, InternalReference.type); final AnnotationAdderHelper helper = new AnnotationAdderHelper(); log.trace("Sending document sentences to flair 
for entity tagging."); final NerTaggingResponse taggingResponse = connector.tagSentences(StreamSupport.stream(sentIndex.spliterator(), false)); @@ -178,6 +182,7 @@ public void process(final JCas aJCas) throws AnalysisEngineProcessException { final Sentence sentence = sentenceMap.get(entity.getDocumentId()); EntityMention em = (EntityMention) JCoReAnnotationTools.getAnnotationByClassName(aJCas, entityClass); helper.setAnnotationOffsetsRelativeToSentence(sentence, em, entity, adderConfig); + excludeReferenceAnnotationSpans(em, intRefIndex); em.setSpecificType(entity.getTag()); em.setConfidence(String.valueOf(entity.getLabelConfidence())); em.setComponentId(componentId); @@ -250,6 +255,24 @@ private void addTokenEmbeddings(JCas aJCas, Map sentenceMap, A } } + /** + * Internal references can actually look like a part of a gene, e.g. "filament19" where "19" is a reference. + * Exclude those spans from the gene mentions. + * @param a The gene annotation. + * @param intRefIndex The reference index. 
+ */ + private void excludeReferenceAnnotationSpans(Annotation a, JCoReOverlapAnnotationIndex intRefIndex) { + List annotationsInGene = intRefIndex.search(a); + for (Annotation overlappingAnnotation : annotationsInGene) { + if (overlappingAnnotation.getBegin() == a.getBegin()) { + a.setBegin(overlappingAnnotation.getEnd()); + } + if (overlappingAnnotation.getEnd() == a.getEnd()) { + a.setEnd(overlappingAnnotation.getBegin()); + } + } + } + @Override public void collectionProcessComplete() throws AnalysisEngineProcessException { try { diff --git a/jcore-flair-ner-ae/src/test/java/de/julielab/jcore/ae/flairner/FlairNerAnnotatorTest.java b/jcore-flair-ner-ae/src/test/java/de/julielab/jcore/ae/flairner/FlairNerAnnotatorTest.java index 9c5171fd6..56fc4d046 100644 --- a/jcore-flair-ner-ae/src/test/java/de/julielab/jcore/ae/flairner/FlairNerAnnotatorTest.java +++ b/jcore-flair-ner-ae/src/test/java/de/julielab/jcore/ae/flairner/FlairNerAnnotatorTest.java @@ -4,6 +4,7 @@ import de.julielab.jcore.types.Gene; import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; +import de.julielab.jcore.types.pubmed.InternalReference; import de.julielab.jcore.utility.index.Comparators; import de.julielab.jcore.utility.index.JCoReTreeMapAnnotationIndex; import de.julielab.jcore.utility.index.TermGenerators; @@ -43,7 +44,7 @@ public class FlairNerAnnotatorTest { @Test public void testAnnotatorWithoutWordEmbeddings() throws Exception { - final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types"); + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt"); String text = "Knockdown of SUB1 
homolog by siRNA inhibits the early stages of HIV-1 replication in 293T cells infected with VSV-G pseudotyped HIV-1 ."; jCas.setDocumentText(text); @@ -69,10 +70,39 @@ public void testAnnotatorWithoutWordEmbeddings() throws Exception { engine.collectionProcessComplete(); } + @Test + public void testAnnotatorWithoutWordEmbeddings2() throws Exception { + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); + final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt"); + String text = "Knockdown of SUB1 homolog2 by siRNA inhibits the early stages of HIV-1 replication in 293T cells infected with VSV-G pseudotyped HIV-1 ."; + jCas.setDocumentText(text); + Sentence s = new Sentence(jCas, 0, text.length()); + addTokens(jCas); + s.addToIndexes(); + new InternalReference(jCas, 25, 26).addToIndexes(); + engine.process(jCas); + List foundGenes = new ArrayList<>(); + JCoReTreeMapAnnotationIndex tokenIndex = new JCoReTreeMapAnnotationIndex<>(TermGenerators.longOffsetTermGenerator(), TermGenerators.longOffsetTermGenerator(), jCas, Token.type); + for (Annotation a : jCas.getAnnotationIndex(Gene.type)) { + Gene g = (Gene) a; + foundGenes.add(g.getCoveredText()); + assertThat(g.getSpecificType().equals("Gene")); + final Iterator tokenIt = tokenIndex.searchFuzzy(g).iterator(); + while (tokenIt.hasNext()) { + Token token = tokenIt.next(); + assertThat(token.getEmbeddingVectors()).isNull(); + } + assertThat(Double.parseDouble(g.getConfidence())).isGreaterThan(0.64); + assertThat(g.getComponentId().equals(FlairNerAnnotator.class.getSimpleName())); + } + assertThat(foundGenes).containsExactly("SUB1 homolog", "HIV-1", "VSV-G", "HIV-1"); + engine.collectionProcessComplete(); + } + @Test public void 
testAnnotatorWithEntityWordEmbeddings() throws Exception { embeddingsCache.clear(); - final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types"); + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_STORE_EMBEDDINGS, ENTITIES, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt", FlairNerAnnotator.PARAM_COMPONENT_ID, "ATotallyDifferentComponentId"); String text = "Knockdown of SUB1 homolog by siRNA inhibits the early stages of HIV-1 replication in 293T cells infected with VSV-G pseudotyped HIV-1 ."; jCas.setDocumentText(text); @@ -111,7 +141,7 @@ public void testAnnotatorWithEntityWordEmbeddings() throws Exception { @Test(dependsOnMethods = "testAnnotatorWithEntityWordEmbeddings") public void testAnnotatorWithEntitySubWordEmbeddings() throws Exception { - final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types"); + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_STORE_EMBEDDINGS, ENTITIES, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt"); String text = "Knockdown of SUB1 homolog by siRNA inhibits the early stages of HIV-1 replication in 293T cells infected with VSV-G pseudotyped HIV-1 ."; jCas.setDocumentText(text); @@ -179,7 +209,7 @@ private double l2Norm(double[] vector) { @Test public void testAnnotatorWithAllEmbeddings() throws Exception 
{ - final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types"); + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_STORE_EMBEDDINGS, FlairNerAnnotator.StoreEmbeddings.ALL, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt"); String text = "Knockdown of SUB1 homolog by siRNA inhibits the early stages of HIV-1 replication in 293T cells infected with VSV-G pseudotyped HIV-1 ."; jCas.setDocumentText(text); @@ -214,7 +244,7 @@ private void addSentences(JCas jCas) { @Test public void testAnnotator2() throws Exception { - final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types"); + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt"); // The sentence detection and tokenization was done by the jcore-j[st]bd-biomedical-english JCoRe project components, using the executable (java -jar) command line artifact created when building the components. 
String text = "Synergistic lethal effect between hydrogen peroxide and neocuproine ( 2,9-dimethyl 1,10-phenanthroline ) in Escherichia coli .\n" + @@ -241,7 +271,7 @@ public void testAnnotator2() throws Exception { @Test public void testAnnotatorOnOffsetIssueDocument() throws Exception { - final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types"); + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt"); XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "1681975.xmi").toString()), jCas.getCas()); @@ -259,7 +289,7 @@ public void testAnnotatorOnOffsetIssueDocument() throws Exception { @Test public void testEmbeddings2() throws Exception { - final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types"); + final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); final AnalysisEngine engine = AnalysisEngineFactory.createEngine(FlairNerAnnotator.class, FlairNerAnnotator.PARAM_ANNOTATION_TYPE, Gene.class.getCanonicalName(), FlairNerAnnotator.PARAM_FLAIR_MODEL, "src/test/resources/genes-small-model.pt", FlairNerAnnotator.PARAM_STORE_EMBEDDINGS, ENTITIES); // The sentence detection and tokenization was done by the jcore-j[st]bd-biomedical-english 
JCoRe project components, using the executable (java -jar) command line artifact created when building the components. String text = "We show that tal controls gene expression and tissue folding in Drosophila , thus acting as a link between patterning and morphogenesis .\n tal function is mediated by several 33-nucleotide-long open reading frames ( ORFs )"; From 7a2e154fa127e4eef2c2ce0384cc046126be9b83 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 24 Jan 2022 13:38:40 +0100 Subject: [PATCH 113/269] Fixes #127. --- jcore-pmc-reader/README.md | 4 +- .../jcore/multiplier/pmc/PMCMultiplier.java | 31 ++++++++- .../jcore/reader/pmc/CasPopulator.java | 5 +- .../jcore/reader/pmc/PMCMultiplierReader.java | 12 +++- .../julielab/jcore/reader/pmc/PMCReader.java | 3 +- .../jcore/reader/pmc/PMCReaderBase.java | 5 ++ .../pmc/parser/DefaultElementParser.java | 10 +-- .../reader/pmc/parser/NxmlDocumentParser.java | 2 +- .../pmc/desc/jcore-pmc-multiplier-reader.xml | 7 +++ .../reader/pmc/desc/jcore-pmc-reader.xml | 7 +++ .../elementproperties-no-bib-refs.yml | 63 +++++++++++++++++++ .../pmc/resources/elementproperties.yml | 1 - .../jcore/reader/pmc/PMCReaderTest.java | 49 +++++++++++++++ .../jcore-uri-multiplier-types.xml | 13 ++++ 14 files changed, 193 insertions(+), 19 deletions(-) create mode 100644 jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties-no-bib-refs.yml diff --git a/jcore-pmc-reader/README.md b/jcore-pmc-reader/README.md index 4fb82a46a..f42e43e76 100644 --- a/jcore-pmc-reader/README.md +++ b/jcore-pmc-reader/README.md @@ -102,7 +102,9 @@ The following properties are currently supported: | paths | list of objects | Allows to specify a relative or absolute XPath like sequence of element names in the form `abstract/sec/title` and properties that should be applied to elements matching this path. 
| | type | string | The UIMA type that should be used to annotate the text contents of the element | -The `attribute` and `path` properties define criteria where the base properties are overwritten by the properties specified in association with the given attribute-value combination or path. For example, it is possible to include a certain element for document text but omit it if has a specific element as parent or some attribute value. +The `attribute` and `path` properties define criteria where the base properties are overwritten by the properties specified in association with the given attribute-value combination or path. Attributes are addressed by specifying `name` and `value` keys. The `name` is the name of the attribute to test and `value` is the value the attribute must have for the property override to take effect. Paths require the `path` key followed by a slash-separated sequence of element names that ends with the name of the XML element for which the rule should hold. The path does not need to start from the root, it should just be long enough to identify the element distinctively. + +For example, it is possible to include a certain element for document text but omit it if it has a specific element as parent or some attribute value. 
Here is an example taken directly from the `elementproperties.yml` file: ```yml diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java index 38d52f4b8..f15b5d983 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java @@ -2,8 +2,10 @@ import de.julielab.jcore.reader.pmc.CasPopulator; import de.julielab.jcore.reader.pmc.NoDataAvailableException; +import de.julielab.jcore.reader.pmc.PMCMultiplierReader; import de.julielab.jcore.reader.pmc.parser.ElementParsingException; import de.julielab.jcore.types.casmultiplier.JCoReURI; +import de.julielab.jcore.types.casmultiplier.MultiplierConfigParameters; import org.apache.uima.analysis_component.JCasMultiplier_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.AbstractCas; @@ -12,6 +14,7 @@ import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.StringArray; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -21,13 +24,13 @@ import java.util.Iterator; @ResourceMetaData(name = "JCoRe Pubmed Central NXML Multiplier", description = "This multiplier expect to receive URIs to NXML documents in the form of JCoReURI feature structures. 
All JCoReURI FS in the annotation indexes are read and output as new CASes.") -@OperationalProperties(outputsNewCases = true, multipleDeploymentAllowed = true, modifiesCas = false) +@OperationalProperties(outputsNewCases = true, modifiesCas = false) @TypeCapability(outputs = {"de.julielab.jcore.types.TitleType", "de.julielab.jcore.types.Title", "de.julielab.jcore.types.TextObject", "de.julielab.jcore.types.Table", "de.julielab.jcore.types.SectionTitle", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.PubType", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.OtherPub", "de.julielab.jcore.types.pubmed.OtherID", "de.julielab.jcore.types.pubmed.ManualDescriptor", "de.julielab.jcore.types.Keyword", "de.julielab.jcore.types.Journal", "de.julielab.jcore.types.pubmed.Header", "de.julielab.jcore.types.Footnote", "de.julielab.jcore.types.Figure", "uima.tcas.DocumentAnnotation", "de.julielab.jcore.types.Date", "de.julielab.jcore.types.CaptionType", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.AutoDescriptor", "de.julielab.jcore.types.AuthorInfo", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection"}) public class PMCMultiplier extends JCasMultiplier_ImplBase { private final static Logger log = LoggerFactory.getLogger(PMCMultiplier.class); private Iterator currentUriBatch; private CasPopulator casPopulator; - + private Boolean omitBibReferences = null; @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { @@ -35,14 +38,36 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { if (log.isDebugEnabled()) log.debug("Received batch of {} NXML URIs", jcoreUris.size()); currentUriBatch = jcoreUris.stream().map(JCoReURI::getUri).map(URI::create).iterator(); + determineOmitBibReferences(aJCas); try { - casPopulator = new CasPopulator(currentUriBatch); + casPopulator = new CasPopulator(currentUriBatch, omitBibReferences); 
} catch (IOException e) { log.error("Exception occurred when trying to initialize the NXML parser", e); throw new AnalysisEngineProcessException(e); } } + private void determineOmitBibReferences(JCas aJCas) throws AnalysisEngineProcessException { + try { + MultiplierConfigParameters multiplierConfigParameters = JCasUtil.selectSingle(aJCas, MultiplierConfigParameters.class); + StringArray parameters = multiplierConfigParameters.getParameters(); + for (int i = 0; i < parameters.size(); ++i) { + String[] paramPair = parameters.get(i).split("\\s+=\\s+"); + if (paramPair.length != 2) { + String msg = "Error while parsing multiplier configuration parameters passed from the multiplier reader. The parameter array contains the entry \"" + parameters.get(i) + "\". The expected format is =."; + log.error(msg); + throw new AnalysisEngineProcessException(new IllegalArgumentException(msg)); + } + if (paramPair[0].equals(PMCMultiplierReader.PARAM_OMIT_BIB_REFERENCES)) { + omitBibReferences = Boolean.parseBoolean(paramPair[1]); + } + } + } catch (IllegalArgumentException e) { + omitBibReferences = false; + // nothing further; there were no parameters given + } + } + @Override public boolean hasNext() { diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java index 61e2851a5..d3b402b36 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java @@ -15,10 +15,11 @@ public class CasPopulator { private NxmlDocumentParser nxmlDocumentParser; private Iterator nxmlIterator; - public CasPopulator(Iterator nxmlIterator) throws IOException { + public CasPopulator(Iterator nxmlIterator, Boolean omitBibReferences) throws IOException { this.nxmlIterator = nxmlIterator; nxmlDocumentParser = new NxmlDocumentParser(); - 
nxmlDocumentParser.loadElementPropertyFile("/de/julielab/jcore/reader/pmc/resources/elementproperties.yml"); + String settings = omitBibReferences ? "/de/julielab/jcore/reader/pmc/resources/elementproperties-no-bib-refs.yml" : "/de/julielab/jcore/reader/pmc/resources/elementproperties.yml"; + nxmlDocumentParser.loadElementPropertyFile(settings); } public void populateCas(URI nxmlUri, JCas cas) throws ElementParsingException, NoDataAvailableException { diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCMultiplierReader.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCMultiplierReader.java index 5527a249c..4c349098c 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCMultiplierReader.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCMultiplierReader.java @@ -1,6 +1,7 @@ package de.julielab.jcore.reader.pmc; import de.julielab.jcore.types.casmultiplier.JCoReURI; +import de.julielab.jcore.types.casmultiplier.MultiplierConfigParameters; import org.apache.uima.UimaContext; import org.apache.uima.collection.CollectionException; import org.apache.uima.ducc.Workitem; @@ -8,6 +9,7 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.StringArray; import org.apache.uima.resource.ResourceInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -24,6 +26,7 @@ public class PMCMultiplierReader extends PMCReaderBase { public static final String PARAM_WHITELIST = PMCReaderBase.PARAM_WHITELIST; public static final String PARAM_SEND_CAS_TO_LAST = "SendCasToLast"; public static final String PARAM_BATCH_SIZE = "BatchSize"; + public static final String PARAM_OMIT_BIB_REFERENCES = PMCReaderBase.PARAM_OMIT_BIB_REFERENCES; private final static Logger log = LoggerFactory.getLogger(PMCMultiplierReader.class); @ConfigurationParameter(name = 
PARAM_SEND_CAS_TO_LAST, mandatory = false, defaultValue = "false", description = "UIMA DUCC relevant parameter when using a CAS multiplier. When set to true, the worker CAS from the collection reader is forwarded to the last component in the pipeline. This can be used to send information about the progress to the CAS consumer in order to have it perform batch operations. For this purpose, a feature structure of type WorkItem from the DUCC library is added to the worker CAS. This feature structure has information about the current progress.") private boolean sendCasToLast; @@ -51,9 +54,16 @@ public void getNext(JCas jCas) throws CollectionException { log.error("Exception with URI: " + uri.toString(), e); throw new CollectionException(e); } - completed++; } + // Send configuration parameters to the multiplier if necessary + if (omitBibReferences) { + MultiplierConfigParameters parameters = new MultiplierConfigParameters(jCas); + StringArray paramArray = new StringArray(jCas, 1); + paramArray.set(0, PMCReaderBase.PARAM_OMIT_BIB_REFERENCES+"="+omitBibReferences); + parameters.setParameters(paramArray); + parameters.addToIndexes(); + } if (sendCasToLast) { Workitem workitem = new Workitem(jCas); // Send the work item CAS also to the consumer. 
Normally, only the CASes emitted by the CAS multiplier diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReader.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReader.java index 921fc10b5..86a5fac26 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReader.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReader.java @@ -33,6 +33,7 @@ public class PMCReader extends PMCReaderBase { public static final String PARAM_SEARCH_ZIP = PMCReaderBase.PARAM_SEARCH_ZIP; public static final String PARAM_WHITELIST = PMCReaderBase.PARAM_WHITELIST; public static final String PARAM_EXTRACT_ID_FROM_FILENAME = PMCReaderBase.PARAM_EXTRACT_ID_FROM_FILENAME; + public static final String PARAM_OMIT_BIB_REFERENCES = PMCReaderBase.PARAM_OMIT_BIB_REFERENCES; private static final Logger log = LoggerFactory.getLogger(PMCReader.class); private CasPopulator casPopulator; @@ -40,7 +41,7 @@ public class PMCReader extends PMCReaderBase { public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); try { - casPopulator = new CasPopulator(pmcFiles); + casPopulator = new CasPopulator(pmcFiles, omitBibReferences); } catch (IOException e) { log.error("Exception occurred when trying to initialize NXML parser", e); throw new ResourceInitializationException(e); diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReaderBase.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReaderBase.java index a9fdd3890..73e16a0a0 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReaderBase.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/PMCReaderBase.java @@ -28,6 +28,7 @@ public abstract class PMCReaderBase extends JCasCollectionReader_ImplBase { public static final String PARAM_SEARCH_ZIP = "SearchInZipFiles"; public static final String PARAM_WHITELIST = "WhitelistFile"; public 
static final String PARAM_EXTRACT_ID_FROM_FILENAME = "ExtractIdFromFilename"; + public static final String PARAM_OMIT_BIB_REFERENCES = "OmitBibliographyReferences"; private final static Logger log = LoggerFactory.getLogger(PMCReaderBase.class); @ConfigurationParameter(name = PARAM_INPUT, description = "The path to an NXML file or a directory with NXML files and possibly subdirectories holding more NXML files.") protected File input; @@ -44,6 +45,9 @@ public abstract class PMCReaderBase extends JCasCollectionReader_ImplBase { @ConfigurationParameter(name = PARAM_EXTRACT_ID_FROM_FILENAME, mandatory = false, description = "Used for NXML documents that carry their ID in the file name but not in the document itself. Extracts the string after the last path separator and the first dot after the separator and sets it to the docId feature of the Header annotation.") protected boolean extractIdFromFilename; + @ConfigurationParameter(name = PARAM_OMIT_BIB_REFERENCES, mandatory = false, defaultValue = "false", description = "If set to true, references to the bibliography are omitted from the CAS text.") + protected boolean omitBibReferences; + protected Iterator pmcFiles; protected int completed; @@ -60,6 +64,7 @@ public void initialize(UimaContext context) throws ResourceInitializationExcepti searchRecursively = Optional.ofNullable((Boolean) getConfigParameterValue(PARAM_RECURSIVELY)).orElse(false); searchZip = Optional.ofNullable((Boolean) getConfigParameterValue(PARAM_SEARCH_ZIP)).orElse(false); whitelistFile = Optional.ofNullable((String) getConfigParameterValue(PARAM_WHITELIST)).map(File::new).orElse(null); + omitBibReferences = Optional.ofNullable((Boolean) getConfigParameterValue(PARAM_OMIT_BIB_REFERENCES)).orElse(false); log.info("Reading PubmedCentral NXML file(s) from {}", input); try { Set whitelist = readWhitelist(whitelistFile); diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/DefaultElementParser.java 
b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/DefaultElementParser.java index ac2f3cd23..42e1dc5a6 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/DefaultElementParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/DefaultElementParser.java @@ -54,7 +54,7 @@ public DefaultElementParser(NxmlDocumentParser nxmlDocumentParser) { @Override protected void beforeParseElement() throws ElementParsingException { - // since this parser does not know the element is is used upon, set + // since this parser does not know the element it is used upon, set // it first for the parsing result creation try { elementName = vn.toString(vn.getCurrentIndex()); @@ -138,12 +138,6 @@ protected void editResult(ElementParsingResult result) throws NavException { if (typeName.equals(ElementProperties.TYPE_NONE)) return; - // @SuppressWarnings("unchecked") - // Map defaultFeatureValues = (Map) - // nxmlDocumentParser - // .getTagProperties(elementName) - // .getOrDefault(ElementProperties.DEFAULT_FEATURE_VALUES, - // Collections.emptyMap()); @SuppressWarnings("unchecked") Map defaultFeatureValues = (Map) getApplicableProperties() .orElse(Collections.emptyMap()) @@ -276,8 +270,6 @@ private Optional> getApplicableProperties() throws NavExcept String attributeValue = attributesOfElement.get(attribute.get(ElementProperties.NAME)); if (attributeValue != null && attributeValue.equals(attribute.get(ElementProperties.VALUE)) && attribute.containsKey(ElementProperties.OMIT_ELEMENT)) { - // omitElement = (boolean) - // attribute.get(ElementProperties.OMIT_ELEMENT); applicableProperties = Optional.of(attribute); } } diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java index 2042b258c..9f75ba8db 100644 --- 
a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java @@ -86,7 +86,7 @@ public void reset(InputStream is, JCas cas) throws DocumentParsingException { * @throws NavException * @throws DocTypeNotFoundException */ - private void setTagset() throws NavException, DocTypeNotFoundException, DocTypeNotSupportedException { + private void setTagset() throws NavException, DocTypeNotFoundException { for (int i = 0; i < vn.getTokenCount(); i++) { if (vn.getTokenType(i) == VTDNav.TOKEN_DTD_VAL) { String docType = StringUtils.normalizeSpace(vn.toString(i)).replaceAll("'", "\""); diff --git a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-multiplier-reader.xml b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-multiplier-reader.xml index 224b668eb..5f1655fc7 100644 --- a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-multiplier-reader.xml +++ b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-multiplier-reader.xml @@ -49,6 +49,13 @@ false false + + OmitBibliographyReferences + If set to true, references to the bibliography are omitted from the CAS text. + Boolean + false + false + diff --git a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-reader.xml b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-reader.xml index 284f41cdd..478806bfb 100644 --- a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-reader.xml +++ b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/desc/jcore-pmc-reader.xml @@ -42,6 +42,13 @@ false false + + OmitBibliographyReferences + If set to true, references to the bibliography are omitted from the CAS text. 
+ Boolean + false + false + diff --git a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties-no-bib-refs.yml b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties-no-bib-refs.yml new file mode 100644 index 000000000..16d5355bb --- /dev/null +++ b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties-no-bib-refs.yml @@ -0,0 +1,63 @@ +article-title: + block-element: true + type: de.julielab.jcore.types.Title +title: + block-element: true + type: de.julielab.jcore.types.Title + default-feature-values: + titleType: other + paths: + - path: sec/title + type: de.julielab.jcore.types.SectionTitle + default-feature-values: + titleType: section + - path: abstract/sec/title + type: de.julielab.jcore.types.AbstractSectionHeading + default-feature-values: + titleType: abstractSection +abstract: + block-element: true + type: de.julielab.jcore.types.AbstractText +label: + block-element: true + type: de.julielab.jcore.types.Title + paths: + - path: list-item/label + omit-element: true +sec: + block-element: true + type: de.julielab.jcore.types.Section + paths: + - path: abstract/sec + type: de.julielab.jcore.types.AbstractSection +p: + block-element: true + type: de.julielab.jcore.types.Paragraph +list: + block-element: true + type: de.julielab.jcore.types.List +list-item: + block-element: true + type: de.julielab.jcore.types.ListItem +caption: + block-element: true + type: de.julielab.jcore.types.Caption + default-feature-values: + captionType: other +fn-group: + block-element: true +front: + omit-element: true +back: + omit-element: true +fig-group: + omit-element: true +floats-group: + omit-element: true +array: + omit-element: true +xref: + attributes: + - name: ref-type + value: bibr + omit-element: true \ No newline at end of file diff --git a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties.yml 
b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties.yml index f8b5d3429..230bbf929 100644 --- a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties.yml +++ b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties.yml @@ -56,4 +56,3 @@ floats-group: omit-element: true array: omit-element: true - diff --git a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCReaderTest.java b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCReaderTest.java index 7d5547754..3a79a51e8 100644 --- a/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCReaderTest.java +++ b/jcore-pmc-reader/src/test/java/de/julielab/jcore/reader/pmc/PMCReaderTest.java @@ -23,6 +23,8 @@ import java.util.List; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -359,6 +361,53 @@ public void testFigureReferencesAnnotated() throws Exception { assertThat(figRefs).extracting("refid").containsExactly("Fig1", "Fig2"); } + @Test + public void testBibliographyReferencesAnnotated() throws Exception { + JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); + CollectionReader reader = CollectionReaderFactory.createReader(PMCReader.class, PMCReader.PARAM_INPUT, + "src/test/resources/documents-recursive/PMC2847692.nxml.gz"); + reader.getNext(cas.getCas()); + Collection refs = JCasUtil.select(cas, InternalReference.class); + // Without a filter on bibliographic references, there should be 76 references to bibliography + List bibliography = refs.stream().filter(r -> r.getReftype().equalsIgnoreCase("bibliography")).collect(Collectors.toList()); + assertThat(bibliography).hasSize(76); + + // RegEx for something like "2004a" + Matcher yearReferenceMatcher =
Pattern.compile("[0-9]{4}[ab]?").matcher(cas.getDocumentText()); + int numReferencePatternsInText = 0; + while (yearReferenceMatcher.find()) { + ++numReferencePatternsInText; + } + // Some found patterns are no references, thus the number is higher than that of the references. + assertThat(numReferencePatternsInText).isEqualTo(84); + } + + @Test + public void testBibliographyReferencesOmitted() throws Exception { + JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types", + "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); + CollectionReader reader = CollectionReaderFactory.createReader(PMCReader.class, PMCReader.PARAM_INPUT, + "src/test/resources/documents-recursive/PMC2847692.nxml.gz", + PMCMultiplierReader.PARAM_OMIT_BIB_REFERENCES, true); + reader.getNext(cas.getCas()); + Collection refs = JCasUtil.select(cas, InternalReference.class); + // Since we set the omission parameter to true, there should be no bibliographic references + List bibliography = refs.stream().filter(r -> r.getReftype().equalsIgnoreCase("bibliography")).collect(Collectors.toList()); + assertThat(bibliography).isEmpty(); + + // RegEx for something like "2004a" + Matcher yearReferenceMatcher = Pattern.compile("[0-9]{4}[ab]?").matcher(cas.getDocumentText()); + int numReferencePatternsInText = 0; + while (yearReferenceMatcher.find()) { + ++numReferencePatternsInText; + } + // In the test above, where we have the same document but with bib. references, there were 84 occurrences + // of the pattern. 76 of those were actual references. Thus, after removing the references, 8 pattern + // occurrences should remain. 
+ assertThat(numReferencePatternsInText).isEqualTo(8); + } + @Test public void testPmcReaderDescriptor() throws Exception { // read a whole directory with subdirectories diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-uri-multiplier-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-uri-multiplier-types.xml index 5f6a3459b..fe06797e8 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-uri-multiplier-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/casmultiplier/jcore-uri-multiplier-types.xml @@ -19,5 +19,18 @@ + + de.julielab.jcore.types.casmultiplier.MultiplierConfigParameters + A list of strings in properties format to specify parameters and their values. The format is <key>=<value>. May be used to transfer configuration properties from the multiplier reader to the multiplier. + Normally in UIMA, the multiplier would just have the configuration parameters itself. However, it can be confusing that the basic reader - without a successive multiplier - has some parameters that the multiplier reader does not exhibit because they must be set on the multiplier. Using this annotation, parameter settings can be sent to the multiplier which then does not need further configuration on its own. + uima.tcas.Annotation + + + parameters + An array of strings holding key-value pairs in the format <key>=<value>. + uima.cas.StringArray + + + \ No newline at end of file From 7aa1b354c789597b300460474cf8ce9f2b474ef6 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 24 Jan 2022 14:13:09 +0100 Subject: [PATCH 114/269] Remove gene annotations completely covered by an internal reference annotation span.
--- .../src/main/java/banner/tagging/pipe/LemmaPOS.java | 2 -- .../de/julielab/jcore/ae/banner/BANNERAnnotator.java | 9 ++++++++- .../julielab/jcore/ae/flairner/FlairNerAnnotator.java | 11 +++++++++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java b/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java index 8068cfa1b..36e8a7cd5 100644 --- a/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java +++ b/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java @@ -43,13 +43,11 @@ public LemmaPOS(Lemmatiser lemmatiser, Tagger posTagger) { public void setLemmatiser(Lemmatiser lemmatiser) { initResourcesMap(); getResources().lemmatiser = lemmatiser; - System.out.println("Setting lemmatiser to " + Thread.currentThread()); } public void setPosTagger(Tagger posTagger) { initResourcesMap(); getResources().posTagger = posTagger; - System.out.println("Setting PoS Tagger to " + Thread.currentThread()); } synchronized private void initResourcesMap() { diff --git a/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java b/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java index 0d8837ff6..1b48675d2 100644 --- a/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java +++ b/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java @@ -174,7 +174,7 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { a.setEnd(originalEnd); excludeReferenceAnnotationSpans(a, intRefIndex); if (a.getEnd() <= a.getBegin()) { - log.error("After removing internal reference spans from the gene, it has no positive span any more. The original text was {} with offsets {}-{}. 
The new offsets are {}-{}.", jcas.getDocumentText().substring(originalBegin, originalEnd), originalBegin, originalEnd, a.getBegin(), a.getEnd()); + // It seems there was nothing left of a gene mention outside the internal reference; skip continue; } if (a instanceof de.julielab.jcore.types.Annotation) { @@ -211,6 +211,13 @@ private void excludeReferenceAnnotationSpans(Annotation a, JCoReOverlapAnnotatio if (overlappingAnnotation.getEnd() == a.getEnd()) { a.setEnd(overlappingAnnotation.getBegin()); } + // Set zero-character spans on genes that are completely enclosed by a reference. Those are cases + // like, for instance, "Supplementary Figs. S12 and S13, Tables S2 and S3" where S12, S13 and even + // Tables S2 are annotated as genes. + if (overlappingAnnotation.getBegin() <= a.getBegin() && overlappingAnnotation.getEnd() >= a.getEnd()) { + a.setBegin(0); + a.setEnd(0); + } } } } diff --git a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java index 8154b0f5c..76184a17a 100644 --- a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java +++ b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java @@ -183,6 +183,10 @@ public void process(final JCas aJCas) throws AnalysisEngineProcessException { EntityMention em = (EntityMention) JCoReAnnotationTools.getAnnotationByClassName(aJCas, entityClass); helper.setAnnotationOffsetsRelativeToSentence(sentence, em, entity, adderConfig); excludeReferenceAnnotationSpans(em, intRefIndex); + if (em.getEnd() <= em.getBegin()) { + // It seems there was nothing left of a gene mention outside the internal reference; skip + continue; + } em.setSpecificType(entity.getTag()); em.setConfidence(String.valueOf(entity.getLabelConfidence())); em.setComponentId(componentId); @@ -270,6 +274,13 @@ private void excludeReferenceAnnotationSpans(Annotation a, 
JCoReOverlapAnnotatio if (overlappingAnnotation.getEnd() == a.getEnd()) { a.setEnd(overlappingAnnotation.getBegin()); } + // Set zero-character spans on genes that are completely enclosed by a reference. Those are cases + // like, for instance, "Supplementary Figs. S12 and S13, Tables S2 and S3" where S12, S13 and even + // Tables S2 are annotated as genes. + if (overlappingAnnotation.getBegin() <= a.getBegin() && overlappingAnnotation.getEnd() >= a.getEnd()) { + a.setBegin(0); + a.setEnd(0); + } } } From 7f0a16e7015d2a351f6f9906612cf17e894273b6 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 3 Feb 2022 15:41:52 +0100 Subject: [PATCH 115/269] Remove gene annotations when the covered text is blank. The previous effort to remove internal references from gene spans sometimes resulted in blank gene names. --- .../main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java | 2 +- .../java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java b/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java index 1b48675d2..b5c7e816e 100644 --- a/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java +++ b/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java @@ -173,7 +173,7 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { a.setBegin(originalBegin); a.setEnd(originalEnd); excludeReferenceAnnotationSpans(a, intRefIndex); - if (a.getEnd() <= a.getBegin()) { + if (a.getEnd() <= a.getBegin() || a.getCoveredText().isBlank()) { // It seems there was nothing left of a gene mention outside the internal reference; skip continue; } diff --git a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java index 76184a17a..de2382319 
100644 --- a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java +++ b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java @@ -183,7 +183,7 @@ public void process(final JCas aJCas) throws AnalysisEngineProcessException { EntityMention em = (EntityMention) JCoReAnnotationTools.getAnnotationByClassName(aJCas, entityClass); helper.setAnnotationOffsetsRelativeToSentence(sentence, em, entity, adderConfig); excludeReferenceAnnotationSpans(em, intRefIndex); - if (em.getEnd() <= em.getBegin()) { + if (em.getEnd() <= em.getBegin() || em.getCoveredText().isBlank()) { // It seems there was nothing left of a gene mention outside the internal reference; skip continue; } From 3159c5760252e946159b0c6f3e807b17e981f05a Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 3 Feb 2022 15:42:41 +0100 Subject: [PATCH 116/269] Fix a regular expression bug where the PMC multiplier could not receive its parameters. --- .../java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java index f15b5d983..e4b80fac7 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCMultiplier.java @@ -52,7 +52,7 @@ private void determineOmitBibReferences(JCas aJCas) throws AnalysisEngineProcess MultiplierConfigParameters multiplierConfigParameters = JCasUtil.selectSingle(aJCas, MultiplierConfigParameters.class); StringArray parameters = multiplierConfigParameters.getParameters(); for (int i = 0; i < parameters.size(); ++i) { - String[] paramPair = parameters.get(i).split("\\s+=\\s+"); + String[] paramPair = parameters.get(i).split("\\s*=\\s*"); if (paramPair.length != 2) { String msg = "Error while parsing 
multiplier configuration parameters passed from the multiplier reader. The parameter array contains the entry \"" + parameters.get(i) + "\". The expected format is =."; log.error(msg); From fc2c1f710b13fbe224e1f306acb92250ed93b720 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 3 Feb 2022 15:44:17 +0100 Subject: [PATCH 117/269] Allow relative file paths. --- .../PersistentIndexAddonTermsProvider.java | 8 +++++++- .../sharedresources/PersistentStringIndexMapProvider.java | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentIndexAddonTermsProvider.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentIndexAddonTermsProvider.java index b98514ee3..18d45b5b0 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentIndexAddonTermsProvider.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentIndexAddonTermsProvider.java @@ -120,7 +120,13 @@ public void load(DataResource aData) throws ResourceInitializationException { File indexFile = null; boolean loadData = true; try { - File resourceFile = new File(uri); + File resourceFile; + try { + resourceFile = new File(uri); + } catch (IllegalArgumentException e) { + // to support relative file paths like file:resources/somefile.txt + resourceFile = new File(uri.getSchemeSpecificPart()); + } String resourceFileName = FilenameUtils.getName(uri.toURL().getPath()); indexFile = new File("es-consumer-cache", resourceFileName); if (resourceFile.exists() && indexFile.exists() && resourceFile.lastModified() > indexFile.lastModified()) { diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java 
b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java index 2551cedea..39994dc9c 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/sharedresources/PersistentStringIndexMapProvider.java @@ -122,7 +122,13 @@ public void load(DataResource aData) throws ResourceInitializationException { File indexFile = null; boolean loadData = true; try { - File resourceFile = new File(uri); + File resourceFile; + try { + resourceFile = new File(uri); + } catch (IllegalArgumentException e) { + // to support relative file paths like file:resources/somefile.txt + resourceFile = new File(uri.getSchemeSpecificPart()); + } String resourceFileName = FilenameUtils.getName(uri.toURL().getPath()); indexFile = new File("es-consumer-cache", resourceFileName); if (resourceFile.exists() && indexFile.exists() && resourceFile.lastModified() > indexFile.lastModified()) { From bf31e93091fc3edf9b071e6335e5ff95b289a5a1 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 3 Feb 2022 15:45:11 +0100 Subject: [PATCH 118/269] Add the actual flattened event type to the event flattener descriptor. 
--- .../jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/jcore-event-flattener-ae/src/main/resources/de/julielab/jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml b/jcore-event-flattener-ae/src/main/resources/de/julielab/jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml index ff351724b..7e3a1f520 100644 --- a/jcore-event-flattener-ae/src/main/resources/de/julielab/jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml +++ b/jcore-event-flattener-ae/src/main/resources/de/julielab/jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml @@ -13,6 +13,7 @@ + From 37f37d4ff3a945c56cc716ac6589140bf7b6c425 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 10 Feb 2022 11:55:30 +0100 Subject: [PATCH 119/269] PMCReader: Fix a bug where figure captions were labeled as table captions --- .../java/de/julielab/jcore/reader/pmc/parser/FigParser.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FigParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FigParser.java index 9149d8af9..428903fbb 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FigParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FigParser.java @@ -30,7 +30,7 @@ public FigParser(NxmlDocumentParser nxmlDocumentParser) { @Override protected void parseElement(ElementParsingResult figResult) throws ElementParsingException { try { - Optional tableWrapId = getXPathValue("@id"); + Optional figureId = getXPathValue("@id"); Optional labelResult = parseXPath("label"); Optional labelString = getXPathValue("label"); Optional captionResult = parseXPath("caption"); @@ -38,7 +38,7 @@ protected void parseElement(ElementParsingResult figResult) throws ElementParsin captionResult.ifPresent(r -> { ElementParsingResult result = (ElementParsingResult) r; Caption caption = 
(Caption) result.getAnnotation(); - caption.setCaptionType("table"); + caption.setCaptionType("figure"); figResult.addSubResult(r); }); labelResult.ifPresent(figResult::addSubResult); @@ -52,7 +52,7 @@ protected void parseElement(ElementParsingResult figResult) throws ElementParsin labelString.ifPresent(figure::setObjectLabel); captionResult.map(r -> (Caption) ((ElementParsingResult) r).getAnnotation()) .ifPresent(figure::setObjectCaption); - tableWrapId.ifPresent(figure::setObjectId); + figureId.ifPresent(figure::setObjectId); figResult.setAnnotation(figure); } catch (NavException | XPathParseException | XPathEvalException e) { From 14a04cdcbec913d6e58aab11d093e90d56132dd5 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 14 Feb 2022 17:18:10 +0100 Subject: [PATCH 120/269] Add the PMCDBMultiplier. The PMCReader was also adapted to work directly with input streams. --- jcore-pmc-db-reader/LICENSE | 26 +++ jcore-pmc-db-reader/README.md | 34 +++ jcore-pmc-db-reader/component.meta | 21 ++ jcore-pmc-db-reader/pom.xml | 71 ++++++ .../jcore/multiplier/pmc/PMCDBMultiplier.java | 221 ++++++++++++++++++ .../pmc/desc/jcore-pmc-db-multiplier.xml | 71 ++++++ .../PMCDBMultiplierHashComparisonTest.java | 213 +++++++++++++++++ .../multiplier/pmc/PMCDBMultiplierTest.java | 103 ++++++++ .../test/resources/testdocs/PMC6949206.xml | 6 + .../test/resources/testdocs/PMC7511315.xml | 28 +++ jcore-pmc-reader/LICENSE | 2 +- .../jcore/reader/pmc/CasPopulator.java | 17 ++ .../jcore/reader/pmc/NXMLURIIterator.java | 11 +- .../reader/pmc/parser/NxmlDocumentParser.java | 2 +- .../src/main/resources/LICENSE.txt | 2 +- 15 files changed, 822 insertions(+), 6 deletions(-) create mode 100644 jcore-pmc-db-reader/LICENSE create mode 100644 jcore-pmc-db-reader/README.md create mode 100644 jcore-pmc-db-reader/component.meta create mode 100644 jcore-pmc-db-reader/pom.xml create mode 100644 jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java create mode 100644 
jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier.xml create mode 100644 jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierHashComparisonTest.java create mode 100644 jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierTest.java create mode 100644 jcore-pmc-db-reader/src/test/resources/testdocs/PMC6949206.xml create mode 100644 jcore-pmc-db-reader/src/test/resources/testdocs/PMC7511315.xml diff --git a/jcore-pmc-db-reader/LICENSE b/jcore-pmc-db-reader/LICENSE new file mode 100644 index 000000000..d0f946a29 --- /dev/null +++ b/jcore-pmc-db-reader/LICENSE @@ -0,0 +1,26 @@ +BSD 2-Clause License + +Copyright (c) 2022, JULIE Lab +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/jcore-pmc-db-reader/README.md b/jcore-pmc-db-reader/README.md new file mode 100644 index 000000000..f97bc30d2 --- /dev/null +++ b/jcore-pmc-db-reader/README.md @@ -0,0 +1,34 @@ +# JCoRe Pubmed Central DB Reader + +**Descriptor Path**: +``` +de.julielab.jcore.multiplier.pmc.desc.jcore-pmc-db-multiplier +``` + +JeDIS database reader for PMC base documents. + + + +**1. Parameters** + +| Parameter Name | Parameter Type | Mandatory | Multivalued | Description | +|----------------|----------------|-----------|-------------|-------------| +| param1 | UIMA-Type | Boolean | Boolean | Description | +| param2 | UIMA-Type | Boolean | Boolean | Description | + +**2. Predefined Settings** + +| Parameter Name | Parameter Syntax | Example | +|----------------|------------------|---------| +| param1 | Syntax-Description | `Example` | +| param2 | Syntax-Description | `Example` | + +**3. Capabilities** + +| Type | Input | Output | +|------|:-----:|:------:| +| de.julielab.jcore.types.TYPE | | `+` | +| de.julielab.jcore.types.ace.TYPE | `+` | | + + +[1] Some Literature?
diff --git a/jcore-pmc-db-reader/component.meta b/jcore-pmc-db-reader/component.meta new file mode 100644 index 000000000..c57c78fa7 --- /dev/null +++ b/jcore-pmc-db-reader/component.meta @@ -0,0 +1,21 @@ +{ + "categories": [ + "multiplier", + "reader" + ], + "description": "JeDIS database reader for PMC base documents.", + "descriptors": [ + { + "category": "multiplier", + "location": "de.julielab.jcore.multiplier.pmc.desc.jcore-pmc-db-multiplier" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-pmc-db-reader", + "groupId": "de.julielab", + "version": "2.6.0-SNAPSHOT" + }, + "name": "JCoRe Pubmed Central DB Reader" +} diff --git a/jcore-pmc-db-reader/pom.xml b/jcore-pmc-db-reader/pom.xml new file mode 100644 index 000000000..21d363909 --- /dev/null +++ b/jcore-pmc-db-reader/pom.xml @@ -0,0 +1,71 @@ + + + + 4.0.0 + jcore-pmc-db-reader + jar + de.julielab + + + de.julielab + jedis-parent + 2.6.0-SNAPSHOT + ../jedis-parent + + + + + ch.qos.logback + logback-classic + test + + + org.slf4j + slf4j-api + + + de.julielab + jcore-descriptor-creator + + + de.julielab + jcore-db-reader + 2.6.0-SNAPSHOT + + + de.julielab + jcore-pmc-reader + ${project.parent.version} + + + de.julielab + jcore-types + ${jcore-types-version} + + + org.junit.jupiter + junit-jupiter-engine + + + de.julielab + jcore-db-test-utilities + + + org.assertj + assertj-core + + + de.julielab + jcore-utilities + ${jcore-utilities-version} + + + JCoRe Pubmed Central DB Reader + + JULIE Lab Jena, Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-pmc-db-reader + JeDIS database reader for PMC base documents. 
+ diff --git a/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java b/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java new file mode 100644 index 000000000..551b8dacb --- /dev/null +++ b/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java @@ -0,0 +1,221 @@ +package de.julielab.jcore.multiplier.pmc; + +import de.julielab.costosys.configuration.FieldConfig; +import de.julielab.costosys.dbconnection.CoStoSysConnection; +import de.julielab.jcore.reader.db.DBMultiplier; +import de.julielab.jcore.reader.db.DBReader; +import de.julielab.jcore.reader.pmc.CasPopulator; +import de.julielab.jcore.reader.pmc.NoDataAvailableException; +import de.julielab.jcore.reader.pmc.PMCReaderBase; +import de.julielab.jcore.reader.pmc.parser.ElementParsingException; +import de.julielab.jcore.types.casflow.ToVisit; +import de.julielab.jcore.types.casmultiplier.RowBatch; +import de.julielab.jcore.types.pubmed.Header; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.AbstractCas; +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.StringArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.*; +import java.util.stream.Collectors; + +public class PMCDBMultiplier extends DBMultiplier { + public static final String PARAM_OMIT_BIB_REFERENCES = PMCReaderBase.PARAM_OMIT_BIB_REFERENCES; + public static final String 
PARAM_ADD_SHA_HASH = "AddShaHash"; + public static final String PARAM_TABLE_DOCUMENT = "DocumentTable"; + public static final String PARAM_TABLE_DOCUMENT_SCHEMA = "DocumentTableSchema"; + public static final String PARAM_TO_VISIT_KEYS = "ToVisitKeys"; + protected static final byte[] comma = ",".getBytes(); + private final static Logger log = LoggerFactory.getLogger(PMCDBMultiplier.class); + @ConfigurationParameter(name = PARAM_OMIT_BIB_REFERENCES, mandatory = false, defaultValue = "false", description = "If set to true, references to the bibliography are omitted from the CAS text.") + protected boolean omitBibReferences; + @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, mandatory = false, description = "For use with AnnotationDefinedFlowController. Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS skips all component except the DBCheckpointAE to mark the document as processed.") + private String documentItemToHash; + @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT, mandatory = false, description = "For use with AnnotationDefinedFlowController. String parameter indicating the name of the " + + "table where the XMI data and, thus, the hash is stored. The name must be schema qualified. Note that in this component, only the ToVisit annotation is created that determines which components to apply to a CAS with matching (unchanged) hash. The logic to actually control the CAS flow is contained in the AnnotationDefinedFlowController.") + private String xmiStorageDataTable; + @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT_SCHEMA, mandatory = false, description = "For use with AnnotationDefinedFlowController. 
The name of the schema that the document table - given with the " + PARAM_TABLE_DOCUMENT + " parameter - adheres to. Only the primary key part is required for hash value retrieval.") + private String xmiStorageDataTableSchema; + @ConfigurationParameter(name = PARAM_TO_VISIT_KEYS, mandatory = false, description = "For use with AnnotationDefinedFlowController. The delegate AE keys of the AEs this CAS should still applied on although the hash has not changed. Can be null or empty indicating that no component should be applied to the CAS. This is, however, the task of the AnnotationDefinedFlowController.") + private String[] toVisitKeys; + + private CasPopulator casPopulator; + private Map docId2HashMap; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException { + super.initialize(aContext); + xmiStorageDataTable = (String) aContext.getConfigParameterValue(PARAM_TABLE_DOCUMENT); + xmiStorageDataTableSchema = (String) aContext.getConfigParameterValue(PARAM_TABLE_DOCUMENT_SCHEMA); + documentItemToHash = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_ADD_SHA_HASH)).orElse("document_text"); + toVisitKeys = (String[]) aContext.getConfigParameterValue(PARAM_TO_VISIT_KEYS); + omitBibReferences = Optional.ofNullable((Boolean) aContext.getConfigParameterValue(PARAM_OMIT_BIB_REFERENCES)).orElse(false); + // We don't know yet which tables to read. Thus, we leave the row mapping out. + // We will know once DBMultiplier#process(JCas) has been run. + initialized = false; + + if (!(xmiStorageDataTable == null && xmiStorageDataTableSchema == null) && !(xmiStorageDataTable != null && xmiStorageDataTableSchema != null && documentItemToHash != null)) { + String errorMsg = String.format("From the parameters '%s' and '%s' some are specified and some aren't. To activate hash value comparison in order to add aggregate component keys for CAS visit, specify all those parameters. 
Otherwise, specify none.", PARAM_TABLE_DOCUMENT, PARAM_TABLE_DOCUMENT_SCHEMA); + log.error(errorMsg); + throw new ResourceInitializationException(new IllegalArgumentException(errorMsg)); + } + + try { + casPopulator = new CasPopulator(omitBibReferences); + } catch (IOException e) { + String errorMsg = "Could not initialize the PMC CasPopulator."; + log.error(errorMsg); + throw new ResourceInitializationException(e); + } + } + + @Override + public void process(JCas aJCas) throws AnalysisEngineProcessException { + super.process(aJCas); + docId2HashMap = fetchCurrentHashesFromDatabase(JCasUtil.selectSingle(aJCas, RowBatch.class)); + } + + @Override + public AbstractCas next() throws AnalysisEngineProcessException { + JCas jCas = getEmptyJCas(); + try { + if (documentDataIterator.hasNext()) { + byte[][] documentData = documentDataIterator.next(); + String pkString = DBReader.setDBProcessingMetaData(dbc, readDataTable, tableName, documentData, jCas); + populateCas(jCas, documentData, pkString); + setToVisitAnnotation(jCas, pkString); + } + } catch (Exception e) { + log.error("Exception occurred: ", e); + throw new AnalysisEngineProcessException(e); + } + return jCas; + } + + private void populateCas(JCas jCas, byte[][] documentData, String pkString) throws NoDataAvailableException, ElementParsingException { + List pkIndices = dbc.getPrimaryKeyIndices(); + + // get index of xmlData; + // assumes that only one byte[] in arrayArray contains this data + // and that this byte[] is at the only index position that holds no + // primary key + List allIndices = new ArrayList(); + for (int i = 0; i < documentData.length; i++) { + allIndices.add(i); + } + List xmlIndices = new ArrayList<>(allIndices); + for (Integer pkIndex : pkIndices) + xmlIndices.remove(pkIndex); + int xmlIndex = xmlIndices.get(0); + try { + casPopulator.populateCas(new ByteArrayInputStream(documentData[xmlIndex]), jCas); + } catch (Exception e) { + log.error("Could not parse document {}.", pkString, e); + 
throw e; + } + // It actually happens that some PMC XML documents do not contain their own ID. We can use the ID obtained + // via the database primary key, which in turn might be derived from the original file name or some meta file. + Header header = JCasUtil.selectSingle(jCas, Header.class); + if (header.getDocId().isBlank()) { + log.debug("Document has no docId set. Derived the ID {} from the primary key and setting it as the Header#docId feature.", pkString); + header.setDocId(pkString); + } + } + + /** + *

Fetches the hashes of the currently stored documents in the database.

+ * + * @param rowBatch The annotation specifying which documents should be fetched by the multiplier and then be processed by the aggregate. + * @return A map from a string representation of the RowBatches document IDs to the hashes for the respective IDs. + * @throws AnalysisEngineProcessException If the SQL request fails. + */ + private Map fetchCurrentHashesFromDatabase(RowBatch rowBatch) throws AnalysisEngineProcessException { + if (xmiStorageDataTable != null && dbc.tableExists(xmiStorageDataTable) && rowBatch.getIdentifiers() != null && rowBatch.getIdentifiers().size() > 0) { + String hashColumn = documentItemToHash + "_sha256"; + // Extract the document IDs in this RowBatch. The IDs could be composite keys. + List documentIds = new ArrayList<>(rowBatch.getIdentifiers().size()); + Iterator documentIDsIt = rowBatch.getIdentifiers().iterator(); + while (documentIDsIt.hasNext()) { + StringArray pkArray = (StringArray) documentIDsIt.next(); + documentIds.add(pkArray.toStringArray()); + } + Map id2hash = new HashMap<>(documentIds.size()); + // This is the map we want to fill that lets us look up the hash of the document text by document ID. + String sql = null; + // Query the database for the document IDs in the current RowBatch and retrieve hashes. 
+ try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { + FieldConfig xmiTableSchema = dbc.getFieldConfiguration(xmiStorageDataTableSchema); + String idQuery = documentIds.stream() + .map(key -> Arrays.stream(key).map(part -> "%s='" + part + "'").toArray(String[]::new)) + .map(xmiTableSchema::expandPKNames).map(expandedKeys -> String.join(" AND ", expandedKeys)) + .collect(Collectors.joining(" OR ")); + sql = String.format("SELECT %s,%s FROM %s WHERE %s", xmiTableSchema.getPrimaryKeyString(), hashColumn, xmiStorageDataTable, idQuery); + ResultSet rs = conn.createStatement().executeQuery(sql); + while (rs.next()) { + StringBuilder pkSb = new StringBuilder(); + for (int i = 0; i < xmiTableSchema.getPrimaryKey().length; i++) + pkSb.append(rs.getString(i + 1)).append(','); + // Remove trailing comma + pkSb.deleteCharAt(pkSb.length() - 1); + String hash = rs.getString(xmiTableSchema.getPrimaryKey().length + 1); + id2hash.put(pkSb.toString(), hash); + } + } catch (SQLException e) { + log.error("Could not retrieve hashes from the database. SQL query was '{}':", sql, e); + throw new AnalysisEngineProcessException(e); + } + return id2hash; + } + return null; + } + + /** + *

Creates a {@link ToVisit} annotation based on document text hash comparison and the defined parameter values.

+ *

Computes the hash of the newly read CAS and compares it to the hash for the same document retrieved from the + * database, if present. If there was a hash in the database and the hash values are equal, creates the ToVisit + * annotation and adds the toVisitKeys passed in the configuration of this component.

+ * + * @param jCas The newly read JCas. + * @param pkString + */ + private void setToVisitAnnotation(JCas jCas, String pkString) { + if (xmiStorageDataTable != null && dbc.tableExists(xmiStorageDataTable)) { + String existingHash = docId2HashMap.get(pkString); + if (existingHash != null) { + String newHash = getHash(jCas); + if (existingHash.equals(newHash)) { + if (log.isTraceEnabled()) + log.trace("Document {} has a document text hash that equals the one present in the database. Creating a ToVisit annotation routing it only to the components with delegate keys {}.", pkString, toVisitKeys); + ToVisit toVisit = new ToVisit(jCas); + if (toVisitKeys != null && toVisitKeys.length != 0) { + StringArray keysArray = new StringArray(jCas, toVisitKeys.length); + keysArray.copyFromArray(toVisitKeys, 0, 0, toVisitKeys.length); + toVisit.setDelegateKeys(keysArray); + } + toVisit.addToIndexes(); + } + } else { + log.trace("No existing hash was found for document {}", pkString); + } + } + } + + private String getHash(JCas newCas) { + final String documentText = newCas.getDocumentText(); + final byte[] sha = DigestUtils.sha256(documentText.getBytes()); + return Base64.encodeBase64String(sha); + } +} diff --git a/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier.xml b/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier.xml new file mode 100644 index 000000000..1bf858c07 --- /dev/null +++ b/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier.xml @@ -0,0 +1,71 @@ + + + org.apache.uima.java + true + de.julielab.jcore.multiplier.pmc.PMCDBMultiplier + + JCoRe Abstract Database Multiplier + A multiplier that receives document IDs to read from a database table from the DBMultiplierReader. The reader also delivers the path to the corpus storage system (CoStoSys) configuration and additional tables for joining with the main data table. 
This multiplier class is abstract and cannot be used directly.Extending classes must implement the next() method to actually read documents from the database and populate CASes with them. This component is a part of the Jena Document Information System, JeDIS. + JULIE Lab Jena, Germany + JULIE Lab Jena, Germany + + + OmitBibliographyReferences + If set to true, references to the bibliography are omitted from the CAS text. + Boolean + false + false + + + AddShaHash + For use with AnnotationDefinedFlowController. Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS skips all component except the DBCheckpointAE to mark the document as processed. + String + false + false + + + DocumentTable + For use with AnnotationDefinedFlowController. String parameter indicating the name of the table where the XMI data and, thus, the hash is stored. The name must be schema qualified. Note that in this component, only the ToVisit annotation is created that determines which components to apply to a CAS with matching (unchanged) hash. The logic to actually control the CAS flow is contained in the AnnotationDefinedFlowController. + String + false + false + + + DocumentTableSchema + For use with AnnotationDefinedFlowController. The name of the schema that the document table - given with the DocumentTable parameter - adheres to. Only the primary key part is required for hash value retrieval. + String + false + false + + + ToVisitKeys + For use with AnnotationDefinedFlowController. The delegate AE keys of the AEs this CAS should still applied on although the hash has not changed. Can be null or empty indicating that no component should be applied to the CAS. 
This is, however, the task of the AnnotationDefinedFlowController. + String + true + false + + + + + OmitBibliographyReferences + + false + + + + + + + + + + + + + + true + true + true + + + \ No newline at end of file diff --git a/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierHashComparisonTest.java b/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierHashComparisonTest.java new file mode 100644 index 000000000..a4f02e11a --- /dev/null +++ b/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierHashComparisonTest.java @@ -0,0 +1,213 @@ +package de.julielab.jcore.multiplier.pmc; + + +import de.julielab.costosys.dbconnection.CoStoSysConnection; +import de.julielab.costosys.dbconnection.DataBaseConnector; +import de.julielab.jcore.db.test.DBTestUtils; +import de.julielab.jcore.types.casflow.ToVisit; +import de.julielab.jcore.types.casmultiplier.RowBatch; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.configuration2.ex.ConfigurationException; +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.JCasIterator; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.factory.TypeSystemDescriptionFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.jcas.cas.StringArray; +import org.apache.uima.resource.metadata.TypeSystemDescription; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.PostgreSQLContainer; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import 
java.sql.PreparedStatement; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * This test is an adaption of the XMLDBMultiplierTest in jcore-xml-db-reader. It tests whether the hash code comparison + * works as intended. + */ +public class PMCDBMultiplierHashComparisonTest { + + private static final String SOURCE_XML_TABLE = "source_xml_table"; + private static final String TARGET_XMI_TABLE = "target_xmi_table"; + private static final String PMCID_FIELD_NAME = "pmcid"; + private static final String DOCID_FIELD_NAME = "docid"; + private static final String XML_FIELD_NAME = "xml"; + private static final String BASE_DOCUMENT_FIELD_NAME = "base_document"; + private static final String HASH_FIELD_NAME = "documentText_sha256"; + private static final String MAX_XMI_ID_FIELD_NAME = "max_xmi_id"; + private static final String SOFA_MAPPING_FIELD_NAME = "sofa_mapping"; + private static final String SUBSET_TABLE = "test_subset"; + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); + private static String costosysConfig; + + @BeforeAll + public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { + postgres.start(); + DBTestUtils.createAndSetHiddenConfig(Path.of("src", "test", "resources", "hiddenConfig").toString(), postgres); + + DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setActiveTableSchema("pmc"); + costosysConfig = DBTestUtils.createTestCostosysConfig("pmc", 2, postgres); + new File(costosysConfig).deleteOnExit(); + try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { + // We create two tables. One is the XML table the multiplier reads from and maps the contents to the JCas. + // The other is a simulation of an XMI table used to serialize CAS instances via the jcore-xmi-db-writer. 
+ // We need that target table to test the hash value comparison mechanism: If a document does not exist + // in the target table or has a non-matching hash on its document text, proceed as normal. + // But if the hash matches, we want to reserve the possibility to skip most of the subsequent pipeline. + // For this, we could use the AnnotationDefinedFlowController from jcore-flow-controllers. This controller + // looks for annotations of the ToVisit type that specify which exact components in an aggregate should + // be applied to the CAS carrying the ToVisit annotation. + prepareSourceXMLTable(dbc, conn); + prepareTargetXMITable(dbc, conn); + } + dbc.defineSubset(SUBSET_TABLE, SOURCE_XML_TABLE, "Test subset"); + assertThat(dbc.getNumRows(SOURCE_XML_TABLE)).isEqualTo(10); + assertThat(dbc.getNumRows(TARGET_XMI_TABLE)).isEqualTo(5); + + dbc.close(); + } + + private static void prepareSourceXMLTable(DataBaseConnector dbc, CoStoSysConnection conn) throws SQLException { + String xmlFmt = "\n" + 
%d42\n" + + "

This is text nr %d.

\n" + + "
"; + dbc.createTable(SOURCE_XML_TABLE, "Test table for hash comparison test."); + String sql = String.format("INSERT INTO %s (%s,%s) VALUES (?,XMLPARSE(CONTENT ?))", SOURCE_XML_TABLE, PMCID_FIELD_NAME, XML_FIELD_NAME); + PreparedStatement ps = conn.prepareStatement(sql); + for (int i = 0; i < 10; i++) { + String xml = String.format(xmlFmt, i, i); + ps.setString(1, String.valueOf(i)); + ps.setString(2, xml); + ps.addBatch(); + } + ps.executeBatch(); + } + + private static void prepareTargetXMITable(DataBaseConnector dbc, CoStoSysConnection conn) throws SQLException { + // The PMC parser tries to format blocks of content using newlines which makes the test a bit awkward. + // The test might break if this formatting is changed. + String documentTextFmt = "\nThis is text nr %d.\n\n"; + dbc.createTable(TARGET_XMI_TABLE, "xmi_text", "Test table for hash comparison test."); + dbc.assureColumnsExist(TARGET_XMI_TABLE, List.of(HASH_FIELD_NAME), "text"); + String sql = String.format("INSERT INTO %s (%s,%s,%s,%s,%s) VALUES (?,XMLPARSE(CONTENT ?),?,?,?)", TARGET_XMI_TABLE, DOCID_FIELD_NAME, BASE_DOCUMENT_FIELD_NAME, HASH_FIELD_NAME, MAX_XMI_ID_FIELD_NAME, SOFA_MAPPING_FIELD_NAME); + PreparedStatement ps = conn.prepareStatement(sql); + // Note that we only add half of the documents compared to the source XML import. This way we test + // if the code behaves right when the target document does not yet exist at all. + for (int i = 0; i < 5; i++) { + String xml = String.format(documentTextFmt, i, i); + ps.setString(1, String.valueOf(i)); + ps.setString(2, xml); + // For one document in the "target XMI" table we put in a wrong hash. Thus, this document should not trigger + // the "toVisit" mechanism. 
+ if (i != 3) + ps.setString(3, getHash(xml)); + else ps.setString(3, "someanotherhash"); + ps.setInt(4, 0); + ps.setString(5, "dummy"); + ps.addBatch(); + } + ps.executeBatch(); + } + + @AfterAll + public static void tearDown() { + postgres.stop(); + } + + private static String getHash(String str) { + final byte[] sha = DigestUtils.sha256(str.getBytes()); + return Base64.encodeBase64String(sha); + } + + /** + * Creates a JCas and adds a RowBatch for all 10 documents in the source XML table as well as the data table and subset table and schema names. + * + * @return A JCas prepared for the tests in this class. + * @throws UIMAException If some UIMA operation fails. + */ + private JCas prepareCas() throws UIMAException { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", "de.julielab.jcore.types.jcore-casflow-types"); + RowBatch rowBatch = new RowBatch(jCas); + StringArray dataTable = new StringArray(jCas, 1); + dataTable.set(0, SOURCE_XML_TABLE); + rowBatch.setTables(dataTable); + StringArray tableSchema = new StringArray(jCas, 1); + tableSchema.set(0, "pmc"); + rowBatch.setTableSchemas(tableSchema); + rowBatch.setTableName(SUBSET_TABLE); + FSArray pks = new FSArray(jCas, 10); + // Read all documents + for (int i = 0; i < 10; i++) { + StringArray pk = new StringArray(jCas, 1); + pk.set(0, String.valueOf(i)); + pks = JCoReTools.addToFSArray(pks, pk); + } + rowBatch.setIdentifiers(pks); + rowBatch.setCostosysConfiguration(costosysConfig); + rowBatch.addToIndexes(); + return jCas; + } + + @Test + public void testHashComparison() throws Exception { + JCas jCas = prepareCas(); + TypeSystemDescription tsDesc = TypeSystemDescriptionFactory.createTypeSystemDescription("de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.jcore-document-structure-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", 
"de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-casflow-types"); + AnalysisEngine engine = AnalysisEngineFactory.createEngine(PMCDBMultiplier.class, tsDesc, + PMCDBMultiplier.PARAM_ADD_SHA_HASH, "documentText", + PMCDBMultiplier.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE, + PMCDBMultiplier.PARAM_TABLE_DOCUMENT_SCHEMA, "xmi_text", + PMCDBMultiplier.PARAM_TO_VISIT_KEYS, "ThisIsTheVisitKey" + ); + JCasIterator jCasIterator = engine.processAndOutputNewCASes(jCas); + List toVisitKeys = new ArrayList<>(); + while (jCasIterator.hasNext()) { + JCas newCas = jCasIterator.next(); + Collection select = JCasUtil.select(newCas, ToVisit.class); + select.forEach(tv -> tv.getDelegateKeys().forEach(k -> toVisitKeys.add(k))); + newCas.release(); + } + // There are 4 documents in the target table with the correct hash so we expect the delegate key 4 times + assertThat(toVisitKeys).containsExactly("ThisIsTheVisitKey", "ThisIsTheVisitKey", "ThisIsTheVisitKey", "ThisIsTheVisitKey"); + } + + @Test + public void testHashComparison2() throws Exception { + JCas jCas = prepareCas(); + TypeSystemDescription tsDesc = TypeSystemDescriptionFactory.createTypeSystemDescription("de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.jcore-document-structure-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-casflow-types"); + // In this test, we do not specify the keys to visit; the whole subsequent pipeline should be skipped. + // To indicate that, there should be ToVisit annotations but they should be null. 
+ AnalysisEngine engine = AnalysisEngineFactory.createEngine(PMCDBMultiplier.class, tsDesc, + PMCDBMultiplier.PARAM_ADD_SHA_HASH, "documentText", + PMCDBMultiplier.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE, + PMCDBMultiplier.PARAM_TABLE_DOCUMENT_SCHEMA, "xmi_text" + ); + JCasIterator jCasIterator = engine.processAndOutputNewCASes(jCas); + List emptyToVisitAnnotation = new ArrayList<>(); + while (jCasIterator.hasNext()) { + JCas newCas = jCasIterator.next(); + Collection select = JCasUtil.select(newCas, ToVisit.class); + select.stream().filter(tv -> tv.getDelegateKeys() == null).forEach(emptyToVisitAnnotation::add); + newCas.release(); + } + // There are 4 documents in the target table with the correct hash so we expect 4 ToVisit annotations without delegate keys + assertThat(emptyToVisitAnnotation).hasSize(4); + } +} diff --git a/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierTest.java b/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierTest.java new file mode 100644 index 000000000..f8d65f822 --- /dev/null +++ b/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierTest.java @@ -0,0 +1,103 @@ +package de.julielab.jcore.multiplier.pmc; + +import de.julielab.costosys.Constants; +import de.julielab.costosys.dbconnection.DBCIterator; +import de.julielab.costosys.dbconnection.DataBaseConnector; +import de.julielab.jcore.db.test.DBTestUtils; +import de.julielab.jcore.types.casmultiplier.RowBatch; +import de.julielab.jcore.types.pubmed.Header; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.commons.configuration2.ex.ConfigurationException; +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.JCasIterator; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import 
org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.jcas.cas.StringArray; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.PostgreSQLContainer; + +import java.io.File; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Spliterators; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +import static org.assertj.core.api.Assertions.assertThat; + +class PMCDBMultiplierTest { + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:" + DataBaseConnector.POSTGRES_VERSION); + private static String costosysConfig; + + @BeforeAll + public static void setup() throws ConfigurationException { + postgres.start(); + DBTestUtils.createAndSetHiddenConfig(Path.of("src", "test", "resources", "hiddenConfig").toString(), postgres); + + DataBaseConnector dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setActiveTableSchema("pmc"); + costosysConfig = DBTestUtils.createTestCostosysConfig("pmc", 2, postgres); + new File(costosysConfig).deleteOnExit(); + dbc.withConnectionExecute(d -> d.createTable(Constants.DEFAULT_DATA_TABLE_NAME, "Test data table.")); + dbc.withConnectionExecute(d -> d.importFromXMLFile(Path.of("src", "test", "resources", "testdocs").toString(), Constants.DEFAULT_DATA_TABLE_NAME)); + dbc.withConnectionExecute(d -> d.createSubsetTable("testsubset", Constants.DEFAULT_DATA_TABLE_NAME, "Test subset.")); + dbc.withConnectionExecute(d -> d.initSubset("testsubset", Constants.DEFAULT_DATA_TABLE_NAME)); + assertThat(dbc.countRowsOfDataTable(Constants.DEFAULT_DATA_TABLE_NAME, null)); + DBCIterator documentIterator = (DBCIterator) dbc.withConnectionQuery(d -> d.queryDataTable(Constants.DEFAULT_DATA_TABLE_NAME, null)); + // check that the documents are actually in the database as expected + List docIds = 
StreamSupport.stream(Spliterators.spliteratorUnknownSize(documentIterator, 0), false).map(b -> new String(b[0], StandardCharsets.UTF_8)).collect(Collectors.toList()); + assertThat(docIds).containsExactlyInAnyOrder("PMC6949206", "PMC7511315"); + } + + @Test + public void next() throws Exception { + AnalysisEngine engine = AnalysisEngineFactory.createEngine(PMCDBMultiplier.class); + JCasIterator jCasIterator = engine.processAndOutputNewCASes(prepareCas()); + List documentTexts = new ArrayList<>(); + List docIds = new ArrayList<>(); + while (jCasIterator.hasNext()) { + JCas newCas = jCasIterator.next(); + documentTexts.add(newCas.getDocumentText()); + docIds.add(JCasUtil.selectSingle(newCas, Header.class).getDocId()); + newCas.release(); + } + assertThat(docIds).containsExactlyInAnyOrder("PMC6949206", "PMC7511315"); + } + + /** + * Creates a JCas and adds a RowBatch for the test documents in the source XML table as well as the data table and subset table and schema names. + * + * @return A JCas prepared for the tests in this class. + * @throws UIMAException If some UIMA operation fails. 
+ */ + private JCas prepareCas() throws UIMAException { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", "de.julielab.jcore.types.jcore-casflow-types"); + RowBatch rowBatch = new RowBatch(jCas); + StringArray dataTable = new StringArray(jCas, 1); + dataTable.set(0, Constants.DEFAULT_DATA_TABLE_NAME); + rowBatch.setTables(dataTable); + StringArray tableSchema = new StringArray(jCas, 1); + tableSchema.set(0, "pmc"); + rowBatch.setTableSchemas(tableSchema); + rowBatch.setTableName("testsubset"); + FSArray pks = new FSArray(jCas, 2); + // Read all documents + List pkStrings = List.of("PMC6949206", "PMC7511315"); + for (String pkString : pkStrings) { + StringArray pk = new StringArray(jCas, 1); + pk.set(0, pkString); + pks = JCoReTools.addToFSArray(pks, pk); + } + rowBatch.setIdentifiers(pks); + rowBatch.setCostosysConfiguration(costosysConfig); + rowBatch.addToIndexes(); + return jCas; + } +} \ No newline at end of file diff --git a/jcore-pmc-db-reader/src/test/resources/testdocs/PMC6949206.xml b/jcore-pmc-db-reader/src/test/resources/testdocs/PMC6949206.xml new file mode 100644 index 000000000..d7bbf8d2e --- /dev/null +++ b/jcore-pmc-db-reader/src/test/resources/testdocs/PMC6949206.xml @@ -0,0 +1,6 @@ + +
pmcProtein CellProtein CellProtein & Cell1674-800X1674-8018Higher Education PressBeijing31037510PMC694920662310.1007/s13238-019-0623-2Research ArticleRescue of premature aging defects in Cockayne syndrome stem cells by CRISPR/Cas9-mediated gene correctionWangSi125MinZheying113JiQianzhao24GengLingling5SuYao5LiuZunpeng34HuHuifang34WangLixia24ZhangWeiqi24567SuzuikiKeiichiro910HuangYu11ZhangPuyao1TangTie-Shan4612QuJing
qujing@ioz.ac.cn
346
YuYang
yuyang5012@hotmail.com
1
LiuGuang-Hui
ghliu@ibp.ac.cn
24568
QiaoJie
jie.qiao@263.net
113
grid.411642.40000 0004 0605 3760Department of Obstetrics and Gynecology, Center for Reproductive Medicine, Peking University Third Hospital, Beijing, 100191 China grid.9227.e0000000119573309National Laboratory of Biomacromolecules, CAS Center for Excellence in Biomacromolecules, Institute of Biophysics, Chinese Academy of Sciences, Beijing, 100101 China grid.9227.e0000000119573309State Key Laboratory of Stem Cell and Reproductive Biology, Institute of Zoology, Chinese Academy of Sciences, Beijing, 100101 China grid.410726.60000 0004 1797 8419University of Chinese Academy of Sciences, Beijing, 100049 China grid.413259.80000 0004 0632 3337Advanced Innovation Center for Human Brain Protection, National Clinical Research Center for Geriatric Disorders, Xuanwu Hospital Capital Medical University, Beijing, 100053 China grid.9227.e0000000119573309Institute for Stem cell and Regeneration, Chinese Academy of Sciences, Beijing, 100101 China grid.9227.e0000000119573309Key Laboratory of Genomic and Precision Medicine, Beijing Institute of Genomics, Chinese Academy of Sciences, Beijing, 100101 China grid.24696.3f0000 0004 0369 153XBeijing Institute for Brain Disorders, Beijing, 100069 China grid.136593.b0000 0004 0373 3971Institute for Advanced Co-Creation Studies, Osaka University, Osaka, 560-8531 Japan grid.136593.b0000 0004 0373 3971Graduate School of Engineering Science, Osaka University, Osaka, 560-8531 Japan grid.11135.370000 0001 2256 9319Department of Medical Genetics, School of Basic Medical Sciences, Peking University Health Science Center, Beijing, 100191 China grid.458458.00000 0004 1792 6416State Key Laboratory of Membrane Biology, Institute of Zoology, Chinese Academy of Sciences, Beijing, 100101 China grid.11135.370000 0001 2256 9319Peking-Tsinghua Center for Life Sciences, Academy for Advanced Interdisciplinary Studies, Peking University, Beijing, 100871 China
304201930420191202011112219220191232019 © The Author(s) 2019. https://creativecommons.org/licenses/by/4.0/ Open Access: This article is distributed under the terms of the Creative Commons Attribution 4.0 International License (http://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided you give appropriate credit to the original author(s) and the source, provide a link to the Creative Commons license, and indicate if changes were made.

Cockayne syndrome (CS) is a rare autosomal recessive inherited disorder characterized by a variety of clinical features, including increased sensitivity to sunlight, progressive neurological abnormalities, and the appearance of premature aging. However, the pathogenesis of CS remains unclear due to the limitations of current disease models. Here, we generate integration-free induced pluripotent stem cells (iPSCs) from fibroblasts from a CS patient bearing mutations in CSB/ERCC6 gene and further derive isogenic gene-corrected CS-iPSCs (GC-iPSCs) using the CRISPR/Cas9 system. CS-associated phenotypic defects are recapitulated in CS-iPSC-derived mesenchymal stem cells (MSCs) and neural stem cells (NSCs), both of which display increased susceptibility to DNA damage stress. Premature aging defects in CS-MSCs are rescued by the targeted correction of mutant ERCC6. We next map the transcriptomic landscapes in CS-iPSCs and GC-iPSCs and their somatic stem cell derivatives (MSCs and NSCs) in the absence or presence of ultraviolet (UV) and replicative stresses, revealing that defects in DNA repair account for CS pathologies. Moreover, we generate autologous GC-MSCs free of pathogenic mutation under a cGMP (Current Good Manufacturing Practice)-compliant condition, which hold potential for use as improved biomaterials for future stem cell replacement therapy for CS. Collectively, our models demonstrate novel disease features and molecular mechanisms and lay a foundation for the development of novel therapeutic strategies to treat CS.

Electronic supplementary material

The online version of this article (10.1007/s13238-019-0623-2) contains supplementary material, which is available to authorized users.

Keywords: Cockayne syndrome; CRISPR/Cas9; gene correction; disease modelling; mesenchymal stem cell; neural stem cell. issue-copyright-statement: © The Author(s) 2020
INTRODUCTION

Cockayne syndrome (CS) is an autosomal recessive disorder characterized by progressive multisystem clinical features, including cachectic dwarfism, clinical photosensitivity, progressive neurological degeneration, and premature aging (Karikkineth et al., 2017). Two genes that are defective in Cockayne syndrome, CSA/ERCC8 (ERCC excision repair 8, CSA ubiquitin ligase complex subunit) and CSB/ERCC6 (ERCC excision repair 6, chromatin remodeling factor), have been identified. To date, two-thirds of CS patients have been linked to mutations in the CSB/ERCC6 gene, and one-third of CS patients have been linked to mutations in the CSA/ERCC8 gene. At least 78 different mutations in ERCC6, including typical missense mutations, frameshifts, and deletions, have been identified (Cleaver et al., 2009; Laugel, 2013). However, the underlying molecular mechanisms linking genotype to phenotype need to be clarified.

DNA damage caused by exogenous ultraviolet (UV) radiation-induced photoproducts or similar chemically induced products is sensed by the cellular nucleotide excision repair (NER) system (Friedberg, 2001, 2003; Cleaver et al., 2009; McKay and Cabrita, 2013). The NER system consists of two pathways: global genomic repair (GGR), in which damage to DNA regions not undergoing transcription is repaired, and transcription-coupled repair (TCR), in which damage to transcribed DNA regions is repaired. Bulky DNA adducts usually block transcription elongation by RNA polymerase II (RNAPII); then, the arrested RNAPII initiates the repair of transcription-blocking DNA lesions by TCR to permit the efficient recovery of mRNA synthesis. If TCR cannot be executed, widespread sustained transcription blockage eventually leads to apoptosis (McKay and Cabrita, 2013). ERCC6 is an ATP-stimulated ATPase that is required for the ubiquitylation of the carboxyterminal domain of RNAPII in TCR and the recovery of mRNA synthesis. In addition, ERCC6 has been reported as a member of the SWI/SNF family of proteins that contain a nucleotide-binding site and play a role in chromatin maintenance and remodelling by modulating the negative supercoiling of DNA and facilitating DNA strand exchange, possibly through the recruitment of the histone acetyltransferase p300 (Newman et al., 2006; Cleaver et al., 2009; Velez-Cruz and Egly, 2013).

Mice deficient for Ercc6 or Ercc8 have been generated and used to mimic mild CS symptoms, including fat tissue reduction, photoreceptor cell loss, and mild but characteristic nervous system pathology (van der Horst et al., 1997, 2002; Gorgels et al., 2007; Jaarsma et al., 2011). These mild CS mouse models are converted to severe CS models with short life spans, progressive nervous system degeneration and cachectic dwarfism after synergistic complete inactivation of global genome NER. For example, previous studies have demonstrated the simultaneous deleterious effects of intercrossing xeroderma pigmentosum (XP) (Xpa−/− or Xpc−/−) mice with CS (Csa−/−, Csb−/−, XpdXPCS) mice, which results in double mutants with very short life spans and dramatic progeroid features (Murai et al., 2001; Andressoo et al., 2006; van der Pluijm et al., 2007). Due to the differences in genetic and anatomic features between humans and mice, a human CS model needs to be established to reveal the cellular defects and molecular mechanisms for translation into a CS treatment.

In this study, we report the generation of induced pluripotent stem cells (iPSCs) from the fibroblasts of a CS patient bearing two novel heterogeneous mutations in the ERCC6 gene: c.643G>T in exon 4 and c.3776C>A in exon 18. We further derived gene-corrected CS-iPSCs (GC-iPSCs) using the CRISPR/Cas9-mediated gene editing technique. CS-iPSCs and GC-iPSCs were further differentiated into mesenchymal stem cells (MSCs) and neural stem cells (NSCs). Gene correction resulted in the effective restoration of DNA repair abilities and the alleviation of apoptosis and premature senescence, especially after exposure to UV irradiation or replicative stress (Fig. 1A). RNA sequencing analysis indicated that the compromised DNA repair and cell cycle deregulation observed in CS cells account for various CS cellular pathologies. Finally, we obtained gene-corrected CS-iPSC-derived MSCs under a cGMP (Current Good Manufacturing Practice)-compliant condition, which display promising potential in autologous stem cell therapy.

Generation of CS-iPSCs and gene-corrected CS-iPSCs. (A) Schematic diagram of the generation of CS-iPSCs and GC-iPSCs, as well as their adult stem cell derivatives, for modelling Cockayne syndrome. “Mut” represents mutant, “GC” represents gene corrected. (B) Genotype validation of two heterozygous mutations in the ERCC6 gene by genomic DNA sequencing. Fibroblasts isolated from a healthy individual were used as a control. (C) Strategy for correcting the ERCC6+/G643T mutation by the CRISPR/Cas9 system. The sequence of the gRNA is shown with the PAM sequence. Red crosses represent mutations in exon 4 and exon 18. The single-stranded oligodeoxynucleotide (ssODN) carrying a silent mutation (marked in green) was used as a repair template. (D) The correction of the ERCC6+/G643T mutation was verified by genomic DNA sequencing. The red arrow highlights the corrected base pair. The green arrow indicates the inclusion of silent mutation introduced by the exogenous ssODN template. ERCC6mut represents CS-iPSCs, ERCC6GC represents GC-iPSCs. (E) Karyotyping analysis of CS-iPSCs and GC-iPSCs indicating their normal karyotypes. (F) No residual episomal vector element EBNA-1 was observed in CS-iPSCs or GC-iPSCs by qPCR analysis. CS-fibroblasts were electroporated with pCXLE-hOCT3/4-shp53-F, pCXLE-hSK and pCXLE-hUL. The fibroblasts were cultured for 4 more days after electroporation and then collected as the positive control, and human ESCs (line H9), GM00038-iPSCs and HFF-iPSCs were used as negative controls. Data are shown as the mean ± SEM, n = 3. (G) No off-target mutations were observed in GC-iPSCs. Whole-genome sequencing was applied to detect potential off-target mutations in the GC-iPSC sample. NA, not applicable

RESULTS

Generation of non-integrative iPSCs from a CS patient

We first isolated human primary fibroblasts from a Chinese CS patient and verified the presence of two nonsense mutations, c.643G>T (p.E215X) in exon 4 and c.3776C>A (p.S1259X) in exon 18, located at different alleles of the ERCC6 gene by genomic DNA sequencing analysis (Fig. 1B). To generate patient-specific iPSCs (CS-iPSCs), a cocktail of integration-free episomal vectors expressing the reprogramming factors OCT4, SOX2, KLF4, L-MYC, LIN28, and sh-p53 was electroporated into fibroblasts according to a modified reprogramming protocol, as previously described (Hishiya and Watanabe, 2004; Okita et al., 2011; Liu et al., 2014; Ding et al., 2015; Fu et al., 2016; Wang et al., 2017; Ling et al., 2019). The derived iPSCs displayed normal karyotypes, and no residual episomal reprogramming vector element was detected in established CS-iPSCs (Fig. 1E and 1F). In addition, CS-iPSCs expressed comparable levels of pluripotency markers, including NANOG, OCT4, and SOX2 (Fig. 2B and 2C). After being implanted subcutaneously into immunocompromised mice, CS-iPSCs were able to form teratomas comprising cells from three germ lineages, as indicated by TUJ1, SMA and FOXA2 expression (Fig. 2D). These observations indicated that iPSCs bearing the CS-specific ERCC6 mutation display normal pluripotency.

Characterization of CS-iPSCs and gene-corrected CS-iPSCs. (A) Western blot analysis showing increased protein levels of ERCC6 in GC-iPSCs. β-Actin was used as the loading control. (B) RT-PCR analysis of the pluripotency markers SOX2, OCT4, and NANOG in the CS-iPSCs and GC-iPSCs. 18S rRNA was used as the loading control. (C) Immunostaining of CS-iPSCs and GC-iPSCs for the pluripotency markers OCT4, NANOG, and SOX2. Nuclei were stained with Hoechst 33342. Scale bar, 50 μm. (D) Immunostaining of TUJ1 (ectoderm), SMA (mesoderm), and FOXA2 (endoderm) in teratomas derived from CS-iPSCs and GC-iPSCs. Nuclei were stained with Hoechst 33342. Scale bar, 50 μm. (E) The percentages of Ki67-positive cells in CS-iPSCs and GC-iPSCs were determined and compared. Nuclei were stained with Hoechst 33342. Scale bar, 50 μm. Data are presented as the mean ± SEM, n = 3, ns, not significant. (F) Cell cycle profiles showing comparable percentages of different cell cycle phases in CS-iPSCs and GC-iPSCs by PI staining. Data are presented as the mean ± SEM, n = 3

Targeted gene correction of the ERCC6 mutation by CRISPR/Cas9 system

To better elucidate the pathogenic mechanism underlying CS, we generated isogenic gene-corrected iPSC lines by targeted gene editing of one of the two compound heterozygous ERCC6 mutations. Using the CRISPR/Cas9 system, we electroporated an expression vector encoding mCherry and a guide RNA targeting the mutation in exon 4, a plasmid for Cas9-2A-GFP, and the single-stranded oligodeoxynucleotide (ssODN) template into CS-iPSCs (Wang et al., 2017). After fluorescence-activated cell sorting (FACS) for mCherry (guide RNA) and GFP (Cas9) double-positive cells, gene-corrected CS-iPSC clones were successfully obtained (Fig. 1C). Site-specific gene correction of the c.643G>T mutation was confirmed by genomic DNA sequencing (Fig. 1D). As the exogenous repair template ssODN was designed to contain a silent mutation, the introduced silent mutation was also found in the GC-iPSC clones, further confirming successful gene editing at the corresponding genomic target sites (Fig. 1D). Similar to CS-iPSCs, we did not detect any residual episomal reprogramming vectors in GC-iPSCs (Fig. 1F). Whole-genome DNA sequencing indicated no mutations in potential off-target sites after gene editing (Fig. 1G). GC-iPSCs also showed a normal karyotype (Fig. 1E). Western blots demonstrated elevated levels of the ERCC6 protein in GC-iPSCs (Fig. 2A), implying that the correction of the pathogenic mutation recovered the protein expression of ERCC6. Additionally, GC-iPSCs normally expressed pluripotency markers, including OCT4, NANOG, and SOX2 (Fig. 2B and 2C), and formed teratomas in vivo (Fig. 2D). CS-iPSCs and GC-iPSCs were cultured for more than 50 passages without showing abnormal growth kinetics (Fig. 2E and 2F). Unlike the previous study (Andrade et al., 2012), we did not observe elevated cellular reactive oxygen species (ROS) in CS-iPSCs compared to GC-iPSCs (Fig. S3A). 
In addition, RT-qPCR demonstrated that the expression levels of genes involved in the oxidative stress response were comparable between GC-iPSCs and CS-iPSCs (Fig. S3B). Taken together, these results indicated that we successfully generated GC-iPSCs exhibiting normal pluripotency.

Alleviation of aging defects in gene-corrected CS-MSCs

CS patients frequently exhibit musculoskeletal abnormalities, such as kyphosis, contracture and osteoporosis (Hishiya and Watanabe, 2004; Karikkineth et al., 2017). MSCs are multipotent mesodermal cells that can differentiate into a variety of mesodermal cell types, including osteoblasts, chondrocytes, and adipocytes, which serve as a good cell model for investigating the accelerated degeneration of mesodermal tissues caused by genetic mutations (Liu et al., 2014; Zhang et al., 2015, 2019; Kubben et al., 2016; Li et al., 2016; Pan et al., 2016; Geng et al., 2018; Wang et al., 2018b; Wu et al., 2018; Yan et al., 2019). Therefore, we first differentiated CS-iPSCs and GC-iPSCs into MSCs to investigate whether ERCC6 mutations could result in accelerated attrition of the MSC pool. Both CS-MSCs and GC-MSCs were positive for mesenchymal progenitor markers, including CD73, CD90 and CD105 (Fig. 3A). Consistent with the successful correction of ERCC6 gene mutation, increased ERCC6 protein content was observed in GC-MSCs (Fig. 3B). Next, we investigated whether normal ERCC6 activity is required for maintaining the cellular homeostasis of MSCs. Compared to isogenic gene-corrected control cells, CS-MSCs displayed features characteristic of premature senescence under replicative stress, including the early onset of cell growth arrest, reduced Ki67-positive cells, and increased senescence-associated (SA)-β-Gal activity (Fig. 3C–E). In addition, the expression levels of senescence markers, including P16, P21 and IL-8, were upregulated, while the geroprotective proteins Lamin B1 and LAP2 were downregulated in CS-MSCs relative to GC-MSCs at late passages (Fig. 3F–H). In line with the essential role of ERCC6 in NER, CS-MSCs exhibited increased expression of the DNA damage marker γH2AX (Fig. 3I), indicating compromised DNA repair in ERCC6-deficient MSCs. Next, we investigated whether CS-MSCs underwent accelerated attrition in vivo. 
Implanting CS-MSCs and GC-MSCs expressing luciferase into the tibialis anterior (TA) muscle of immunodeficient mice resulted in accelerated in vivo decay in CS-MSCs compared to GC-MSCs (Fig. 3J). Furthermore, we compared the multipotent differentiation potential of CS-MSCs and GC-MSCs. Relative to GC-MSCs, CS-MSCs exhibited impaired differentiation abilities towards osteoblasts, chondrocytes and white adipocytes (Fig. 3K and 3L). Altogether, these results showed that CS-MSCs displayed typical premature cellular senescence, which was rescued by the targeted correction of mutant ERCC6.

Alleviated cellular senescence in gene-corrected CS-MSCs. (A) FACS analysis indicating the expression of the cell surface markers CD73, CD90 and CD105 in CS-MSCs and GC-MSCs. ERCC6mut represents CS-MSCs, ERCC6GC represents GC-MSCs. (B) Western blot analysis showing increased protein levels of ERCC6 in GC-MSCs. β-Actin was used as the loading control. (C) Growth curves showing the cumulative population doublings of CS-MSCs and GC-MSCs. (D) Immunostaining of Ki67 showing the decreased cell proliferation of CS-MSCs compared to GC-MSCs. The percentages of Ki67-positive cells are shown in the right panel. Scale bar, 20 μm. Data are presented as the mean ± SEM, n = 3, **P < 0.01, ***P < 0.001. EP, early passage (P6); LP, late passage (P28). (E) SA-β-Gal staining of CS-MSCs and GC-MSCs at EP (P6) and LP (P28), respectively. The percentages of SA-β-Gal-positive cells are shown in the right panel. Scale bar, 50 μm. Data are presented as the mean ± SEM, n = 3, **P < 0.01, ns, not significant. (F) RT-qPCR analysis of the expression of senescence markers in CS-MSCs and GC-MSCs at passage 28. The mRNA levels were normalized to CS-MSCs. (G) Western blot analysis of P16, LAP2 and Lamin B1 in CS-MSCs and GC-MSCs. GAPDH was used as the loading control. (H) Immunostaining of LAP2 and Lamin B in CS-MSCs and GC-MSCs. The relative intensity of LAP2 was measured with ImageJ software, and the data are shown as the mean ± SEM, ***P < 0.001. More than 300 nuclei for each group were used for calculations. Scale bar, 20 μm. a.u., arbitrary units. (I) Immunostaining of γH2AX in CS-MSCs and GC-MSCs. The relative intensity of γH2AX was measured with ImageJ software, and the data are shown as the mean ± SEM, ***P < 0.001. More than 300 nuclei for each group were used for calculations. Scale bar, 20 μm. a.u., arbitrary units. (J) Accelerated attrition of CS-MSCs in vivo was detected by an in vivo imaging system (IVIS). 
CS-MSCs (1 × 10⁶, left) and GC-MSCs (1 × 10⁶, right) (passage 25) infected with luciferase lentivirus were injected into the tibialis anterior (TA) muscles of immunodeficient mice. Luciferase activities were imaged and quantified at days 0, 2, 4, and 6 after transplantation. Data are presented as the ratios of the luciferase intensity of CS-MSCs to that of GC-MSCs (fold), mean ± SD, n = 3, **P < 0.01, ***P < 0.001. (K) Comparative analysis of the osteogenic, chondrogenic and adipogenic differentiation potential of CS-MSCs and GC-MSCs. Von Kossa, Alcian blue, and oil red O staining were used to characterize osteoblasts, chondrocytes, and adipocytes, respectively. Scale bar, 50 μm. (L) The intensity of von Kossa staining was calculated by ImageJ and compared in the left panel. Data are presented as the mean ± SEM, n = 3, **P < 0.01. The cross-sectional area of chondrocyte spheres was measured and is shown in the middle panel. Data are presented as the mean ± SD, n = 14, ***P < 0.001. The relative intensity of oil red O was measured and is shown in the right panel. Data are presented as the mean ± SEM, n = 3, ***P < 0.001

Gene-corrected CS-MSCs display recovered DNA repair ability and resistance to UV-induced apoptosis and cell cycle arrest

Next, we investigated whether mutations in ERCC6 genes lead to impaired DNA damage repair ability after UV irradiation in MSCs. UV radiation usually results in the covalent dimerization of adjacent pyrimidines, typically thymine residues (thymine dimers), including cyclobutane pyrimidine dimers (CPDs) and (6-4) photoproducts (6-4PPs), in DNA (Setlow and Setlow, 1962; Friedberg, 2003; Cadet et al., 2005). Accordingly, we treated CS-MSCs and GC-MSCs with 10 J/m² UV irradiation and examined the levels of intranuclear CPDs by immunostaining. Both CS-MSCs and GC-MSCs showed low levels of CPDs in the absence of UV irradiation; however, CS-MSCs exhibited more CPD-positive cells than GC-MSCs did at 48 h after UV irradiation (Fig. 4A). These results demonstrated that CS-MSCs were deficient in eliminating CPD photolesions after UV-induced DNA damage, and this ability was restored by ERCC6 correction. We then explored whether CS-MSCs are hypersensitive to UV-induced cellular apoptosis. CS-MSCs and GC-MSCs were cultured in the presence or absence of 10 J/m² UV irradiation. UV irradiation induced marked cellular apoptosis in CS-MSCs relative to GC-MSCs at 48 h after UV irradiation (Fig. 4B). Western blot analysis showed increased levels of cleaved PARP (c-PARP) in CS-MSCs following UV treatment (Fig. 4C). In addition, we treated MSCs with a lower dose (1 J/m²) of UV light at each passage starting from passage 4. In this context, relative to GC-MSCs, CS-MSCs displayed compromised self-renewal ability and increased SA-β-Gal-positive cells (Fig. 4D–F), indicating that the ERCC6 deficiency rendered MSCs sensitive to replicative stress under low-dose chronic UV irradiation. Thus, CS-specific MSCs exhibited impaired DNA repair ability and increased susceptibility to UV-induced injury, and these phenotypes were rescued by the genetic correction of the pathogenic mutation.

Gene-corrected CS-MSCs display recovered DNA repair ability and counteract UV-induced apoptosis and senescence. (A) CPD immunostaining in CS-MSCs and GC-MSCs in the absence or presence of 10 J/m² UV exposure. Nuclei were stained with Hoechst 33342. Scale bar, 50 μm. More than 300 nuclei for each group were used for calculation. The data are shown as the mean ± SEM, ns, not significant, ***P < 0.001. a.u., arbitrary units. (B) Apoptosis analysis of CS-MSCs and GC-MSCs at 48 h after 10 J/m² UV irradiation. Quantitative data are presented as the mean ± SEM, n = 3, **P < 0.01, ***P < 0.001. (C) Western blots showing PARP cleavage in CS-MSCs and GC-MSCs in the absence or presence of 10 J/m² UV exposure. GAPDH was used as a loading control. Quantitative data are presented as the mean ± SD, n = 3, ns, not significant, *P < 0.05. (D) Growth curves showing the cumulative population doublings of CS-MSCs and GC-MSCs in the absence (control) or presence (UV) of 1 J/m² UV exposure at each passage starting from passage 4. (E) Clonal expansion assay showing the cell proliferation ability of CS-MSCs and GC-MSCs in the absence (control) or presence (UV) of 1 J/m² UV exposure at passage 10. The cells were stained with crystal violet after two weeks of culture, and the relative intensity of the crystal violet staining was quantified. Data are presented as the mean ± SEM, n = 3, *P < 0.05, ***P < 0.001. (F) SA-β-Gal staining of CS-MSCs and GC-MSCs in the absence (control) or presence (UV) of 1 J/m² UV exposure at passage 10. The percentages of SA-β-Gal-positive cells are shown in the right panel. Data are presented as the mean ± SEM, n = 3, **P < 0.01, ns, not significant

Gene-corrected CS-NSCs display improved NER ability and reduced susceptibility to UV-induced apoptosis

Due to the presence of obvious symptoms of neurodegeneration in CS patients (Cleaver et al., 2009; Natale, 2011; Laugel, 2013; Shehata et al., 2014), we next differentiated CS-iPSCs and GC-iPSCs into NSCs (referred to as CS-NSCs and GC-NSCs, respectively). Both CS-NSCs and GC-NSCs showed typical neural progenitor morphology and expressed the NSC markers Nestin, PAX6 and SOX2 (Fig. 5A). Western blots confirmed the increased protein expression of ERCC6 in GC-NSCs compared to that in uncorrected CS-NSCs (Fig. 5B). To investigate whether mutations in the ERCC6 gene impair the DNA repair ability of NSCs, we treated CS-NSCs and GC-NSCs with 5 J/m² UV irradiation and then examined the levels of intranuclear CPDs. Similar to the results obtained with MSCs, higher levels of CPDs were observed in CS-NSCs than in GC-NSCs at 48 h after UV irradiation, indicating that targeted gene correction effectively rescued the hypersensitivity of CS-NSCs to UV irradiation (Fig. 5C). Consistent with this finding, gene correction resulted in decreased cellular apoptosis in CS-NSCs in the presence of UV treatment (Fig. 5D and 5E). Altogether, these results indicated that CS-NSCs, which are characterized by a DNA repair deficit, were prone to UV-induced apoptosis, while genetic correction resulted in the restoration of these phenotypic defects.

Gene-corrected CS-NSCs show increased NER ability and decreased susceptibility to UV-induced apoptosis. (A) Immunostaining of the NSC markers Nestin, PAX6, and SOX2 in the CS-NSCs and GC-NSCs. The nuclei were stained with Hoechst 33342. Scale bar, 50 μm. ERCC6mut represents CS-NSCs, ERCC6GC represents GC-NSCs. (B) Western blot analysis showing increased protein levels of ERCC6 in GC-NSCs. β-Actin was used as the loading control. (C) CPD immunostaining in CS-NSCs and GC-NSCs in the absence or presence of 5 J/m² UV exposure. Nuclei were stained with Hoechst 33342. Scale bar, 50 μm. Over 300 nuclei were used for calculations. The data are shown as the mean ± SEM, ***P < 0.001. a.u., arbitrary units. (D) Apoptosis analysis of CS-NSCs and GC-NSCs at 48 h after 5 J/m² UV irradiation. Quantitative data are presented as the mean ± SEM, n = 3, *P < 0.05, ***P < 0.001. (E) Western blots showing PARP cleavage in CS-NSCs and GC-NSCs in the absence or presence of 5 J/m² UV exposure. GAPDH was used as a loading control. Quantitative data are presented as the mean ± SD, n = 3, *P < 0.05, ns, not significant

The ERCC6 mutation results in gene expression changes associated with impaired DNA damage repair, chromatin disorganization, and compromised cell proliferation

To investigate whether gene expression profiles were disrupted in CS-specific iPSCs, MSCs and NSCs, we performed genome-wide RNA sequencing (RNA-seq) analysis (Figs. 6, S1 and S2). Principal component analysis (PCA) showed that the RNA profiles of MSCs, iPSCs and NSCs were separated as three independent subgroups (Fig. 6A), implying the existence of unique RNA expression patterns in each cell type. While there were minimal gene expression changes between CS-iPSCs and GC-iPSCs and between CS-NSCs and GC-NSCs, the mutation of ERCC6 resulted in marked changes in the transcriptome of MSCs (Figs. 6B and S1C). These observations were in line with the most striking phenotypes in CS-MSCs relative to their gene-corrected counterparts under basal culture conditions (Fig. 3C–E). UV treatment results in an increased difference in transcriptional profiles between GC-MSCs and CS-MSCs and between GC-NSCs and CS-NSCs (Figs. 6B and S1C). Notably, UV treatment induced dramatic gene expression changes in CS-specific MSCs and CS-specific NSCs (Fig. S1E), which were associated with increased DNA damage, impaired transcription, and compromised cell growth; these changes, however, became insensitive in ERCC6-corrected MSCs and NSCs, indicating that gene correction resulted in the restoration of normal transcriptional and DNA repair activity under DNA damage stress (Fig. 6C). After extensive passaging, we also observed a panel of upregulated genes related to cell division and DNA damage repair in ERCC6-corrected MSCs compared to diseased MSCs (Fig. 6D), which is in line with the rescue of premature cellular senescence in gene-corrected MSCs (Fig. 3C–J). Collectively, these transcriptomic changes support the improved cell proliferation and increased DNA damage repair ability in ERCC6-corrected adult stem cells.

The global gene expression profiles of CS-iPSCs and gene-corrected CS-iPSCs and their adult stem cell derivatives. (A) PCA of CS cells and GC cells in the absence or presence of UV (Ctrl or UV), as well as under replicative senescence (RS) stress. Each point represents a sample. Data points were computed based on log₂(FPKM + 1). (B) Volcano plots showing the differentially expressed genes between CS-iPSCs and GC-iPSCs, between CS-MSCs and GC-MSCs, and between CS-NSCs and GC-NSCs in the absence of UV (the upper panel) or in the presence of UV (the lower panel, UV), or under RS stress (the lower panel, RS). Red represents upregulated genes, and blue represents downregulated genes. (C) Gene Ontology Biological Process (GO-BP) enrichment analysis of significantly upregulated/downregulated genes in GC-MSCs compared to CS-MSCs upon UV treatment. Red represents upregulated genes, and blue represents downregulated genes. (D) Gene Ontology Biological Process (GO-BP) enrichment analysis of significantly upregulated/downregulated genes in GC-MSCs compared to CS-MSCs under RS stress. Red represents upregulated genes, and blue represents downregulated genes

Gene-corrected CS-MSCs produced in accordance with cGMP compliance guidelines show alleviated senescence and increased resistance to UV-induced apoptosis

Human mesenchymal stem cells hold the potential to be used for the treatment of aging-related disorders (Orozco et al., 2011, 2013, 2014; Golpanian et al., 2016, 2017; Tompkins et al., 2017; Yang et al., 2017; Yan et al., 2019). We next tested whether ERCC6-corrected CS-MSCs can be produced under a cGMP-compliant condition. Accordingly, we derived MSCs from iPSCs using a serum-free, animal component-free differentiation medium. The differentiation protocol was slightly modified from the serum-containing procedure (see experimental method). FACS analysis demonstrated that the derived MSCs expressed the mesenchymal progenitor cell-specific markers CD73, CD90 and CD105 (Fig. 7A). The absence of pluripotent stem cell contamination in the derived MSCs was verified by RT-qPCR and immunostaining assays (Fig. 7B and 7C). Whole-genome DNA sequencing further validated the genomic integrity during somatic cell reprogramming, gene correction, and directed differentiation to MSCs (Fig. 7D and 7E). Sterility and pathogen testing demonstrated that there was no endotoxin, mycoplasma, bacteria, or virus contamination in the culture medium of the GC-MSCs (Fig. 7F). To evaluate any potential risk of tumorigenesis in vivo, immunodeficient mice were subcutaneously injected with the ERCC6-corrected MSCs. Human ESC (line H9) and U2-OS osteosarcoma cell lines were implanted independently as positive controls. We observed that the GC-MSCs failed to form tumors, even at 8 months after implantation, in contrast with the teratomas formed from hESCs and tumors formed from U2-OS cells at 2 months post-injection (Fig. 7G).

Safety analysis of gene-corrected CS-MSCs obtained under a cGMP-compliant condition. (A) FACS analysis indicated the expression of the cell surface markers CD73, CD90 and CD105 in CS-MSCs and GC-MSCs. (B) RT-qPCR analysis of the expression of pluripotency markers OCT4, NANOG, and SOX2 in CS-MSCs and GC-MSCs. GC-iPSCs and CS-fibroblasts were used as positive and negative controls, respectively. Data are presented as the mean ± SEM, n = 3. (C) Immunostaining of the pluripotency marker NANOG in CS-MSCs and GC-MSCs. GC-iPSCs were used as a positive control, Scale bar, 50 μm. (D) Whole-genome sequencing of single-nucleotide variants (SNVs) in CS-fibroblasts, CS-iPSCs, GC-iPSCs, CS-MSCs and GC-MSCs. Sites with a heterozygosity percentage ranging between 0% and 30% were considered as SNV sites, and sites with a heterozygosity of >30% were considered as single-nucleotide polymorphisms (SNPs). (E) Whole-genome sequencing of copy number variations (CNVs) in CS-fibroblasts, CS-iPSCs, GC-iPSCs, CS-MSCs and GC-MSCs. Each point represents normalized coverage depth of each 500-kb genomic region of each chromosome. (F) Sterility and pathogen testing of the conditioned medium of GC-MSCs. a Endotoxin was identified as negative when the concentration was < 0.25 EU/mL. b CMV was identified as negative when the ratio of the OD450 value of sample to the cut-off value (S/Co) was < 1.0. c HAV was identified as negative when the ratio of the cut-off value to the OD450 nm value of the sample (Co/S) was < 0.9. d HCV was identified as negative when the ratio of the OD450 value of the sample to the cut-off value (S/Co) was < 0.9. e HIV-1 was identified as negative when the concentration = 0 pg/mL. (G) Evaluation of the potential tumorigenesis risk of GC-MSCs in vivo. A subcutaneous injection of GC-MSCs was performed in immune-deficient mice. Human ESC (line H9) and U2-OS osteosarcoma cell lines were also implanted independently as positive controls. 
Representative images in the lower panel showing the teratoma and tumor formed from positive cells two months after transplantation, Scale bar, 0.5 cm. HE staining of a teratoma and tumor were shown in the upper panel. Scale bar, 100 μm. The in vivo tumor-formation incidence of each cell type was calculated. n = 4 for each positive cell group, n = 5 for the GC-MSC group

Phenotypically, compared to diseased MSCs, gene-corrected MSCs generated following the cGMP compliance standard displayed increased cell proliferation and attenuated cellular senescence (Fig. 8A and 8B). In addition, the GC-MSCs were insensitive to UV-induced apoptosis (Fig. 8C and 8D). Consistent with an improved activity, these GC-MSCs exhibited better tri-lineage differentiation potential towards osteoblasts, chondrocytes and adipocytes (Fig. S3C–D). A fat pad implantation assay further demonstrated the superior in vivo neovascularization ability of GC-MSCs (Fig. 8E). Altogether, we successfully generated ERCC6-corrected MSCs with normal functional activity under a cGMP-compliant condition.

Gene-corrected CS-MSCs generated under a cGMP-compliant condition displayed alleviated aging defects and decreased susceptibility to UV-induced apoptosis. (A) Clonal expansion assay showing the cell proliferation ability of CS-MSCs and GC-MSCs. The cells were stained with crystal violet after a two-week culture, and the relative intensity of the crystal violet was quantified. Data are presented as the mean ± SEM, n = 4, **P < 0.01. Scale bar, 50 μm. (B) SA-β-Gal staining of CS-MSCs and GC-MSCs. The percentages of SA-β-Gal-positive cells are shown in the right panel. Data are presented as the mean ± SEM, n = 3, **P < 0.01. Scale bar, 50 μm. (C) Apoptosis analysis of CS-MSCs and GC-MSCs 48 h after 10 J/m² UV irradiation. Quantitative data are presented as the mean ± SEM, n = 3, ***P < 0.001. (D) Western blots showing PARP cleavage of CS-MSCs and GC-MSCs in the presence of 10 J/m² UV exposure. β-Actin was used as a loading control. (E) Fat pad transplantation with CS-MSCs and GC-MSCs. Left: representative immunofluorescent images showing neovascularization; right: the number of hCD31-positive vessels calculated based on 24 slices from inconsecutive frozen sections. Data are presented as the mean ± SD, n = 3 for each group, **P < 0.01. Scale bar, 50 μm

DISCUSSION

Although several mouse models exhibiting the clinical symptoms of CS have been generated and have provided valuable insights into the disease mechanism, there are still many differences in clinical features between CS patients and mouse models. For instance, in contrast to human CS patients, who do not develop skin cancer, ERCC6 mutant mice show increased susceptibility to skin cancer (van der Horst et al., 1997, 2002). Thus, CS mouse models do not fully mimic the pathophysiology of CS patients, and the knowledge learned from animal models may be poorly translated to the clinic. CS patient-specific iPSCs were initially obtained by reprogramming fibroblasts from CS patients using retroviral vectors, and these cells exhibited an elevated cell death rate and increased ROS production (Andrade et al., 2012). Our study, however, did not identify increased oxidative stress or altered levels of TXNIP (Fig. S3A and S3B). These differences may be attributed to the reprogramming vectors. Andrade et al. used retroviral vectors, which may result in random genomic integration and genomic instability during the reprogramming process. In addition, the same research group recently reported that CS-iPSC-derived neurons display reduced synapse density and altered neural network synchrony (Vessoni et al., 2016). Again, this study was based on a retroviral vector-mediated somatic reprogramming technique. More importantly, due to the lack of an isogenic “disease-free” control iPSC line, it is hard to determine whether the phenotypic differences are caused by ERCC6 gene mutations or genetic background variations between CS patients and control individuals. To faithfully recapitulate human CS pathogenesis, a reliable human iPSC-based disease model with isogenic gene-corrected cells is required.
In this study, we generated transgene-free iPSCs from the fibroblasts of a CS patient bearing newly identified heterozygous disease-causing mutations in the ERCC6 gene and obtained isogenic gene-corrected iPSCs using the CRISPR/Cas9 system. These iPSCs were further differentiated into two types of adult stem cells, MSCs and NSCs, which presented a panel of new disease phenotypes.

Although previous studies have reported that the deficiency of functional DNA repair proteins may hinder somatic cell reprogramming and teratoma formation in vivo (i.e., WRN (Shimamoto et al., 2014; Wang et al., 2018c), p53 (Kawamura et al., 2009), and Fanconi genes (Muller et al., 2012)), we did not observe any defects in the derivation or pluripotency of CS patient-specific iPSC lines. Moreover, ERCC6 gene mutations did not compromise the chromosomal integrity of iPSCs, as indicated by karyotype analysis. Our study also provides proof of concept that CRISPR/Cas9-mediated gene editing may be amenable to correcting ERCC6 mutation in a therapeutic context. Whole-genome DNA sequencing demonstrated minimal mutational load in patient iPSCs after targeted gene correction.

Although CS patients exhibit musculoskeletal abnormalities (Hishiya and Watanabe, 2004), there are limited reports concerning mesodermal cells. Using an iPSC-based system, we have for the first time generated CS-specific MSCs that display differentiation potential towards osteoblasts, chondrocytes and white adipocytes, and these cells serve as a good cell model to study mesodermal abnormalities in CS patients. Consistent with the premature degeneration of mesenchymal progenitor cells, CS-MSCs exhibit decreased cell proliferation, accelerated senescence, and compromised differentiation ability towards osteoblasts, chondrocytes and white adipocytes, which may constitute one of the causes of the observed defects in the musculoskeletal system. In addition, in agreement with previous reports showing confounding defects in the neural system in CS patients (Cleaver et al., 2009; Natale, 2011; Laugel, 2013; Sacco et al., 2013; Ciaffardini et al., 2014; Vessoni et al., 2016), our data indicated severe DNA repair defects and increased susceptibility to UV-induced apoptosis in CS-iPSC-derived NSCs, therefore providing in-depth mechanistic insights into CS-associated neurological disorders.

Regarding the molecular mechanism, we have generated the first ERCC6 mutation-associated disease transcriptome landscapes of human MSCs and NSCs using an isogenic iPSC-based research system. Under normal culture conditions, mutation of ERCC6 resulted in the most dramatic gene expression changes in MSCs relative to NSCs and iPSCs. Consistent with this finding, CS-specific MSCs demonstrated cell type-specific accelerated senescence after serial passaging. These results suggest that the attrition of the MSC pool and the resulting mesodermal defects are a major syndrome of CS. UV radiation generates photoproducts in genomic DNA that promote genetic mutations that contribute to skin carcinogenesis or cellular senescence (Amaro-Ortiz et al., 2014; Kemp et al., 2017). In this study, we found that ERCC6 mutant MSCs and NSCs were highly susceptible to UV radiation. A defect in the initiation of transcription by RNAPII in UV-treated CS and XP/CS cells has been observed in previous studies (Rockx et al., 2000; Yamada et al., 2002; Proietti-De-Santis et al., 2006; Velez-Cruz et al., 2013). In line with these results, we observed that transcriptional blockage was rescued in gene-corrected CS-MSCs after UV irradiation. In addition, the presence of the ERCC6 mutation is associated with defects in gene expression linked to “cellular response to DNA damage”, “cellular response to stress” and “cell division”, indicating that the defective DNA repair in CS-specific adult stem cells mediates UV-induced cell phenotypic abnormalities. In addition, the mutation of ERCC6 also led to gene expression changes related to “regulation of chromatin organization” in both NSCs and MSCs. Therefore, the pathogenesis of CS may involve a complex interplay among defects in DNA damage repair, chromatin organization, and cell cycle control.

In the context of disease therapy, stem cell-based replacement therapy holds great promise toward restoring tissue homeostasis, e.g., for premature aging disorders (Golpanian et al., 2017; Tompkins et al., 2017). We and others have produced adult stem cells and other terminally differentiated cells from iPSCs derived from various human aging-related disorders, including Hutchinson-Gilford progeria syndrome (HGPS), Werner syndrome (WS), Fanconi anemia (FA), XP, amyotrophic lateral sclerosis (ALS), and Parkinson’s disease (PD) (Liu et al., 2011a, 2012, 2014; Zhang et al., 2015; Fu et al., 2016; Wang et al., 2017). Using targeted gene editing techniques, we have also edited/corrected pathogenic mutations in these patient-derived iPSCs (Liu et al., 2011b, 2012, 2014; Wang et al., 2017). MSCs can differentiate into osteoblasts, chondrocytes, myocytes and adipocytes. Previous studies have shown that MSCs ameliorate aging frailty in clinical trials (Golpanian et al., 2016, 2017; Tompkins et al., 2017). Recently, the generation of allogeneic or autologous MSCs from pluripotent stem cells has emerged as a promising new strategy for stem cell-based therapy (Yang et al., 2017; Castro-Vinuelas et al., 2018; Soontararak et al., 2018; Yan et al., 2019). In the present study, we have derived MSCs from gene-corrected CS-iPSCs under a cGMP-compliant condition. These MSCs demonstrated superior cellular activity compared to uncorrected diseased cells, retained high genomic stability, and did not form tumors in vivo. Therefore, clinical-grade GC-MSCs may represent important biomaterials for achieving autologous stem cell treatment for CS.

In summary, the isogenic CS stem cell models established in this study provide a valuable platform for studying CS pathogenesis, discovering innovative drugs, and developing new cell replacement therapies. The transcriptomic profiles underlying disease phenotypes may be useful for discovering biomarkers for diagnosis and for developing new therapeutic approaches.

MATERIALS AND METHODS

Antibodies and reagents

The primary antibodies used were as follows (company, catalogue number): anti-ERCC6 (Abcam, ab96098), anti-NANOG (Abcam, ab21624), anti-SOX2 (Santa Cruz, sc-17320), anti-OCT4 (Santa Cruz, sc-5279), anti-SMA (Sigma, A5228), anti-TUJ1 (Sigma, T2200), anti-FOXA2 (Cell Signaling Technology, 8186S), anti-CD90-FITC (BD Bioscience, 555595), anti-CD73-PE (BD Bioscience, 550257), anti-CD105-APC (BD Bioscience, 17-1057-42), anti-IgG-FITC (BD Biosciences, 555748), anti-IgG-PE (BD Biosciences, 555749), anti-IgG-APC (BD Biosciences, 555751), anti-Lamin B (Santa Cruz, sc-6217), anti-LAP2 (BD Bioscience, 611000), anti-Ki67 (ZSGB-BIO, ZM0166), anti-P16 (BD Bioscience, 550834), anti-γ-H2AX (Millipore, 05-636), anti-Nestin (Millipore, MAB5326), anti-PAX6 (Covance, PRB-278P), anti-CPD (Cosmo Bio, TMD-2), anti-cleaved PARP (Cell Signaling Technology, 9541), anti-β-Actin (Santa Cruz, sc69879), anti-GAPDH (Santa Cruz, sc-25778), and anti-hCD31 (BD Bioscience, 555445).

Generation and genotyping of CS-specific fibroblasts

CS-specific fibroblasts were generated from the skin biopsy of a CS patient carrying two heterozygous ERCC6 mutations: c.643G>T in exon 4 and c.3776C>A in exon 18. Fibroblasts were cultured with high-glucose DMEM (HyClone) containing 10% fetal bovine serum (FBS, Gemcell), 1% penicillin/streptomycin (Gibco), and 0.1 mmol/L non-essential amino acids (Gibco). Genotyping of CS-specific fibroblasts was performed using a genomic DNA PCR assay with the primers listed in Table S1. Genomic DNA from the fibroblasts of a healthy donor was used as a control, as previously described (Fu et al., 2016).

iPSC generation and culture

CS patient-specific iPSCs were generated by the electroporation of fibroblasts with episomal vectors, including pCXLE-hSK, pCXLE-hOCT3/4-shp53-F and pCXLE-hUL, as previously described (Okita et al., 2011; Liu et al., 2012, 2014; Fu et al., 2016; Wang et al., 2017). The derived iPSC lines were cultured on mitomycin C-treated MEF feeder cells in human ESC medium or on Matrigel (BD Biosciences)-coated plates in mTeSR medium (STEMCELL Technology). The ESC medium consisted of DMEM/F12 (Invitrogen) supplemented with 20% KnockOut Serum Replacement (Invitrogen), 0.1 mmol/L non-essential amino acids (NEAA, Invitrogen), 1% penicillin/streptomycin (Gibco), 2 mmol/L GlutaMAX (Invitrogen), 55 μmol/L β-mercaptoethanol (Invitrogen), and 10 ng/mL bFGF (Joint Protein Central).

Plasmid construction

Guide RNA (gRNA) was designed with http://crispr.mit.edu. The gRNAs were cloned into the pCAG-mCherry-gRNA vector (Addgene #87110). For the expression of Cas9 and GFP (Cas9-2A-GFP), the pCAG-1BPNLS-Cas9-1BPNLS-2AGFP plasmid (Addgene #87109) was used (Suzuki et al., 2016). The sequences for the gRNA target and ssODN used to repair mutant alleles are as follows: Exon 4-gRNA: GGATCACGCCAGTCTGGAGTAGG. ERCC6-ssODN, 5′-CTAAAGAGACACCCTCCACTGACTACAGGCATCAGGCATCAATTCAAGAACACAGAGAAACTGCTCCTAGCATCCTCACCTGCATCCTCtTCCAGACTGGCGTGATCTAGTTCAATTTTCACCTCTG-3′.

Targeted gene correction in CS-iPSCs via the CRISPR/Cas9 system

CRISPR/Cas9-mediated gene correction of ERCC6 mutation was performed as previously described with some modifications (Peters et al., 2008). Briefly, 5 × 10⁶ iPSCs were resuspended in 100 μL of Opti-MEM (Gibco) supplemented with 8 μg of Cas9-2A-GFP, 4 μg of gRNA-mCherry, and 8 μg of ssODN. After electroporation, the cells were cultured on Matrigel-coated plates in mTeSR medium. At forty-eight hours after electroporation, mCherry+/GFP+ cells were collected by FACS and replated onto MEF feeder cells. Two weeks later, the iPSC clones were picked and identified by genomic DNA PCR and sequencing. The primers used are listed in Table S1.

MSC generation and characterization

The differentiation of CS-iPSCs and GC-iPSCs into MSCs was performed as previously described (Zhang et al., 2015; Pan et al., 2016; Wang et al., 2018b). Briefly, embryoid bodies were plated onto Matrigel-coated plates in differentiation medium (αMEM (Invitrogen) supplemented with 10% FBS (Gemcell), 10 ng/mL bFGF (Joint Protein Central, JPC), 5 ng/mL TGFβ (Human Zyme), 0.1 mmol/L NEAA (Gibco) and 1% penicillin/ streptomycin (Gibco)). The differentiated cells were then subjected to FACS to purify the CD73/CD90/CD105 (MSC-specific surface markers) triple-positive MSCs. The purified MSCs were then cultured in αMEM medium supplemented with 10% FBS, 1 ng/mL bFGF, 1% penicillin/streptomycin, and 0.1 mmol/L NEAA.

Clinical-grade MSC differentiation and culture were performed in the cGMP level cell culture facility (Clinical-grade Stem Cell Research Center, Peking University Third Hospital) following the cGMP compliance guidelines. First, differentiation of iPSCs into MSCs was achieved using a process similar to that used for general MSCs, except that it was carried out under xeno-free and serum-free conditions. Briefly, embryoid bodies were plated onto vitronectin (Gibco, A14700)-coated plates in differentiation medium (BM MSC medium (Dakewe, DKW34-BM20500) supplemented with 5% serum replacement (Helios, GMP grade, HPCFDCGL50), 5 ng/mL TGFβ (Human Zyme), 6 ng/mL bFGF (Joint Protein Central, JPC), 10 ng/mL EGF (Joint Protein Central, JPC), 10 ng/mL PDGF (Joint Protein Central, JPC) and 1% penicillin/streptomycin (Gibco)). Next, the differentiated cells were subjected to FACS to purify the CD73/CD90/CD105 triple-positive MSCs. The purified MSCs were then cultured in BM MSC medium supplemented with 5% serum replacement and 1% penicillin/streptomycin.

The differentiation potential of the MSCs towards chondrocytes, osteoblasts and adipocytes was evaluated by staining with Alcian blue (chondrogenesis), von Kossa (osteogenesis) and an oil red O (adipogenesis) kit (IHC World) after differentiation of the indicated lineage, as previously described (Zhang et al., 2015; Pan et al., 2016; Wang et al., 2018b).

Sterility and pathogen testing of MSCs generated under a cGMP-compliant condition

The conditioned medium of GC-MSCs was collected for the following test. Cell debris in the conditioned medium was removed by centrifugation at 12,000 rpm and 4 °C for 5 min. In addition, the cell culture supernatant was immediately assayed. For CMV, HAV, HCV and HIV-1 ELISA detection, the optical density (O.D.) value for each sample was determined using a microplate reader set to 450 nm (OD450). The duplicate readings for each standard, control, and experimental sample were averaged, and the average zero standard O.D. was subtracted.

Mycoplasma detection

Mycoplasma in the supernatant of the conditioned medium was detected by PCR. The primer sequences are listed in Table S1.

Endotoxin detection

Endotoxin in the supernatant of the conditioned medium was detected with the ToxinSensor Gel Clot Endotoxin Assay Kit (GenScript, Cat. No. L00351) according to the manufacturer’s protocol. Briefly, 100 μL of the supernatants from the positive control (PC), negative control (NC) or experimental samples was transferred to the LAL reagent. The vials were capped and mixed thoroughly. All vials were placed in the incubation rack and incubated at 37 °C for 60 min. Then, the vials were inverted and checked to determine whether a gel was formed. The formation of the gel was considered endotoxin positive. The endotoxin level in the positive sample was higher than 0.25 EU/mL.

CMV detection

CMV IgM in the conditioned medium was detected by ELISA (MEDSON) according to the manufacturer’s instructions. Briefly, 100 μL of the supernatants from the PC, NC or experimental samples was pipetted onto the microplate. After incubation with antigen and conjugate solution, the absorbance of the samples was determined at 450 nm. The test results are interpreted as a ratio of the sample (S) OD450 nm and the cut-off (Co) value (S/Co) according to the following standard: S/Co < 1.0 was considered negative; S/Co > 1.2 was considered positive. Co = NC + 0.25.

HAV detection

HAV IgM and IgG in the conditioned medium were detected by ELISA (DIA. PRO) following the manufacturer’s protocol. Briefly, 100 μL of the supernatants from the PC, NC or experimental samples was pipetted onto the microplate. After incubation with antigen and conjugate solution, the absorbance of the samples was determined at 450 nm. The test results are interpreted as the ratio of the cut-off value to the sample OD450 (Co/S) according to the following standard: Co/S < 0.9 was considered negative; Co/S > 1.1 was considered positive. Co = (NC + PC) / 3.

HCV detection

HCV IgM and IgG in the conditioned medium were detected by ELISA (DIA. PRO) according to the manufacturer’s guidelines. First, 100 μL of the supernatants from the PC, NC or experimental samples was pipetted onto the microplate. After incubation with antigen and conjugate solution, the test results are interpreted as the ratio of OD450 of the sample to the cut-off value (S/Co) according to the following standard: S/Co < 0.9 was considered negative; S/Co > 1.1 was considered positive. Co = NC + 0.35.

HIV-1 detection

HIV-1 Gag p24 in the conditioned medium was detected by ELISA (R&D SYSTEMS) according to the manufacturer’s protocol. Briefly, 100 μL of the supernatants from the standard, control or experimental samples was pipetted onto the microplate. After incubation with conjugate solution, the concentration of each sample was calculated by OD450. The minimum detectable dose of HIV-1 Gag p24 ranged from 0.24–3.25 pg/mL.

Febrile pathogen detection

Pathogens in the conditioned medium were detected by the Febrile Antigens Kit (Rapid Labs). Briefly, 80 μL of the supernatants from the PC, NC or experimental samples was dispensed onto a 3 cm diameter circle. One drop of the antigen suspension was added to the sample. The reaction mixture was mixed well using a stirring stick, and the slide was rocked gently by hand for 1 min. The slides were immediately observed under suitable light for any degree of agglutination. Nonreactive: smooth suspension with no visible agglutination, as shown by the NC. Reactive: any degree of agglutination visible macroscopically.

NSC generation and characterization

NSC differentiation was conducted as previously described (Liu et al., 2012; Duan et al., 2015). In brief, iPSCs cultured on MEF feeder cells were differentiated with NIM-1 medium [50% Advanced DMEM/F12 (Invitrogen), 50% Neurobasal Medium (Invitrogen), 1× N2 Supplement (Invitrogen), 1× B27 Supplement (Invitrogen), 4 µmol/L CHIR99021 (Cellagentech), 3 µmol/L SB431542 (Cellagentech), 10 ng/mL human leukemia inhibitory factor (hLIF, Millipore), 2 µmol/L dorsomorphin (Sigma), 0.1 µmol/L Compound E (EMD Chemicals Inc.) and 2 mmol/L GlutaMAX (Invitrogen)]. Two days later, the medium was changed to NIM-2 medium (50% Advanced DMEM/F12, 50% Neurobasal Medium, 1× N2 Supplement, 1× B27 Supplement, 4 µmol/L CHIR99021, 3 µmol/L SB431542, 10 ng/mL hLIF, 0.1 µmol/L Compound E and 2 mmol/L GlutaMAX) for five more days. The NSCs were then generated and further cultured in NSC maintenance medium containing 50% Neurobasal Medium, 50% Advanced DMEM/F12, 1× N2 Supplement, 1× B27 Supplement, 2 mmol/L GlutaMAX, 3 μmol/L CHIR99021, 2 μmol/L SB431542 and 10 ng/mL hLIF.

Animal experiments

All animal experiments performed in this study were approved by the Chinese Academy of Science Institutional Animal Care and Use Committee. For the teratoma formation assay, 6-week-old male NOD-SCID mice were injected subcutaneously with 3 × 10⁶ CS-iPSCs or GC-iPSCs in a Matrigel/mTeSR solution, as previously described (Zhang et al., 2015). Teratomas with a size of approximately 10 mm in diameter were collected and subjected to immunostaining. For the MSC in vivo imaging assay, 10⁶ CS-MSCs or GC-MSCs expressing luciferase were transplanted into the TA muscle of 6-week-old male nude mice. The grafted cells were imaged with an IVIS spectrum imaging system (XENOGEN, Caliper) by detecting luciferase activity. To evaluate the potential tumorigenesis risk of GC-MSCs in vivo, a subcutaneous injection of GC-MSCs was performed in NSG mice. Human ESC (line H9) and U2-OS osteosarcoma cell lines were also implanted independently as positive controls. Fat pad transplantation was performed as previously described (Yu et al., 2016; Geng et al., 2018). CS-MSCs or GC-MSCs (1.5 × 10⁵) were freshly collected and resuspended in Matrigel mixture containing 50% Matrigel, 20% FBS in PBS, and 0.01% Trypan Blue (Sigma). The mixture was then injected into the fat pads of 3-week-old female NOD-SCID mice. Four weeks later, the fat pads were harvested for measuring MSC-derived vessel regeneration by immunofluorescence staining.

Senescence-associated β-galactosidase (SA-β-Gal) staining assay

SA-β-Gal staining was performed according to a previously described method (Debacq-Chainiaux et al., 2009; Zhang et al., 2015; Pan et al., 2016; Geng et al., 2018; Wang et al., 2018b). Each experiment was performed in three independent replicates.

Clonal expansion assay

Approximately 2000 cells were seeded into each well of 12-well plates and cultured for 2 weeks. Then, the cells were stained with 0.2% crystal violet, and the intensity of the crystal violet staining was quantified by ImageJ software. Each experiment was performed in three independent replicates.

RT-qPCR

Total RNA was extracted with TRIzol reagent (Invitrogen), and 2 μg of total RNA was used for cDNA synthesis using a reverse transcription master mix (Promega). Quantitative real-time PCR was conducted with the iTaq Universal SYBR Green Super Mix (Bio-Rad) with the CFX384 Real-Time PCR system (Bio-Rad). All data were normalized to the 18S rRNA transcript and calculated using the ΔΔCq method. All RT-qPCR primer pairs are listed in Table S1.

Western blot

Western blot was performed as previously described (Wang et al., 2015, 2016). Briefly, protein quantification was conducted using a BCA Kit. Protein lysates were subjected to SDS-PAGE and subsequently electrotransferred to a polyvinylidene fluoride membrane (Millipore). The membrane was incubated with the indicated primary antibodies overnight at 4 °C and HRP-conjugated secondary antibodies for 1 h at room temperature (RT), followed by visualization using the ChemiDoc XRS system (Bio-Rad). Quantification was performed with ImageJ software.

Immunofluorescence

Immunofluorescence was conducted as previously described (Wang et al., 2016). Briefly, the cells were fixed with 4% paraformaldehyde for 25 min, permeabilized with Triton X-100 (0.3% in PBS) for 25 min, incubated with blocking buffer (10% donkey serum in PBS) for 1 h at RT, and stained with primary antibodies overnight at 4 °C. Then, the cells were incubated with secondary antibodies for 1 h at RT. Hoechst 33342 (Invitrogen) was used to stain nuclear DNA.

Analysis of apoptosis by flow cytometry

A FACS-based apoptosis analysis was performed as previously described (Fu et al., 2016; Pan et al., 2016). For ROS measurement, cells were collected and incubated with 1 μmol/L H2DCFDA for 30 min using ROS Detection Reagents (Molecular Probes, C6827). The cells were later analysed using the BD LSRFortessa cell analyser.

RNA sequencing library construction

Total RNA for each sample was extracted using the RNeasy Mini Kit (Qiagen) according to the manufacturer’s instructions. After quantification of the RNA by a fragment analyzer (Advanced Analytical), RNA sequencing libraries were constructed using the TruSeq RNA Sample Preparation Kit (Illumina) according to the manufacturer’s protocols. Paired-end sequencing was performed using the Illumina HiSeq X Ten platform.

RNA sequencing data processing

RNA-seq data processing was performed as previously described (Zhang et al., 2015, 2019; Geng et al., 2018; Wang et al., 2018a; Ling et al., 2019). In brief, sequencing reads were trimmed and mapped to the H. sapiens reference genome (hg19) with HISAT2 software (v2.0.4) (Kim et al., 2015). HTSeq (v0.10.0) was used to determine the transcriptional expression level of each gene (Anders et al., 2015). Differentially expressed genes (DEGs) were computed at a cut-off adjusted P value (Benjamini-Hochberg) less than 0.05 and |Log2(fold change)| more than 1 using DESeq2 (Love et al., 2014). Pearson’s correlation coefficient (R) and the Euclidian distance were calculated using R to evaluate the correlation between the replicates of each sample, which were based on Log2(FPKM + 1). PCA was also performed using R based on Log2(FPKM + 1). Gene ontology (GO) enrichment analysis was computed by Metascape (Tripathi et al., 2015). The enrichment networks were visualized using Cytoscape (Shannon et al., 2003). Protein-protein interaction networks of overlapping genes were drawn based on the search tool for the retrieval of interacting genes (STRING) database (Szklarczyk et al., 2017). The aging-associated genes were obtained from the human aging genomic resources (HAGR) database (Tacutu et al., 2013).

DNA extraction, library construction and sequencing

Genomic DNA was extracted from each sample using the QIAamp® DNA Mini Kit (Qiagen), according to the manufacturer’s protocol. DNA was randomly fragmented into ~300 bp lengths using a Covaris ultrasonic processor. DNA libraries were prepared with the NEBNext® Ultra™ DNA Library Prep Kit (Illumina) and quantified using a Qubit 2.0 Fluorometer (Life Technologies). The insert sizes of the fragments in the libraries were determined by the Agilent Bioanalyzer 2100. Paired-end sequencing was performed using the Illumina HiSeq X Ten platform.

Bioinformatics analyses of copy number variations, single-nucleotide variants and off-target sites

The pipeline of whole genome sequencing data processing used in this study has been described previously (Zhang et al., 2018). In brief, sequencing data were mapped to the H. sapiens reference genome (hg19) without repeat regions using the Burrows-Wheeler Aligner (BWA, version 0.7.17) (Li and Durbin, 2009). The genomic coverage for each 500 kb bin window was calculated and normalized by the average sequencing depth. The copy number variation (CNV) scatterplot was drawn by ggplot2. For the single-nucleotide variant (SNV) analysis, the read base sites with an incorrect base probability >0.001 were masked with N, and base distribution for each chromosomal location was calculated by pysamstats (version 1.0.1) (https://github.com/alimanfoo/pysamstats). The heterozygosity of each site was defined as the percentage of the second enriched base depth. SNV sites were defined by base heterozygosity (0%–30%). Potential indel sites were extracted with pysamstats (version 1.0.1) under default setting. Then indel sites were screened with sites existing in CS-iPSC genomic sequencing datasets, repeats and low-complexity regions annotated by RepeatMasker (db20170127), indel-type SNPs in humans and homopolymers. Simultaneously, 2034 off-target sites with no more than five mismatched sites were identified by Cas-OFFinder (Bae et al., 2014). None of these regions included indel sites identified by whole genome sequencing.

Statistical analysis

All results are presented as the mean ± SEM or mean ± SD. The data were statistically analysed using a two-tailed Student’s t-test to compare differences between treatments assuming equal variance with PRISM software (GraphPad 5 Software). P values <0.05, <0.01, and <0.001 were considered statistically significant (*, **, and ***, respectively).

Accession numbers

The sequencing data have been deposited in the NCBI Gene Expression Omnibus (GEO) under the accession number GSE124208 and in the NCBI Sequence Read Archive under the accession number SRP174074.

Electronic supplementary material

Below is the link to the electronic supplementary material. +

Supplementary material 1 (PDF 3822 kb)

Supplementary material 2 (XLSX 13 kb)

Si Wang, Zheying Min, and Qianzhao Ji have contributed equally.

Change history

1/15/2022

A Correction to this paper has been published: 10.1007/s13238-021-00901-3

Acknowledgements

The authors acknowledge L. Bai, R. Bai, Q. Chu, J. Lu, S. Ma and Y. Yang for administrative assistance and W. Li, J. Jia and X. Zhang for assistance with animal experiments. This work was supported by the National Key Research and Development Program of China (2018YFC2000100), the Strategic Priority Research Program of the Chinese Academy of Sciences (XDA16010100), the National Key Research and Development Program of China (2018YFA0107203, 2017YFA0103304, 2017YFA0102802, 2016YFC1000601, 2015CB964800, 2014CB910503, and 2018YFA0108500), the National Natural Science Foundation of China (Grant Nos. 81625009, 81330008, 91749202, 91749123, 31671429, 81671377, 81771515, 31601109, 31601158, 81701388, 81601233, 81822018, 81801399, 31801010, 81801370, 81861168034, 81571400, and 81771580), the Program of the Beijing Municipal Science and Technology Commission (Z151100003915072), the Key Research Program of the Chinese Academy of Sciences (KJZDEWTZ-L05), the Beijing Municipal Commission of Health and Family Planning (PXM2018_026283_000002) and the Advanced Innovation Center for Human Brain Protection (117212, 3500-1192012).

ReferencesAmaro-OrtizAYanBD’OrazioJAUltraviolet radiation, aging and the skin: prevention of damage by topical cAMP manipulationMolecules2014196202621924838074AndersSPylPTHuberWHTSeq: a Python framework to work with high-throughput sequencing dataBioinformatics20153116616925260700AndradeLNNathansonJLYeoGWMenckCFMuotriAREvidence for premature aging due to oxidative stress in iPSCs from Cockayne syndromeHum Mol Genet2012213825383422661500AndressooJOMitchellJRde WitJHoogstratenDVolkerMToussaintWSpeksnijderEBeemsRBvan SteegHJansJAn Xpd mouse model for the combined xeroderma pigmentosum/Cockayne syndrome exhibiting both cancer predisposition and segmental progeriaCancer Cell20061012113216904611BaeSParkJKimJSCas-OFFinder: a fast and versatile algorithm that searches for potential off-target sites of Cas9 RNA-guided endonucleasesBioinformatics2014301473147524463181CadetJSageEDoukiTUltraviolet radiation-mediated damage to cellular DNAMutat Res200557131715748634Castro-VinuelasRSanjurjo-RodriguezCPineiro-RamilMHermida-GomezTFuentes-BoqueteIMde Toro-SantosFJBlanco-GarciaFJDiaz-PradoSMInduced pluripotent stem cells for cartilage repair: current status and future perspectivesEur Cell Mater2018369610930204229CiaffardiniFNicolaiSCaputoMCanuGPaccosiECostantinoMFrontiniMBalajeeASProietti-De-SantisLThe cockayne syndrome B protein is essential for neuronal differentiation and neuritogenesisCell Death Dis20145e126824874740CleaverJELamETRevetIDisorders of nucleotide excision repair: the genetic and molecular basis of heterogeneityNat Rev Genet20091075676819809470Debacq-ChainiauxFErusalimskyJDCampisiJToussaintOProtocols to detect senescence-associated beta-galactosidase (SA-betagal) activity, a biomarker of senescent cells in culture and in vivoNat Protoc200941798180620010931DingZSuiLRenRLiuYXuXFuLBaiRYuanTHaoYZhangWA widely adaptable approach to generate integration-free iPSCs from non-invasively acquired human somatic cellsProtein 
Cell2015638638925412771DuanSYuanGLiuXRenRLiJZhangWWuJXuXFuLLiYPTEN deficiency reprogrammes human neural stem cells towards a glioblastoma stem cell-like phenotypeNat Commun201561006826632666FriedbergECHow nucleotide excision repair protects against cancerNat Rev Cancer20011223311900249FriedbergECDNA damage and repairNature200342143644012540918FuLNXuXLRenRTWuJZhangWQYangJPRenXQWangSZhaoYSunLModeling xeroderma pigmentosum associated neurological pathologies with patients-derived iPSCsProtein Cell2016721022126874523GengLLiuZZhangWLiWWuZWangWRenRSuYWangPSunLChemical screen identifies a geroprotective role of quercetin in premature agingProtein Cell201810.1007/s13238-018-0567-y30069858GolpanianSDiFedeDLPujolMVLoweryMHLevis-DusseauSGoldsteinBJSchulmanIHLongsomboonBWolfAKhanARationale and design of the allogeneiC human mesenchymal stem cells (hMSC) in patients with aging fRAilTy via intraveno US delivery (CRATUS) study: A phase I/II, randomized, blinded and placebo controlled trial to evaluate the safety and potential efficacy of allogeneic human mesenchymal stem cell infusion in patients with aging frailtyOncotarget20167118991191226933813GolpanianSDiFedeDLKhanASchulmanIHLandinAMTompkinsBAHeldmanAWMikiRGoldsteinBJMushtaqMAllogeneic human mesenchymal stem cell infusions for aging frailtyJ Gerontol A20177215051512GorgelsTGvan der PluijmIBrandtRMGarinisGAvan SteegHvan den AardwegGJansenGHRuijterJMBergenAAvan NorrenDRetinal degeneration and ionizing radiation hypersensitivity in a mouse model for Cockayne syndromeMol Cell Biol2007271433144117145777HishiyaAWatanabeKProgeroid syndrome as a model for impaired bone formation in senile osteoporosisJ Bone Miner Metab20042239940315316860JaarsmaDvan der PluijmIde WaardMCHaasdijkEDBrandtRVermeijMRijksenYMaasAvan SteegHHoeijmakersJHAge-related neuronal degeneration: complementary roles of nucleotide excision repair and transcription-coupled repair in preventing neuropathologyPLoS 
Genet20117e100240522174697KarikkinethACScheibye-KnudsenMFivensonECroteauDLBohrVACockayne syndrome: clinical features, model systems and pathwaysAgeing Res Rev20173331727507608KawamuraTSuzukiJWangYVMenendezSMoreraLBRayaAWahlGMIzpisua BelmonteJCLinking the p53 tumour suppressor pathway to somatic cell reprogrammingNature20094601140114419668186KempMGSpandauDFTraversJBImpact of age and insulin-like growth factor-1 on DNA damage responses in UV-irradiated human skinMolecules201722356KimDLangmeadBSalzbergSLHISAT: a fast spliced aligner with low memory requirementsNat Methods20151235736025751142KubbenNZhangWWangLVossTCYangJQuJLiuGHMisteliTRepression of the antioxidant NRF2 pathway in premature agingCell20161651361137427259148LaugelVCockayne syndrome: the expanding clinical and mutational spectrumMech Ageing Dev201313416117023428416LiHDurbinRFast and accurate short read alignment with Burrows–Wheeler transformBioinformatics2009251754176019451168LiYZhangWChangLHanYSunLGongXTangHLiuZDengHYeYVitamin C alleviates aging defects in a stem cell model for Werner syndromeProtein Cell2016747848827271327LingCLiuZSongMZhangWWangSLiuXMaSSunSFuLChuQModeling CADASIL vascular pathologies with patient-derived induced pluripotent stem cellsProtein Cell.20191024927130778920LiuGHBarkhoBZRuizSDiepDQuJYangSLPanopoulosADSuzukiKKurianLWalshCRecapitulation of premature ageing with iPSCs from Hutchinson–Gilford progeria syndromeNature201147222122521346760LiuGHSuzukiKQuJSancho-MartinezIYiFLiMKumarSNivetEKimJSoligallaRDTargeted gene correction of laminopathy-associated LMNA mutations in patient-specific iPSCsCell Stem Cell2011868869421596650LiuGHQuJSuzukiKNivetELiMMontserratNYiFXuXRuizSZhangWProgressive degeneration of human neural stem cells caused by pathogenic LRRK2Nature201249160360723075850LiuGHSuzukiKLiMQuJMontserratNTarantinoCGuYYiFXuXZhangWModelling Fanconi anemia pathogenesis and therapeutics using integration-free patient-derived iPSCsNat Commun20145433024999918LoveMIHuberWAndersSModerated 
estimation of fold change and dispersion for RNA-seq data with DESeq2Genome Biol20141555025516281McKayBCCabritaMAArresting transcription and sentencing the cell: the consequences of blocked transcriptionMech Ageing Dev201313424325223542592MullerLUMilsomMDHarrisCEVyasRBrummeKMParmarKMoreauLASchambachAParkIHLondonWBOvercoming reprogramming resistance of Fanconi anemia cellsBlood20121195449545722371882MuraiMEnokidoYInamuraNYoshinoMNakatsuYvan der HorstGTHoeijmakersJHTanakaKHatanakaHEarly postnatal ataxia and abnormal cerebellar development in mice lacking Xeroderma pigmentosum Group A and Cockayne syndrome Group B DNA repair genesProc Natl Acad Sci USA200198133791338411687625NataleVA comprehensive description of the severity groups in Cockayne syndromeAm J Med Genet A2011155A1081109521480477NewmanJCBaileyADWeinerAMCockayne syndrome group B protein (CSB) plays a general role in chromatin maintenance and remodelingProc Natl Acad Sci USA20061039613961816772382OkitaKMatsumuraYSatoYOkadaAMorizaneAOkamotoSHongHNakagawaMTanabeKTezukaKA more efficient method to generate integration-free human iPS cellsNat Methods2011840941221460823OrozcoLSolerRMoreraCAlbercaMSanchezAGarcia-SanchoJIntervertebral disc repair by autologous mesenchymal bone marrow cells: a pilot studyTransplantation20119282282821792091OrozcoLMunarASolerRAlbercaMSolerFHuguetMSentisJSanchezAGarcia-SanchoJTreatment of knee osteoarthritis with autologous mesenchymal stem cells: a pilot studyTransplantation2013951535154123680930OrozcoLMunarASolerRAlbercaMSolerFHuguetMSentisJSanchezAGarcia-SanchoJTreatment of knee osteoarthritis with autologous mesenchymal stem cells: two-year follow-up resultsTransplantation201497e66e6824887752PanHGuanDLiuXLiJWangLWuJZhouJZhangWRenRLiYSIRT6 safeguards human mesenchymal stem cells from oxidative stress by coactivating NRF2Cell Res20162619020526768768Peters DT, Cowan CA, Musunuru K (2008) Genome editing in human pluripotent stem cells. 
In: StemBook, CambridgeProietti-De-SantisLDranePEglyJMCockayne syndrome B protein regulates the transcriptional program after UV irradiationEMBO J2006251915192316601682RockxDAMasonRvan HoffenABartonMCCitterioEBregmanDBvan ZeelandAAVrielingHMullendersLHUV-induced inhibition of transcription involves repression of transcription initiation and phosphorylation of RNA polymerase IIProc Natl Acad Sci USA200097105031050810973477SaccoRTamblynLRajakulendranNBralhaFNTropepeVLaposaRRCockayne syndrome b maintains neural precursor functionDNA Repair20131211012023245699SetlowRBSetlowJKEvidence that ultraviolet-induced thymine dimers in DNA cause biological damageProc Natl Acad Sci USA1962481250125713910967ShannonPMarkielAOzierOBaligaNSWangJTRamageDAminNSchwikowskiBIdekerTCytoscape: a software environment for integrated models of biomolecular interaction networksGenome Res2003132498250414597658ShehataLSimeonovDRRaamsAWolfeLVanderverALiXHuangYGarnerSBoerkoelCFThurmAERCC6 dysfunction presenting as progressive neurological decline with brain hypomyelinationAm J Med Genet A2014164A2892290025251875ShimamotoAKagawaHZenshoKSeraYKazukiYOsakiMOshimuraMIshigakiYHamasakiKKodamaYReprogramming suppresses premature senescence phenotypes of Werner syndrome cells and maintains chromosomal stability over long-term culturePLoS ONE20149e11290025390333SoontararakSChowLJohnsonVCoyJWheatWReganDDowSMesenchymal stem cells (MSC) derived from induced pluripotent stem cells (iPSC) equivalent to adipose-derived MSC in promoting intestinal healing and microbiome normalization in mouse inflammatory bowel disease modelStem Cells Transl Med2018745646729635868SuzukiKTsunekawaYHernandez-BenitezRWuJZhuJKimEJHatanakaFYamamotoMAraokaTLiZIn vivo genome editing via CRISPR/Cas9 mediated homology-independent targeted integrationNature201654014414927851729SzklarczykDMorrisJHCookHKuhnMWyderSSimonovicMSantosADonchevaNTRothABorkPThe STRING database in 2017: quality-controlled protein-protein association networks, made 
broadly accessibleNucleic Acids Res201745D362D36827924014TacutuRCraigTBudovskyAWuttkeDLehmannGTaranukhaDCostaJFraifeldVEde MagalhaesJPHuman ageing genomic resources: integrated databases and tools for the biology and genetics of ageingNucleic Acids Res201341D1027D103323193293TompkinsBADiFedeDLKhanALandinAMSchulmanIHPujolMVHeldmanAWMikiRGoldschmidt-ClermontPJGoldsteinBJAllogeneic mesenchymal stem cells ameliorate aging frailty: a phase II randomized, double-blind, placebo-controlled clinical trialJ Gerontol A20177215131522TripathiSPohlMOZhouYRodriguez-FrandsenAWangGSteinDAMoultonHMDeJesusPCheJMulderLCMeta- and orthogonal integration of influenza “OMICs” data defines a role for UBR4 in virus buddingCell Host Microbe20151872373526651948van der HorstGTvan SteegHBergRJvan GoolAJde WitJWeedaGMorreauHBeemsRBvan KreijlCFde GruijlFRDefective transcription-coupled repair in Cockayne syndrome B mice is associated with skin cancer predispositionCell1997894254359150142van der HorstGTMeiraLGorgelsTGde WitJVelasco-MiguelSRichardsonJAKampYVreeswijkMPSmitBBootsmaDUVB radiation-induced cancer predisposition in Cockayne syndrome group A (Csa) mutant miceDNA Repair2002114315712509261van der PluijmIGarinisGABrandtRMGorgelsTGWijnhovenSWDiderichKEde WitJMitchellJRvan OostromCBeemsRImpaired genome maintenance suppresses the growth hormone–insulin-like growth factor 1 axis in mice with Cockayne syndromePLoS Biol20075e217326724Velez-CruzREglyJMCockayne syndrome group B (CSB) protein: at the crossroads of transcriptional networksMech Ageing Dev201313423424223562425Velez-CruzRZadorinASCoinFEglyJMSirt1 suppresses RNA synthesis after UV irradiation in combined xeroderma pigmentosum group D/Cockayne syndrome (XP-D/CS) cellsProc Natl Acad Sci USA2013110E212E22023267107VessoniATHeraiRHKarpiakJVLealAMTrujilloCAQuinetAAgnez LimaLFMenckCFMuotriARCockayne syndrome-derived neurons display reduced synapse density and altered neural network synchronyHum Mol 
Genet2016251271128026755826WangSWangXWuYHanCIGF-1R signaling is essential for the proliferation of cultured mouse spermatogonial stem cells by promoting the G2/M progression of the cell cycleStem Cells Dev20152447148325356638WangSWangXMaLLinXZhangDLiZWuYZhengCFengXLiaoSRetinoic acid is sufficient for the in vitro induction of mouse spermatocytesStem Cell Rep201678094WangLXYiFFuLNYangJPWangSWangZXSuzukiKSunLXuXLYuYCRISPR/Cas9-mediated targeted gene correction in amyotrophic lateral sclerosis patient iPSCsProtein Cell2017836537828401346WangPLiuZZhangXLiJSunLJuZLiJChanPLiuGHZhangWCRISPR/Cas9-mediated gene knockout reveals a guardian role of NF-kappaB/RelA in maintaining the homeostasis of human vascular cellsProtein Cell2018994596529968158WangSHuBDingZDangYWuJLiDLiuXXiaoBZhangWRenRATF6 safeguards organelle homeostasis and cellular aging in human mesenchymal stem cellsCell Discov20184229423270WangSLiuZYeYLiBLiuTZhangWLiuGHZhangYAQuJXuDEctopic hTERT expression facilitates reprograming of fibroblasts derived from patients with Werner syndrome as a WS cellular modelCell Death Dis2018992330206203WuZZhangWSongMWangWWeiGLiWLeiJHuangYSangYChanPDifferential stem cell aging kinetics in Hutchinson–Gilford progeria syndrome and Werner syndromeProtein Cell2018933335029476423YamadaAMasutaniCHanaokaFDetection of reduced RNA synthesis in UV-irradiated Cockayne syndrome group B cells using an isolated nuclear systemBiochim Biophys Acta2002159212913412379475YanPLiQWangLLuPSuzukiKLiuZLeiJLiWHeXWangSFOXO3-engineered human ESC-derived vascular cells Promote vascular protection and regenerationCell Stem Cell201910.1016/j.stem.2018.12.00231173712YangJLiJSuzukiKLiuXWuJZhangWRenRZhangWChanPIzpisua BelmonteJCGenetic enhancement in cultured human adult stem cells conferred by a single nucleotide recodingCell Res2017271178118128685772YuQCSongWWangDZengYAIdentification of blood vascular endothelial stem cells by the expression of protein C receptorCell 
Res2016261079109827364685ZhangWLiJSuzukiKQuJWangPZhouJLiuXRenRXuXOcampoAAging stem cells. A Werner syndrome stem cell model unveils heterochromatin alterations as a driver of human agingScience20153481160116325931448ZhangWWanHFengGQuJWangJJingYRenRLiuZZhangLChenZSIRT6 deficiency results in developmental retardation in cynomolgus monkeysNature201856066166530135584ZhangXLiuZLiuXWangSZhangYHeXSunSMaSShyh-ChangNLiuFTelomere-dependent and telomere-independent roles of RAP1 in regulating human stem cell homeostasisProtein Cell201910.1007/s13238-019-0610-731781970
diff --git a/jcore-pmc-db-reader/src/test/resources/testdocs/PMC7511315.xml b/jcore-pmc-db-reader/src/test/resources/testdocs/PMC7511315.xml new file mode 100644 index 000000000..b28626ba1 --- /dev/null +++ b/jcore-pmc-db-reader/src/test/resources/testdocs/PMC7511315.xml @@ -0,0 +1,28 @@ + +
pmcNat CommunNat CommunNature Communications2041-1723Nature Publishing Group UKLondon32968055PMC75113151839610.1038/s41467-020-18396-7ArticleTranscriptomic profiling of human cardiac cells predicts protein kinase inhibitor-associated cardiotoxicityhttp://orcid.org/0000-0002-1664-7314van HasseltJ. G. Coen12RahmanRayees1http://orcid.org/0000-0002-1362-6534HansenJens1SternAlan1ShimJaehee V.1XiongYuguang1PickardAmanda1JayaramanGomathi1HuBin1MahajanMilind3GalloJames M.14GoldfarbJoseph1SobieEric A.1http://orcid.org/0000-0002-0341-0705BirtwistleMarc R.15http://orcid.org/0000-0003-4007-7814SchlessingerAvner
avner.schlessinger@mssm.edu
1
http://orcid.org/0000-0001-6137-109XAzelogluEvren U.
evren.azeloglu@mssm.edu
16
http://orcid.org/0000-0002-7814-0180IyengarRavi
ravi.iyengar@mssm.edu
1
grid.59734.3c0000 0001 0670 2351Department of Pharmacological Sciences and Systems Biology Center New York, Icahn School of Medicine at Mount Sinai, New York, NY USA grid.5132.50000 0001 2312 1970Division of Systems Biomedicine and Pharmacology, Leiden Academic Centre for Drug Research, Leiden University, Leiden, Netherlands grid.59734.3c0000 0001 0670 2351Department of Genetics and Genomic Sciences, and Icahn Institute for Genomic Sciences and Multiscale Biology, Icahn School of Medicine at Mount Sinai, New York, NY USA grid.273335.30000 0004 1936 9887Department of Pharmaceutical Sciences, School of Pharmacy and Pharmaceutical Sciences, University at Buffalo, Buffalo, NY USA grid.26090.3d0000 0001 0665 0280Department of Chemical and Biomolecular Engineering, Clemson University, Clemson, SC USA grid.59734.3c0000 0001 0670 2351Department of Medicine, Division of Nephrology, Icahn School of Medicine at Mount Sinai, New York, NY USA
2392020239202020201148091220171882020© The Author(s) 2020https://creativecommons.org/licenses/by/4.0/Open Access This article is licensed under a Creative Commons Attribution 4.0 International License, which permits use, sharing, adaptation, distribution and reproduction in any medium or format, as long as you give appropriate credit to the original author(s) and the source, provide a link to the Creative Commons license, and indicate if changes were made. The images or other third party material in this article are included in the article’s Creative Commons license, unless indicated otherwise in a credit line to the material. If material is not included in the article’s Creative Commons license and your intended use is not permitted by statutory regulation or exceeds the permitted use, you will need to obtain permission directly from the copyright holder. To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/.

Kinase inhibitors (KIs) represent an important class of anti-cancer drugs. Although cardiotoxicity is a serious adverse event associated with several KIs, the reasons remain poorly understood, and its prediction remains challenging. We obtain transcriptional profiles of human heart-derived primary cardiomyocyte like cell lines treated with a panel of 26 FDA-approved KIs and classify their effects on subcellular pathways and processes. Individual cardiotoxicity patient reports for these KIs, obtained from the FDA Adverse Event Reporting System, are used to compute relative risk scores. These are then combined with the cell line-derived transcriptomic datasets through elastic net regression analysis to identify a gene signature that can predict risk of cardiotoxicity. We also identify relationships between cardiotoxicity risk and structural/binding profiles of individual KIs. We conclude that acute transcriptomic changes in cell-based assays combined with drug substructures are predictive of KI-induced cardiotoxicity risk, and that they can be informative for future drug discovery.

Cardiotoxic adverse events associated with kinase inhibitors are a growing concern in clinical oncology. Here the authors use cellular transcriptomic responses of human cardiomyocytes treated with protein kinase inhibitors and the associated drug structural signatures to determine an integrated predictive signature of cardiotoxicity.

Subject termsToxicologyPredictive markersCardiologyhttps://doi.org/10.13039/100000051U.S. Department of Health & Human Services | NIH | National Human Genome Research Institute (NHGRI)U54HG008098IyengarRaviissue-copyright-statement© The Author(s) 2020
Introduction

Protein kinase inhibitors (KIs) are an important class of therapeutics used for the treatment of various forms of cancer1,2 and other diseases. There are currently more than 48 KIs approved for clinical use by the U.S. Food and Drug Administration (FDA) and other regulatory agencies3, and more than 250 KIs are undergoing clinical trials or are in development4–6. The clinical effectiveness of KIs as cancer drugs has led to a broad effort to develop drugs that are more efficacious and have a reduced propensity for adverse events. Cardiotoxicity (CT) is a clinically important adverse event associated with several KIs7–10. KI-associated CT manifests as loss of cardiomyocyte function, which can lead to heart failure11. Given the extensive therapeutic potential of KIs, approaches to identify and subsequently mitigate the risk for CT during early development of novel KIs and during clinical administration are urgently required.

We do not yet sufficiently understand the mechanisms underlying KI-associated CT. The human kinome consists of more than 500 protein kinases12. Given that many KIs exhibit multitarget pharmacology13, inhibition of multiple protein kinases in cardiomyocytes may lead to adverse drug effects such as CT14. For individual KIs, pathways involved in mitochondrial function8,15,16, endoplasmic reticulum stress response16, and AMPK inhibition17, have been shown to be associated with KI-induced CT18. Overall, however, the general mechanisms of KI-induced CT are still poorly understood18.

Obtaining quantitative clinical risk scores for KI-associated CT is also challenging, as the risk for KI-associated CT has not been systematically studied. The FDA Adverse Event Reporting System (FAERS) database has been previously applied to quantify the risk of adverse drug reactions (ADRs)19–21. The FAERS database contains over 9 million individual drug-associated adverse-event reports submitted by industry and physicians. Through statistical analyses of the FAERS database, relatively unbiased estimates for the relative risk for specific ADRs can be computed. Such risk scores are clinically relevant as they are based on a real-life patient population, and they are not solely based on selected patient cohorts. We previously used such analyses of the FAERS database in combination with systems pharmacology-based approaches to obtain mechanistic insights into adverse-event mechanisms21,22.

In the current study, generated as part of the NIH-funded Library of Integrated Network Based Cellular Signatures (LINCS) Drug Toxicity Signature Generation Center (DToxS), we take a top–down global approach to determine if a comprehensive profiling of gene expression changes in human cardiomyocytes can provide insight into pathways associated with KI-induced CT, and to potentially predict the risk of CT. The rationale for this approach is based on the central assumption that CT largely originates from cardiomyocytes where one or more protein kinases contribute to the pathophysiology. Since progression to heart failure takes several months to manifest, it is not immediately obvious if gene expression changes measured after drug treatment for a few days would have predictive value. Thus, a second important assumption is that early changes in gene expression upon drug treatment of cardiomyocytes are indicative of later physiological events. We test the validity of our assumptions by experimentally obtaining gene-expression patterns for the different KIs and determining whether these patterns can be selectively associated with the clinical risk of CT for each KI, thereby providing gene-expression signatures for KI-associated CT.

We report the generation of transcriptomic profiles from four human primary cardiomyocyte-like cell lines. These profiles are generated using 23 KIs that were FDA-approved and used extensively at the time of experimental design, such that an adequate number of clinical reports have been collected. Drugs are used at their imputed therapeutic concentrations. Through this pan-KI transcriptomic profiling, we obtained insights into the affected pathways that may be related to KI-associated CT. We show that selective patterns of gene expression can be associated with the FAERS-derived clinical risk for KI-associated CT, which may be highly relevant to identify KI drug candidates at risk for showing clinical CT. We also describe the relationships between KI CT risk and structural properties of KIs, highlighting the potential for re-engineering small molecules that exhibit a high risk for CT.

Results

Differences in CT risk of kinase inhibitors

In order to obtain unbiased estimates of clinical risk of KI-associated CT, we analyzed individual adverse-event reporting data from FAERS (Fig. 1a). Reporting odds ratios (RORs) were derived based on the relative frequencies of AE occurrence of each KI compared to all KIs. These risk scores provide a relative ranking of KI-associated toxicity. Kinase inhibitors were shown to have pronounced differences in the relative risk of CT (Fig. 1b). When comparing the ranking of risk scores derived from FAERS with adverse drug-reaction (ADR) reporting data from the World Health Organization (WHO) ADR reporting database, we find that the ranking from these databases largely agrees (Fig. 1c), indicating the general consistency of the clinical risk scores across databases.

Cardiotoxicity of protein kinase inhibitors.

a Approach to quantify relative clinical cardiotoxicity risk scores for kinase inhibitors from the FDA Adverse Event Reporting System (FAERS) database. b Reporting odds ratio (mean and 95% confidence interval of computed odds ratio) for cardiotoxicity across kinase inhibitors from FAERS. c Comparison of ranking derived from FAERS and WHO Pharmacovigilance data shows agreement. d Literature-reported in vitro and in vivo preclinical assays to predict KI-associated cardiotoxicity poorly correlated with clinical FAERS-derived risk scores for cardiotoxicity at clinical drug concentrations. e In vitro dose–response experiments for selected KIs for viability and mitochondrial stress poorly correlate with clinical FAERS-derived risk scores for cardiotoxicity. Source data are provided in source data file.

Phenotypic assays poorly correlate with CT

We performed a literature review for in vitro and in vivo experimental datasets that aimed to predict CT risk based on phenotypic readouts, such as cell viability or beating rate from in vitro cardiomyocyte or animal models, to determine if such phenotypic experiments can predict the clinical risk scores for CT. Studies in which drugs at the clinical concentration induced more than a 20% change in various phenotypic readouts compared to control experiments were classified as predicting potential CT (Fig. 1d). Across these studies, it was apparent that there was no identifiable relationship between apparent experimental toxicity in comparison to the relative incidence of CT in patients as derived from FAERS.

We conducted dose–response experiments with selected KIs that had varying risks for CT using the cardiomyocyte cell lines that were used in the current study for transcriptomic profiling, quantifying cell viability, and mitochondrial stress after 48 h of exposure to the selected KIs. We again assessed if drugs caused more than a 20% change in cell viability and mitochondrial stress at the typical clinically used concentration (Supplementary Table 1). These studies showed a similar lack of correlation with clinical risk (Fig. 1e, Supplementary Fig. 1). These findings underscore the need for alternative approaches such as early molecular signatures for CT. This lack of predictiveness of preclinical in vitro and in vivo phenotypic assays has also been noted by others7.

Transcriptomic profiling of human primary cardiomyocyte-like cell lines

To study the transcriptomic response to KIs associated with CT, we obtained four primary cardiomyocyte lines that were isolated from ventricles of healthy adult human heart (two male and two female, PromoCell GmbH, Germany). Culture conditions, detailed phenotypic characterization of each cell line, including gene and protein expression, morphology, and functional assays, can be found on the DToxS Center website (www.dtoxs.org) under the “Cellular Metadata” section.

Confluent cardiomyocyte-like cells were treated with drugs for 48 h at concentrations similar to their clinical concentration (Supplementary Table 1) with 3–4 replicates and 3–4 cell lines (Supplementary Table 2), after which RNA was extracted and sequenced using the 3′ digital gene-expression method23 (Fig. 2a).

Overview of pan-KI transcriptomic profiling in human primary cardiomyocyte-like cells.

a Overview of experimental approach to generate transcriptomic data. For each drug, genes were ranked by absolute mean fold-change gene-expression value across replicates (>3 biological replicates) and cell lines (a total of 1309 experiments), and the top 250 genes for each KI were kept. Information about the total number of replicates can be found in the source data file. b Jaccard similarity of gene-expression signature of PromoCell cardiomyocyte cell lines (102 samples) to gene-expression signatures of tissues available in the GTEx database (17,382 total samples). Boxplot whiskers refer to the upper and lower quartile of all pairwise Jaccard coefficients between each sample, within each tissue type. Information about each boxplot’s sample size, minima, maxima, and center is provided in the source data file. c Heatmap depicting the Jaccard index that indicates the magnitude of similarity in top-ranking differentially expressed genes for all KI pairs. d First three principal components (PCs) based on full mean fold-change gene-expression profiles across KIs. Source data are provided in source data file.

We investigated if transcriptomic profiles of PromoCell cardiomyocytes are related to human heart tissue and hence a good model to study CT. We compared the gene-expression similarity of untreated PromoCell cardiomyocytes against tissues available in the Genotype-Tissue Expression (GTEx) project, which contains gene-expression data from many human tissues, including the heart (Fig. 2b)24. Using the Jaccard distance for the top expressed 250 genes (based on transcript per million counts) for both untreated PromoCell and GTEx tissues, we observe that PromoCell cardiomyocytes exhibit gene expression similar to that of blood (rank 2), muscle (rank 4), and heart (rank 10) tissue. Based on these results, we conclude that the PromoCell cardiomyocytes can offer comparable gene-expression changes to that of cardiomyocytes.

Limited overlap in differentially expressed genes across KIs

Differential gene-expression fold-change values were computed across the four cell lines. Initial analyses showed that the DEGs generally clustered more strongly by drugs than by cells. We calculated median fold-change values for each KI across cell lines, resulting in a single gene-expression profile for each KI. Ranked gene lists for each KI were generated by ranking by differential gene-expression p value and keeping the top 250 genes. To assess the similarity between genes present in the top 250 genes for each KI, the Jaccard index was calculated for each ranked list of KI-specific genes, which indicated a limited overlap (<0.25) between the top 250 genes across KIs (Fig. 2c). Principal component analysis showed variable gene-expression patterns for nine KIs, while for the remaining KIs, little variation in gene expression was seen (Fig. 2d), even though these remaining KIs included drugs for which CT is well established. We concluded that ranked differential gene-expression values would not be sufficient to provide clear insights into gene-expression profiles associated with CT.

Pathways correlated with KI-associated CT

To identify pathways and subcellular processes across KIs and their potential involvement with CT, we performed enrichment analysis for protein kinases and KEGG terms using the top 250 differentially expressed genes ranked by p value across cell lines and KIs. We then correlated p values of enriched terms with clinical FAERS-derived risk scores to identify potential kinases and pathways associated with CT risk (Fig. 3a). The protein kinase LIMK2, which is involved in actin cytoskeleton reorganization pathways, ranked the highest in its correlation specifically enriched for KIs with a higher risk score (Fig. 3b). Sucrose- and pyruvate-metabolism pathways were the most strongly enriched pathways correlating with high risk scores (Fig. 3c). However, since no directionality in pathways is considered in these enrichment analyses, both the positively and negatively correlated processes may play a role in the development of CT. When considering enriched protein kinases and KEGG processes across all KIs without considering correlation to CT risk, multiple pathways were identified (Supplementary Fig. 2). These findings indicate that there is likely substantial complexity underlying the action of KI in cardiomyocytes, although currently these analyses remain correlational and do not offer proof of causal relationships.

Analysis of transcriptomic profiling data in relation to cardiotoxicity risk.

a Flowchart indicating ranked lists of top 250 differentially expressed genes ranked by p value for each kinase inhibitor across cell lines from the transcriptomic cardiomyocyte profiling, which were then enriched and subsequently related to clinical cardiotoxicity risk scores. Enriched kinases (b) and enriched KEGG pathways (c) (p < 0.05) that show a correlation coefficient > |0.25| with cardiotoxicity risk scores and the associated enrichment p values. Source data are provided in source data file.

Transcriptomic signature to predict CT risk

We tested if our KI-wide fold-change gene-expression profiles correlated with the KI-specific clinical risk scores for CT to identify a predictive transcriptomic signature for CT risk. Given the limited similarity between top-ranking gene-expression profiles across KIs, the entirety of the gene-expression profiles for different KIs were considered as potential predictors for KI-associated CT risk. KI-specific expression profiles of 10,749 genes were available as potential predictors for KI-specific CT risk scores. To identify genes most strongly associated with CT risk, we used an elastic net-penalized regression approach, which aims to select the most predictive variables while avoiding overfitting25.

A two-stage regression analysis was performed (Fig. 4a). From the available 23 KIs with the associated clinical CT risk scores, we randomly left out 2 KIs for external validation of the model (test set, 10% of data). The differential gene-expression profiles of 21 remaining KIs were then used to train the model. Given the limited number of available drugs, small changes in expression patterns for a drug were expected to affect the identity of the overall set of predictor genes. Therefore, we generated bootstrap datasets by random resampling of KI risk and the associated gene-expression profiles. These bootstrapped datasets were then fit using elastic net models. This first step was performed to identify gene-based predictors that could consistently predict CT risk and contributed significantly to the prediction of this risk. The bootstrap analysis resulted in stable selection of potential predictors. Predictors to be included in the final elastic net regression model were selected based on their minimal root-mean-squared prediction error (RMSE) after cross-validation. Based on this cross-validation, the gene-expression-based predictors in the final elastic net models consisted of 26 genes with the associated variable importance values (Fig. 4b).

Regression analysis for transcriptomic signatures to predict clinical risk.

a Overview of processing and elastic net regression analysis of transcriptomic data in combination with FAERS-derived clinical risk scores. b Transcriptomic signature genes selected to predict cardiotoxicity risk score indicating their variable importance. c Observed and predicted risk scores from the elastic net cross-validation analysis (mean and standard deviation). d External validation of the signature for six kinase inhibitors: regorafenib (REG), sunitinib (SUN), ibrutinib (IBR), lenvatinib (LEN), nintedanib (NIN), and ceritinib (CER).

Repeated cross-validation analyses indicated good predictive performance of the model for left-out KIs (Fig. 4c). We evaluated our 26-gene signature for predicting CT risk on an independent validation set of six KIs, of which three KIs were previously untested (Fig. 4d). We note that the independent validation set was performed 1 year after the original signatures were generated, using a different experimental protocol for the transcriptomic assay that was based on mRNA detection using random primers. We observed accurate predictive performance for five out of six KIs tested. The outlier, ibrutinib, had the lowest, albeit acceptable, predictive performance, with an error of 0.493 between the predicted and observed risk scores. Taken together, the developed signature can be of relevance to support risk prioritization of newly developed KIs. When we tested which of the 21 KIs drove the prediction strength of the model, we found that excluding any of four low-CT risk drugs (cabozantinib, tofacitinib, pazopanib, and erlotinib) increased the error substantially, indicating that these KIs contribute distinct information to the signature. In contrast, several of the high-ranking CT drugs could be excluded without sacrificing accuracy (Supplementary Fig. 3).

We then used the 26-gene signature to construct a protein–protein interaction network analysis to identify protein kinases and transcription factors associated with the signature (Supplementary Fig. 4). Several protein kinases were retrieved that are both known targets of the studied KIs, and which may be associated with the occurrence of KI-induced CT.

Chemical structures of KIs inform CT risk

Off-target binding or polypharmacology is commonly observed in KIs23. Since off-target binding is dependent on the structure of the drug, we investigated the relationship between kinase inhibitor chemical structure, binding target profile, and CT risk. To do this, we generated a structure–activity–similarity (SAS) map of the 26 tested inhibitors (in both the training and validation set) and their CT-risk score (Fig. 5a)26. SAS maps can be divided into four quadrants: the upper-left quadrant shows KI pairs with low chemical similarity and large changes in CT risk. The lower-left quadrant describes largely dissimilar KI pairs with small changes in CT risk. The lower-right quadrant describes KI pairs that exhibit a “smooth” structure–activity relationship, that is, small changes in chemical similarity are associated with small changes in CT risk. Finally, the upper-right quadrant indicates highly chemically similar compounds with large changes in CT risk.

Structure–activity–similarity (SAS) maps of kinase inhibitor activity and cardiotoxicity.

a A SAS map relating pairwise chemical similarity measured by Tanimoto coefficient (Tc) derived from a weighted average of 4 chemical fingerprints (ECFP4, ECFP2, Daylight, and MACCS), between pairs of 26 kinase inhibitors (Table 1) and their difference in cardiotoxicity scores (DCS). The threshold for chemical similarity was the top 10% value in the distribution of Tc values: 0.38. The threshold value for DCS was half of the maximum DCS score: 0.82. b Highlighted chemical scaffolds for distinct kinase inhibitors observed in the upper- and lower-right regions. c Binding profile of kinase inhibitors based on data from Klaeger et al.5. Kinase inhibitors were hierarchically clustered based on chemical similarity, and kinase inhibitors are annotated by their binding mode (e.g., type I, type I1/2, type II, type III, type IV, or type VI)6. d Kinase inhibitor selectivity scores at 500 nM Kd. e Observed cardiotoxicity risk scores were normalized to zero and ordered based on hierarchical clustering of the kinase inhibitors. f Predicted cardiotoxicity risk scores were normalized to zero and ordered based on hierarchical clustering of the kinase inhibitors. g Absolute error from observed and predicted cardiotoxicity risk scores. Source data are provided in source data file.

KI pairs in the upper-right region represent activity cliffs, that is, small changes in chemical structure are associated with large changes in CT risk. In this region, we find several KI pairs, in particular, we observe large activity cliffs between afatinib and bosutinib as well as bosutinib and erlotinib. Here, all four compounds have the same chemical core (Fig. 5b); however, both afatinib and erlotinib show respectively lower CT risk scores compared to bosutinib. We hypothesized that harmonization of drug substructure, similarity, and promiscuity in the context of kinase inhibitor type may inform on our ability to predict CT risk (Fig. 5c).

By investigating their KI target profiles, we observe that both afatinib and erlotinib are less promiscuous KIs compared to bosutinib (which is one of the most promiscuous KIs in this set, Fig. 5d), and they both inhibit EGFR at nanomolar concentrations. On the other hand, less promiscuous KIs, such as lapatinib and gefitinib, exhibit a comparably lower CT risk score (Fig. 5e). Indeed, we observe a correlation between kinase inhibitor promiscuity and the observed CT risk score (Supplementary Fig. 5). However, KI promiscuity may not be the sole determinant of CT risk. For example, KIs such as imatinib and nilotinib are not as promiscuous as bosutinib; however, both exhibit relatively high CT risk scores. In this case, both imatinib and nilotinib CT may be explained due to their similar chemical structure and high specificity for protein kinases such as DDR1 and ABL.

Finally, kinase inhibitors have distinct binding modes against their targets6,27,28. Kinase inhibitors that bind their kinase targets can be classified based on their binding mode, including the kinase conformation they bind and/or type of interactions they make with their kinase targets (e.g., covalent vs. noncovalent)6,27,29. For example, type I inhibitors bind an active kinase conformation, while Type I1/2, II–V bind distinct inactive states (Methods); type VI KI binds the kinase target covalently. We do not observe a clear relationship between kinase inhibitor-binding mode and CT. For example, the type II inhibitors imatinib and nilotinib are observed to have a high CT risk, while the type II inhibitors sorafenib and regorafenib have comparatively lower observed CT risk. However, both pairs of inhibitors are highly chemically similar and have similar binding targets. Taken together, the observed CT risk of a KI may be related to both a kinase inhibitor’s selectivity and its chemical structure. Furthermore, we observe a relationship between chemical structure and binding target similarity to the predictive performance of our signature (Fig. 5e–g).

Discussion

The occurrence of drug treatment-associated CT, leading to decreased cardiac function, follows the therapeutic effects of the drugs, and is only observed in a subset of the patients using the drug. This raises the question of whether it would be possible to obtain early cell-based signatures predictive for drug toxicity. Here we addressed this question by attempting to associate drug treatment-induced gene-expression patterns with the clinical risk for the adverse events of interest.

By estimating clinical risk from the FAERS database, our method utilizes a relevant and unbiased approach for the quantification of CT risk. As a result, our CT risk scores lack notable pitfalls such as selection bias associated with tightly controlled clinical trials, which underestimate adverse-event risks due to cohort size, trial duration, and selective inclusion criteria for subjects. Nevertheless, there are limitations to the FAERS database as well, which we have discussed and addressed in previous work22. Specifically, use of the FAERS resource may confound demographics information such as age and sex, which was observed not to vary across different KIs. Moreover, CT risk score does not reflect absolute risk for developing CT. Rather, it reflects the relative risk for a subset of patients for which drug-associated adverse events were reported. In addition, there may be some systematic biases based on the sampling frequency of drugs by institution.

It remains unclear if all KIs induce CT through similar mechanisms, and to what extent ultimate clinical pathologies are similar. While the FAERS database allows us to distinguish between different types of CT, the annotation is not uniform and may either refer to distinct pathophysiological descriptions or rather more general clinical presentations of heart failure. To this end, we chose to lump all forms of heart failure, while excluding cardiac AEs that have known and unrelated origin such as coronary artery disease and arrhythmias.

We compared KI-associated transcriptomic response profiles generated from cultured human primary cardiomyocyte-like cells with clinical CT risk scores to obtain a reduced set of genes that may predict the relative risk for KI-associated CT. Using the clinically weighted signatures and the associated regression coefficients identified in the elastic net model, the relative risk for CT can be predicted. The risks predicted by our signatures and the associated regression model can be used in drug development to rank the potential risk of novel KIs with respect to existing KIs with better characterized clinical risks for CT.

The signatures generally showed good prediction of CT risk during cross-validation as well as on an independent set of KIs (Fig. 4), while the only poorly performant KI, ibrutinib, inhibitor of Bruton nonreceptor protein-tyrosine kinase, represents a unique KI in terms of binding mode (i.e., type VI inhibitor) and high promiscuity (Fig. 5). Specifically, it is a member of an emerging class of kinase inhibitor drugs that bind their targets covalently (type VI KIs). These drugs are highly underrepresented in the databases used in this analysis, explaining the misclassification of ibrutinib30.

The four cell lines we studied are insufficient to fully capture such human variability to KIs. Therefore, in our analysis, we used median fold-change gene-expression profiles across multiple cell lines. The resulting averaged gene-expression profiles thus reflect relatively consistent changes in gene expression across cell lines, i.e., changes in gene expression that are less likely to be highly variable across cell lines, yet may also reflect a set of predictors that may be more consistent in the population. Given that the FAERS CT risk scores also reflect a population-level CT risk, the use of these median values in fold-change gene-expression values is a reasonable starting point for our analyses.

The experimental underpinning of the transcriptomic profiles generated in this study makes them likely to be of value in selecting drug candidates with a low risk for CT as an adverse event. Our analysis is based on primary human heart-derived cardiomyocyte-like cells. Although these cell lines do have phenotypic limitations due to dedifferentiation, the signatures obtained from the cells could be relevant for prediction of clinical drug effects. These cell lines may be reflective of human cardiac pharmacology, i.e., in comparison with animal-derived cardiomyocytes, even though further characterization and standardization are still needed. Detailed characterization of these cell lines is available as metadata to the RNAseq datasets at www.dtoxs.org. Our analyses used drug exposures similar to clinically reported maximum plasma concentrations of the individual KIs, rather than using the same concentrations for all KIs, even though we did not correct for protein binding. We expect that the duration of 48-h exposure may reflect transcriptomic changes that are likely related to early changes in subcellular processes associated with the adverse event of interest.

Unfortunately, in this study, it is not feasible or ethically possible, due to lack of prior informed consent, to compare cardiac gene-expression signatures with gene-expression profiles from patients who receive KI-therapy and/or who developed KI-associated CT. We considered whether we could compare our gene-expression signatures to cardiac gene-expression data from patients with heart failure who undergo surgery. Typically these are patients with advanced disease, and the gene expression in tissue from advanced disease is not likely to be of relevance to acute drug-induced CT.

By investigating the chemical structure and binding profile similarity of KIs, we are able to observe that chemical components and scaffolds that lead to promiscuous binding of KIs to multiple binding targets are correlated with higher CT values. This is consistent with the notion that a portion of CT risk of KIs can be attributed to higher levels of off-target interactions. Indeed, when we investigate the binding profile of three chemically similar KIs: afatinib, erlotinib, and gefitinib, we find that their binding profiles are fairly specific compared to other KIs, and they have a lower normalized CT risk score. One limitation we have observed with our approach is that chemically distinct KIs (e.g., in terms of binding profile, substructural similarity, and type), such as the type VI inhibitor ibrutinib, exhibit diminished predictive performance. However, we think that using the guidelines we provide herein, this signature could still assist in the development and prioritization of KIs with lower toxicity risks.

We cautiously anticipate that clinically weighted transcriptomic signatures such as those developed in this study may be of relevance to guide safety assessment in early drug development. Unlike the relatively well-established assessment of electrophysiological safety issues such as QT prolongation, the assessment of non-QT type of CT associated with KI16 and other novel drugs31, lacks reliable biomarkers. The transcriptomic signature for CT identified in this study may help fill this gap, especially if its structure and binding profiles are closely represented within the inhibitors in this study. One could anticipate that after initial selection of promising KIs with apparent efficacy in preclinical screens, transcriptomic profiling using the signatures developed here may possibly be used to rank drugs for the expected CT risk and exclude those with high CT risk scores (Supplementary Fig. 6).

While beyond the scope of this study, future extension of our studies could explore the idea of studying individualized risk scores for CT. That is, do baseline gene-expression profiles of larger libraries of patient-derived cardiomyocyte cell lines predict the difference in risk for CT between individual patients? Ideally, such an analysis would be conducted using induced pluripotent stem cell-derived cardiomyocytes from patients, who have received KIs and experienced different levels of CT, such as was recently described for anthracycline chemotherapeutics32. This would then further enable the development of precision medicine approaches to KI therapy that could minimize the risk for CT.

Methods

Cell culture and drug treatment

Adult human cardiomyocytes (Cat #: C12810) were purchased from PromoCell GmbH (Heidelberg, Germany) and grown in culture as per the manufacturer’s instructions. Four different cell line lots (Lot #: 3042901.2, 4031101.3, 2082801.2, and 2120301.2) isolated from two male and two female subjects were cultured under serum-free differentiation conditions for at least 28 days prior to drug treatment. Details regarding metadata information, including cell line metadata and the quality control and assurance metrics, can be found on www.dtoxs.org.

Dose–response experiments

For two of the four cell lines, dose–response experiments were conducted treating cells for 48 h with eight increasing perturbagen concentrations (5 nM, 50 nM, 100 nM, 500 nM, 1 µM, 5 µM, 10 µM, and 100 µM) and vehicle-treated control, in quadruplicates. We assayed for viability through image-based analysis of nuclear counts with Hoechst 33342 (Thermo Fisher, Cat #: H3570) and MitoTracker Red (Thermo Fisher, Cat #: M22426) for mitochondrial toxicity. Details of the experimental protocols for cell culture, drug treatment, and transcriptomics have been described as step-by-step standard operating procedures for the various experiments available on www.dtoxs.org.

Transcriptomics

Cells were treated for 48 h with a single perturbagen concentration around the maximal concentration (Supplementary Table 1). After drug treatment, the cells were lysed, RNA was collected using TRIzol, and gene-expression profiles were measured using the 3′ digital gene-expression method33,34.

Sequence alignment and processing of gene-expression data

The raw sequences were demultiplexed. Combined standard RNAseq files were aligned to the reference human genome hg38 using the STAR software suite35. The resulting alignment files were parsed to identify the fragments with acceptable alignment quality, to remove duplicate fragments, and to assign accepted fragments to the corresponding genes. The resulting read-count (i.e., transcript count) table was then subjected to correlation analysis at each treatment condition, to identify and remove outlier samples, determined by predefined thresholds. The gene read-count tables were then subjected to differential gene-expression analysis using the R package EdgeR36. Details of these computational procedures are described elsewhere23, and step-by-step protocols are available on www.dtoxs.org. The resulting normalized and log-transformed fold-change gene-expression values for each sample are also deposited for public access to the DToxS data repository (www.dtoxs.org).

Processing and exploratory analysis of gene-expression data

The median log-transformed gene-expression fold-change value was calculated across all cell lines for each individual KI. The resulting matrix of gene fold-change values by KIs was used for the regression analysis. To obtain insight into the general patterns present in this KI-perturbed transcriptomics dataset, we generated rankings of the top 500 genes for each drug, by their absolute mean fold-change value, i.e., whether positive or negative. For each of these KI-associated rankings, we determined the frequency of these changes being also present in the ranking of other drugs, e.g., the similarity in genes present in the top 250 gene lists for each KI. This was visualized using the Jaccard index, and by plotting the most highly drug-connected genes against the associated drugs. Principal component analysis for the first three principal components on the absolute mean fold-change values for each drug was performed to further assess similarity between drugs in their gene-expression values.

Calculation of tissue cell line expression similarity

Pairwise expression similarity scores were computed based on the Jaccard coefficient of a binary matrix based on RNA sequencing data from PromoCell cardiomyocyte exposures to kinase inhibitors. The top 500 genes for a KI were set as 1, while genes that were not in the top 500 were set as 0.

Calculation of clinical risk RORs

Adverse-event frequencies from the FDA Adverse Event Reporting System (FAERS) were obtained from the AERSmine resource37, which contains a curated version of the FAERS database. ADRs in the FAERS database are organized according to MedDRA38, which is a hierarchical ontology to classify ADRs from high-level organs associated with the pathology to reported low-level specific pathological conditions. We downloaded the frequencies of the occurrence of ADRs for all protein KIs available in FAERS, together with all other frequencies of ADRs reported for these KIs. A time-stamped record of this download to reproduce this analysis was retained. RORs were then computed for each KI using the frequency $f_{dt}$ of the ADR of interest, the frequency $f_{dn}$ of any other ADR occurring, the frequency $f_{nt}$ of occurrence of the ADR of interest for any other protein kinase inhibitor, and the frequency $f_{nn}$ for all other ADRs and KIs. The ROR was calculated using Eq. (1): $$\mathrm{ROR} = \frac{f_{dt}/f_{dn}}{f_{nt}/f_{nn}}, \qquad (1)$$ whereas the standard error (SE) of the log ROR was calculated using Eq. (2): $$\mathrm{SE}_{\mathrm{logROR}} = \sqrt{\frac{1}{f_{dt}} + \frac{1}{f_{dn}} + \frac{1}{f_{nt}} + \frac{1}{f_{nn}}}, \qquad (2)$$ with the log-transformed confidence interval (CI) being calculated as follows: $\mathrm{CI} = \log(\mathrm{ROR}) \pm 1.96 \cdot \mathrm{SE}_{\mathrm{logROR}}$.

Adverse events in FAERS are mapped to the MedDRA dictionary38. CT events related to heart failures and cardiomyopathies, excluding arrhythmogenic ADRs and coronary artery disorders, were selected from the main MedDRA cardiac ADR group. The selected ADRs primarily reflected different stages of heart failure, which were grouped together.

Elastic net regression analysis

The FAERS-derived risk RORs for CT were regressed against the KI-associated vectors of mean fold-change values across the four cell lines. A two-step regression procedure was then used to select predictor genes reducing the sensitivity to changes in dataset composition. For this, we first generated 1000 bootstrap datasets with replacement for gene expression–KI risk score pairs. Each of these bootstrap datasets was fit using an elastic net regression model (R version 3.4.3, package glmnet, version 2.0-16). The genes that were selected as predictors (i.e., nonzero regression coefficient) and the scaled values of the gene-associated coefficients were saved for each bootstrap dataset. Across all bootstrap datasets, the relative frequency of the selection of gene-based predictors, and the mean-scaled coefficient value was computed. We then calculated the product of the mean frequency and scaled coefficient value, to rank predictors by their importance with respect to robustness (selection frequency). A large number of percentiles of these rankings were evaluated using leave-one-out cross-validation. The selection percentile (99.755%) resulting in optimal prediction errors (RMSE) was then used to select a subset of gene-based predictors, and the model that generated the final gene-expression signatures. The selected predictor genes were then ranked by their relative importance, and by their median fold-change values, and displayed as clustered heatmaps. We finally evaluated the predictive value of the resulting regression model to predict CT risk scores for the two left-out KIs.

When using this approach to analyze similar datasets of cardiomyocyte transcriptomes together with risk scores, it is possible that potentially different genes are identified than those described in the current report. This difference is associated with the intrinsic property of penalized regression approaches that select predictors from potentially highly correlated sets of predictor candidates. Hence, small changes in either risk scores or gene-expression datasets may affect correlation structures of the data and thereby the list of genes for a signature.

Enrichment and network analyses

Enrichment analysis was performed based on a one-tailed Fisher’s exact test using R (package stats), in order to identify enrichment of specific genes in predefined gene lists. For enrichment of pathways and biological processes, we used the KEGG database (2016), and for enrichment of protein kinases, we used the KEA database (2015). Diseases were excluded from the KEGG list of processes (e.g., diabetes, depression, and cancer), in order to only evaluate general biological processes or pathways. We used the top 250 DEGs ranked by p value for each KI to perform enrichment analysis. Subsequently enriched term p values were correlated with CT risk scores to identify kinases and pathways associated with CT risk.

The gene part of the signature for CT identified in the regression analysis was used as seed node to perform a protein–protein interaction network (PPI) analysis, conducted using the web application X2K39, which aims to identify associated kinases and transcription factors based on multiple PPI databases.

Calculation of chemical similarity

RDkit (www.rdkit.org)40 was used to generate chemical fingerprints and compute pairwise Tanimoto coefficients (Tc) between the 26 tested kinase inhibitors. For each pair of inhibitors, we first calculated the Tc using four chemical fingerprints, including Morgan_2 2,048-bit (ECFP4)41, Morgan_1 2,048-bit (ECFP2)41, Daylight-like42, and MACCS43. Because each of these fingerprints captures distinct chemical properties, we computed a weighted Tc average of the four fingerprints: 30% ECFP4, 30% ECFP2, 30% Daylight-like, and 10% MACCS, which exhibited the optimal spread of the distribution of the pairwise distances. To generate the SAS maps (Fig. 5a), we plotted the pairwise-weighted Tc values with their difference in CT scores (DCS). Finally, 0.35 was set as the threshold for chemical similarity, while half of the maximum difference was set as the threshold for DCS. Chemical structures were drawn using Marvin (www.chemaxon.com)44 based on SMILES strings obtained from PubChem.

Calculation of KI-binding target similarity

Kinome-wide kinase inhibitor-binding (Kd) profiling data were obtained from Klaeger et al.5, which consisted of kinome-binding (Kd) profiling data for all of the tested kinase inhibitors across 242 kinases. A heatmap was generated for selected kinase inhibitors based on the negative log of the Kd values from Klaeger et al. (Fig. 5c)5. Notably, the Kd values were scaled by 100,000 to avoid negative log values.

Overview of KIs included in this analysis.

Drug | Three-letter code | Approval year (a) | Therapeutic targets | Concentration (µM) (b)
Afatinib | AFA | 2013 | ErbB2 and EGFR | 0.05
Axitinib | AXI | 2012 | VEGFR1/VEGFR2/VEGFR3/PDGFRB/c-KIT | 0.2
Bosutinib | BOS | 2012 | Bcr-Abl and SRC | 0.1
Cabozantinib | CAB | 2012 | c-Met and VEGFR2 | 2
Ceritinib | CER | 2014 | ALK | 1
Crizotinib | CRI | 2011 | ALK and HGFR | 0.25
Dabrafenib | DAB | 2013 | BRAF | 2.5
Dasatinib | DAS | 2006 | ABL, ARG, KIT, PDGFRα/β, and SRC | 0.1
Erlotinib | ERL | 2004 | ErbB1 | 3
Gefitinib | GEF | 2003 | ErbB1 | 1
Imatinib | IMA | 2001 | Bcr-Abl | 5
Lapatinib | LAP | 2007 | ErbB1 | 2
Nilotinib | NIL | 2007 | Bcr-Abl | 3
Pazopanib | PAZ | 2009 | VEGFR2, PDGFRα/β, and KIT | 10
Ponatinib | PON | 2012 | Bcr-Abl, BEGFR, PDGFR, FGFR, EPH, SRC, c-KIT, RET, TIE2, and FLT3 | 0.1
Regorafenib | REG | 2012 | RET, VEGFR, and PDGFR | 1
Ruxolitinib | RUX | 2011 | JAK | 1
Sorafenib | SOR | 2005 | BRAF, VEGFRs, PDGFRα/β, FLT3, and KIT | 0.5
Sunitinib | SUN | 2006 | VEGFR, PDGFR, CSF1R, FLT3, and KIT | 1
Trametinib | TRA | 2013 | MEK1 and MEK2 | 0.1
Tofacitinib | TOF | 2012 | JAK | 1
Vandetanib | VAN | 2011 | RET, VEGFR, and EGFR | 0.33
Vemurafenib | VEM | 2011 | BRAF | 2

(a) US approval date, first indication.

(b) Derived from maximum total (bound + free) plasma concentrations in humans as reported in the literature.

Table S3 lists the purity and literature references to clinical concentrations.

Reporting summary

Further information on research design is available in the Nature Research Reporting Summary linked to this article.

Supplementary information

+

Supplementary Information

+

Peer Review File

+

Reporting Summary

+

Source data

+

Source Data

+

Peer review information: Nature Communications thanks the anonymous reviewers for their contribution to the peer review of this work. Peer reviewer reports are available.

Publisher’s note Springer Nature remains neutral with regard to jurisdictional claims in published maps and institutional affiliations.

These authors contributed equally: J. G. Coen van Hasselt, Rayees Rahman.

These authors jointly supervised this work: Avner Schlessinger, Evren U. Azeloglu, Ravi Iyengar.

Supplementary information

Supplementary information is available for this paper at 10.1038/s41467-020-18396-7.

Acknowledgements

This project was supported in part by the NIH LINCS center grant (U54 HG008098) and the Systems Biology Center grant (P50 GM071558). J.G.C.H. received funding from the European Union MSCA program (Project ID 661588). This work was partially carried out using the Dutch national e-infrastructure with the support of SURF Foundation.

Author contributions

J.G.C.H. and R.R. performed the data analysis; J.G.C.H., R.R., J.H., M.R.B., E.S., A.Sc., E.U.A., and R.I. wrote the paper; Y.X. performed RNAseq data processing; A.P. and J.M.G. performed the mass spectrometry drug purity analyses; A.St., B.H., G.J., and J.V.S. performed the cell culture, drug perturbation, and the RNA isolation; E.U.A. supervised the experimental efforts; E.U.A. and J.M.G. determined the experimental drug concentrations and purity; M.M. supervised the RNA sequencing; J.G. supervised the quality assurance and assay reproducibility; A.Sc. supervised the cheminformatics analysis; R.I. conceived the project; all authors reviewed the paper.

Data availability

All processed RNAseq data and the curated version-controlled standard operating procedures featured in this study can be downloaded freely at (www.dtoxs.org)22 or the LINCS Data Portal (http://lincsportal.ccs.miami.edu/dcic-portal/). Raw transcriptomics data can be accessed through the Gene Expression Omnibus (GEO) repository with accession numbers GSE146096 and GSE146097. Source data for each figure are provided with this paper. All remaining data will be available from the corresponding author upon reasonable request. Source data are provided with this paper.

Code availability

All scripts are open-source and available from the DToxS GitHub repository (https://github.com/dtoxs).

Competing interests

R.R. and A.S. are co-founders of Aichemy Inc. The remaining authors declare no competing interests.

ReferencesCohenPThe role of protein phosphorylation in human health and disease: delivered on June 30th 2001 at the FEBS meeting in LisbonEur. J. Biochem.20012685001501010.1046/j.0014-2956.2001.02473.x11589691GiamasGKinases as targets in the treatment of solid tumorsCell. Signal.201022984100210.1016/j.cellsig.2010.01.01120096351KnappSSundströmMRecently targeted kinases and their inhibitors-the path to clinical trialsCurr. Opin. Pharmacol.201417C586310.1016/j.coph.2014.07.015FabbroDCowan-JacobSWMöbitzHMartiny-BaronGTargeting cancer with small-molecular-weight kinase inhibitorsMethods Mol. Biol.201279513410.1007/978-1-61779-337-0_121960212KlaegerSThe target landscape of clinical kinase drugsScience2017358eaan436810.1126/science.aan436829191878Roskoski, R. Properties of FDA-approved small molecule protein kinase inhibitors. Pharmacol. Res.10.1016/j.phrs.2019.03.006 (2019).ForceTKolajaKLCardiotoxicity of kinase inhibitors: the prediction and translation of preclinical models to clinical outcomesNat. Rev. Drug Discov.2011101112610.1038/nrd325221283106ChuTFCardiotoxicity associated with tyrosine kinase inhibitor sunitinibLancet20073702011201910.1016/S0140-6736(07)61865-018083403OrphanosGSIoannidisGNArdavanisAGCardiotoxicity induced by tyrosine kinase inhibitorsActa Oncol.20094896497010.1080/0284186090322912419734999MoslehiJJCardiovascular toxic effects of targeted cancer therapiesN. Engl. J. Med.20163751457146710.1056/NEJMra110026527732808ForceTKerkeläRCardiotoxicity of the new cancer therapeutics—mechanisms of, and approaches to, the problemDrug Discov. Today2008137788410.1016/j.drudis.2008.05.01118617014DavisMIComprehensive analysis of kinase inhibitor selectivityNat. Biotechnol.20112910465110.1038/nbt.199022037378ElkinsJMComprehensive characterization of the Published Kinase Inhibitor SetNat. 
Biotechnol.2016349510310.1038/nbt.337426501955HasinoffBBPatelDThe lack of target specificity of small molecule anticancer kinase inhibitors is correlated with their ability to damage myocytes in vitroToxicol. Appl. Pharmacol.201024913213910.1016/j.taap.2010.08.02620832415WillYEffect of the multitargeted tyrosine kinase inhibitors imatinib, dasatinib, sunitinib, and sorafenib on mitochondrial function in isolated rat heart mitochondria and H9c2 cellsToxicol. Sci.200810615316110.1093/toxsci/kfn15718664550KerkeläRCardiotoxicity of the cancer therapeutic agent imatinib mesylateNat. Med.20061290891610.1038/nm144616862153DohertyKRMulti-parameter in vitro toxicity testing of crizotinib, sunitinib, erlotinib, and nilotinib in human cardiomyocytesToxicol. Appl. Pharmacol.20132722455510.1016/j.taap.2013.04.02723707608ForceTKrauseDSVan EttenRAMolecular mechanisms of cardiotoxicity of tyrosine kinase inhibitionNat. Rev. Cancer2007733234410.1038/nrc210617457301BaiJPFAbernethyDRSystems pharmacology to predict drug toxicity: integration across levels of biological organizationAnnu. Rev. Pharmacol. Toxicol.2013534517310.1146/annurev-pharmtox-011112-14024823140241BergerSIIyengarRRole of systems pharmacology in understanding drug adverse eventsWiley Interdiscip. Rev.20113129135BergerSIMa’ayanAIyengarRSystems pharmacology of arrhythmiasSci. Signal.20103ra3020407125ZhaoSSystems pharmacology of adverse event mitigation by drug combinationsSci. Transl. Med.20135206ra14010.1126/scitranslmed.3006548XiongYA comparison of mRNA sequencing with random primed and 3′-directed librariesSci. Rep.201771462610.1038/s41598-017-14892-x29116112LonsdaleJThe genotype-tissue expression (GTEx) projectNat. 
Genet.20134558058510.1038/ng.265323715323ZouHHastieTRegularization and variable selection via the elastic net.Journal of the Royal Statistical Society20056730132010.1111/j.1467-9868.2005.00503.xGiulianottiMAWelmakerGSHoughtenRAShifting from the single to the multitarget paradigm in drug discoveryDrug Discov. Today20131849550110.1016/j.drudis.2013.01.00823340113UngP. M.-U.RahmanRSchlessingerARedefining the protein kinase conformational space with machine learningCell Chem. Biol.201825916924.e210.1016/j.chembiol.2018.05.00229861272RahmanRUngPM-USchlessingerAKinaMetrix: a web resource to investigate kinase conformations and inhibitor spaceNucleic Acids Res.201947D361D36610.1093/nar/gky91630321373DarACShokatKMThe evolution of protein kinase inhibitors from antagonists to agonists of cellular signalingAnnu. Rev. Biochem.20118076979510.1146/annurev-biochem-090308-17365621548788ZhangTHatcherJMTengMGrayNSKosticMRecent advances in selective and irreversible covalent ligand development and validationCell Chem. Biol.2019261486150010.1016/j.chembiol.2019.09.01231631011SchnellDPharmacokinetics of afatinib in subjects with mild or moderate hepatic impairmentCancer Chemother. Pharm.20147426727510.1007/s00280-014-2484-yBurridgePWHuman induced pluripotent stem cell-derived cardiomyocytes recapitulate the predilection of breast cancer patients to doxorubicin-induced cardiotoxicityNat. Med.2016225475610.1038/nm.408727089514Soumillon, M., Cacchiarelli, D., Semrau, S., van Oudenaarden, A. & Mikkelsen, T. S. Characterization of directed differentiation by high-throughput single-cell RNA-Seq. Preprint at https://www.biorxiv.org/content/10.1101/003236v1 (2014).KiviojaTCounting absolute numbers of molecules using unique molecular identifiersNat. 
Methods20119727410.1038/nmeth.177822101854DobinASTAR: ultrafast universal RNA-seq alignerBioinformatics201329152110.1093/bioinformatics/bts63523104886RobinsonMDMcCarthyDJSmythGKedgeR: a Bioconductor package for differential expression analysis of digital gene expression dataBioinformatics2010261394010.1093/bioinformatics/btp61619910308SarangdharMData mining differential clinical outcomes associated with drug regimens using adverse event reporting dataNat. Biotechnol.20163469770010.1038/nbt.362327404875BrownEGWoodLWoodSThe Medical Dictionary for Regulatory Activities (MedDRA)Drug Saf.19992010911710.2165/00002018-199920020-0000210082069ClarkeDJBEXpression2Kinases (X2K) Web: linking expression signatures to upstream cell signaling networksNucleic Acids Res.201846W171W17910.1093/nar/gky45829800326RDKit. http://www.rdkit.org/.RogersDHahnMExtended-connectivity fingerprintsJ. Chem. Inf. Model.20105074275410.1021/ci100050t20426451Daylight. https://www.daylight.com/.DurantJLLelandBAHenryDRNourseJGReoptimization of MDL keys for use in drug discoveryJ. Chem. Inf. Comput. Sci.2002421273128010.1021/ci010132r12444722ChemAxon - Software Solutions and Services for Chemistry & Biology. https://chemaxon.com/.
diff --git a/jcore-pmc-reader/LICENSE b/jcore-pmc-reader/LICENSE index fbbd41e05..d0f946a29 100644 --- a/jcore-pmc-reader/LICENSE +++ b/jcore-pmc-reader/LICENSE @@ -1,6 +1,6 @@ BSD 2-Clause License -Copyright (c) 2017, JULIE Lab +Copyright (c) 2022, JULIE Lab All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java index d3b402b36..5eedd46fa 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/CasPopulator.java @@ -7,6 +7,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; +import java.io.InputStream; import java.net.URI; import java.util.Iterator; @@ -22,6 +23,10 @@ public CasPopulator(Iterator nxmlIterator, Boolean omitBibReferences) throw nxmlDocumentParser.loadElementPropertyFile(settings); } + public CasPopulator(Boolean omitBibReferences) throws IOException { + this(null, omitBibReferences); + } + public void populateCas(URI nxmlUri, JCas cas) throws ElementParsingException, NoDataAvailableException { ElementParsingResult result = null; URI currentUri = nxmlUri; @@ -44,6 +49,18 @@ public void populateCas(URI nxmlUri, JCas cas) throws ElementParsingException, N cas.setDocumentText(sb.toString()); } + public void populateCas(InputStream is, JCas cas) throws ElementParsingException, NoDataAvailableException { + ElementParsingResult result; + try { + nxmlDocumentParser.reset(is, cas); + result = nxmlDocumentParser.parse(); + } catch (DocumentParsingException e) { + throw new NoDataAvailableException(e); + } + StringBuilder sb = populateCas(result, new StringBuilder()); + cas.setDocumentText(sb.toString()); + } + /** * This is the actual method that reads the parsing results, created the CAS document text and adds * the annotations from the parsing 
results. diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java index 7aa245057..4aec4f9a2 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java @@ -22,6 +22,11 @@ import static java.nio.charset.StandardCharsets.UTF_8; +/** + * Searches over directories and, optionally, the contents of ZIP archives for files with an (n)xml extension. + * Returns URIs that either point to single files or to entries into ZIP archives. Both can equally be accessed via + * "uri.toURL().openStream()" which is done in the NxmlDocumentParser. + */ public class NXMLURIIterator implements Iterator { private final static Logger log = LoggerFactory.getLogger(NXMLURIIterator.class); private final static Logger logFileSearch = LoggerFactory.getLogger(NXMLURIIterator.class.getCanonicalName() + ".FileSearch"); @@ -48,7 +53,7 @@ public boolean hasNext() { // The beginning: The currentDirectory is null and we start at // the given path (which actually might be a single file to // read). 
- log.debug("Starting background thread to search for PMC (.nxml) files at {}", basePath); + log.debug("Starting background thread to search for PMC (.xml) files at {}", basePath); CompletableFuture.runAsync(() -> setFilesAndSubDirectories(basePath, false)); fileSearchRunning = true; } @@ -78,7 +83,7 @@ private void setFilesAndSubDirectories(File directory, boolean recursiveCall) { if ((searchRecursively || directory.equals(basePath)) && !isZipFile(directory)) { logFileSearch.debug("Identified {} as a directory, reading files and subdirectories", directory); // set the files in the directory - for (File file : directory.listFiles(f -> f.isFile() && f.getName().contains(".nxml") && !isZipFile(f) && isInWhitelist(f))) { + for (File file : directory.listFiles(f -> f.isFile() && f.getName().endsWith("xml") && !isZipFile(f) && isInWhitelist(f))) { URI toURI = file.toURI(); try { uris.put(toURI); @@ -101,7 +106,7 @@ private void setFilesAndSubDirectories(File directory, boolean recursiveCall) { while (entries.hasMoreElements()) { final ZipEntry e = entries.nextElement(); if (!e.isDirectory() && e.getName().contains(".nxml") && isInWhitelist(new File(e.getName()))) { - final String urlStr = "jar:" + directory.toURI().toString() + "!/" + e.getName(); + final String urlStr = "jar:" + directory.toURI() + "!/" + e.getName(); int exclamationIndex = urlStr.indexOf('!'); final String urlEncodedStr = urlStr.substring(0, exclamationIndex + 2) + Stream.of(urlStr.substring(exclamationIndex + 2).split("/")).map(x -> URLEncoder.encode(x, UTF_8)).collect(Collectors.joining("/")); URL url = new URL(urlEncodedStr); diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java index 9f75ba8db..5285ee138 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java +++ 
b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java @@ -110,7 +110,7 @@ else if (docType.contains("JATS")) { return; } } - throw new DocTypeNotFoundException("Could not find a doctype."); + throw new DocTypeNotFoundException("Could not find a known doctype."); } private void setupParserRegistry() { diff --git a/jcore-pmc-reader/src/main/resources/LICENSE.txt b/jcore-pmc-reader/src/main/resources/LICENSE.txt index fbbd41e05..d0f946a29 100644 --- a/jcore-pmc-reader/src/main/resources/LICENSE.txt +++ b/jcore-pmc-reader/src/main/resources/LICENSE.txt @@ -1,6 +1,6 @@ BSD 2-Clause License -Copyright (c) 2017, JULIE Lab +Copyright (c) 2022, JULIE Lab All rights reserved. Redistribution and use in source and binary forms, with or without From 8c7c12010ec19ce3b71b2580ae70f73663e83b00 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 14 Feb 2022 17:19:04 +0100 Subject: [PATCH 121/269] Fix the setDBProcessingMetaData method to actually return the PK string. 
--- .../julielab/jcore/reader/db/DBMultiplierReader.java | 7 ++++--- .../java/de/julielab/jcore/reader/db/DBReader.java | 11 +++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplierReader.java b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplierReader.java index bfe474de8..992c64a00 100644 --- a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplierReader.java +++ b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplierReader.java @@ -19,6 +19,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -65,7 +66,7 @@ public void initialize(UimaContext context) throws ResourceInitializationExcepti } @Override - public void getNext(JCas jCas) throws CollectionException { + public void getNext(JCas jCas) throws CollectionException, IOException { log.trace("Requesting next batch of document IDs from the database."); List idList = getNextDocumentIdBatch(); if (idList.isEmpty()) @@ -119,7 +120,7 @@ public void getNext(JCas jCas) throws CollectionException { * * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#hasNext() */ - public boolean hasNext() { + public boolean hasNext() throws IOException, CollectionException { boolean hasNext = this.hasNext; if (retriever != null) hasNext = !retriever.getDocumentIds().isEmpty(); @@ -187,7 +188,7 @@ public Progress[] getProgress() { } @Override - public void close() { + public void close() throws IOException { if (dbc != null) dbc.close(); dbc = null; diff --git a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBReader.java b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBReader.java index 5a21db4be..e580fa2fa 100644 --- a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBReader.java +++ 
b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBReader.java @@ -118,17 +118,20 @@ public abstract class DBReader extends DBSubsetReader { private DBCIterator xmlBytes; public static String setDBProcessingMetaData(DataBaseConnector dbc, boolean readDataTable, String tableName, byte[][] data, JCas cas) { - String pkString = null; // remove previously added dbMetaData JCasUtil.select(cas, DBProcessingMetaData.class).forEach(x -> x.removeFromIndexes()); DBProcessingMetaData dbMetaData = new DBProcessingMetaData(cas); List pkIndices = dbc.getPrimaryKeyIndices(); StringArray pkArray = new StringArray(cas, pkIndices.size()); + StringBuilder pkBuilder = new StringBuilder(); for (int i = 0; i < pkIndices.size(); ++i) { Integer index = pkIndices.get(i); String pkElementValue = new String(data[index], Charset.forName("UTF-8")); pkArray.set(i, pkElementValue); + pkBuilder.append(pkElementValue); + if (i < pkIndices.size() - 1) + pkBuilder.append(","); } if (log.isDebugEnabled()) log.trace("Setting primary key for DBProcessingMetaData to {}", Arrays.toString(pkArray.toArray())); @@ -142,10 +145,9 @@ public static String setDBProcessingMetaData(DataBaseConnector dbc, boolean read } else { log.trace("Not setting the subset to DBProcessingMetaData because reading the data table is set to {}", readDataTable); } - - dbMetaData.addToIndexes(); - return pkString; + + return pkBuilder.toString(); } @Override @@ -257,6 +259,7 @@ public void close() { * pipeline status field */ protected abstract String getReaderComponentName(); + /** *

* This class is charged with retrieving batches of document IDs and documents while previously fetched documents From 11b406bd5d6fc6b6d0b0edd140bf1ff936b2c78d Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 14 Feb 2022 17:19:36 +0100 Subject: [PATCH 122/269] Fix a typo in the UIMA type imports. --- .../jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-event-flattener-ae/src/main/resources/de/julielab/jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml b/jcore-event-flattener-ae/src/main/resources/de/julielab/jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml index 7e3a1f520..296872b61 100644 --- a/jcore-event-flattener-ae/src/main/resources/de/julielab/jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml +++ b/jcore-event-flattener-ae/src/main/resources/de/julielab/jcore/ae/eventflattener/desc/jcore-event-flattener-ae.xml @@ -12,7 +12,7 @@ - + From 473d4ac43a0b137bde4ffa08c93aa6473b39500e Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 14 Feb 2022 17:20:02 +0100 Subject: [PATCH 123/269] Add the PMCDBReader as a Maven module. --- .gitignore | 1 + pom.xml | 321 ++++++++++++++++++++++++++++++++++------------------- 2 files changed, 209 insertions(+), 113 deletions(-) diff --git a/.gitignore b/.gitignore index 247d87c61..6da01ef44 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ target **/*.iml /julie-xml-tools.jar +/jcore-pmc-db-reader/src/test/resources/hiddenConfig diff --git a/pom.xml b/pom.xml index 6db724ae5..e4a6477fa 100644 --- a/pom.xml +++ b/pom.xml @@ -1,228 +1,323 @@ - + + 4.0.0 - + + - + + de.julielab - + + jcore-parent - + + 2.5.2-SNAPSHOT - + + - + + jcore-base - + + pom - + + JCoRe Base - + + The POM for the JCoRe Base projects. 
- + + 2.6.0-SNAPSHOT - + + - + + JULIE Lab, Germany - + + http://www.julielab.de - + + - + + - + + - + + BSD-2-Clause - + + https://opensource.org/licenses/BSD-2-Clause - + + - + + - + + https://github.com/JULIELab/jcore-base - + + - + + - + + org.apache.uima - + + uimaj-core - + + ${uima-version} - + + - + + - + + org.apache.uima - + + uimafit-core - + + ${uimafit-version} - + + - + + - + + - + jcore-annotation-adder-ae - + jcore-ace-reader - + + jcore-acronym-ae - + jcore-acronym-writer - + + jcore-banner-ae - + jcore-bc2gm-reader - + jcore-bc2gmformat-writer - + jcore-biolemmatizer-ae - + + jcore-bionlpformat-consumer - + + jcore-bionlpformat-reader - + + jcore-biosem-ae - + + jcore-conll-consumer - + + jcore-coordination-baseline-ae - + jcore-cord19-reader - + jcore-coreference-writer - + jcore-ct-reader - + jcore-db-checkpoint-ae - + jcore-descriptor-creator - + jcore-dta-reader - + + jcore-ec-code-ae - + + jcore-elasticsearch-consumer - + + jcore-embedding-writer - + + jcore-event-flattener-ae - + + jcore-feature-value-replacement-ae - + + jcore-file-reader - + + jcore-flair-ner-ae - + jcore-flair-token-embedding-ae - + jcore-flow-controllers - + + jcore-iexml-consumer - + + jcore-iexml-reader - + + jcore-ign-reader - + + jcore-iob-consumer - + + jcore-jnet-ae - + + jcore-jpos-ae - + + jcore-jsbd-ae - + + jcore-jtbd-ae - + + jcore-julielab-entity-evaluator-consumer - + + jcore-likelihood-assignment-ae - + + jcore-likelihood-detection-ae - + jcore-line-multiplier - + jcore-lingpipegazetteer-ae - + + jcore-lingpipe-porterstemmer-ae - + + jcore-lingscope-ae - + + jcore-linnaeus-species-ae - + + jcore-mantra-xml-types - + + jcore-medxn-ae - + + jcore-msdoc-reader - + + jcore-mstparser-ae - + + jcore-muc7-reader - + + jcore-mutationfinder-ae - + jcore-neo4j-relations-consumer - + + jcore-opennlp-chunk-ae - + + jcore-opennlp-parser-ae - + + jcore-opennlp-postag-ae - + + jcore-opennlp-sentence-ae - + + jcore-opennlp-token-ae - + jcore-ppd-writer - + jcore-pmc-reader 
- + + jcore-pubtator-reader - + + jcore-stanford-lemmatizer-ae - + + jcore-topic-indexing-ae - + + jcore-topics-writer - + + jcore-txt-consumer - + + jcore-types - + + jcore-utilities - + + jcore-xml-mapper - + + jcore-xml-reader - + + jcore-xmi-reader - + + jcore-xmi-writer - + + jedis-parent - jcore-jedis-integration-tests - - + + jcore-jedis-integration-tests + + jcore-pmc-db-reader + + + - + + scm:git:https://github.com/JULIELab/jcore-base - + + scm:git:https://github.com/JULIELab/jcore-base - + + scm:git:https://github.com/JULIELab/jcore-base - - + + + + From 8bcd76fda0576841b26c7fc81317e37cf5ccc4e9 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 14 Feb 2022 17:21:00 +0100 Subject: [PATCH 124/269] Formatting. --- .../jcore/ae/flairner/FlairNerAnnotator.java | 42 ++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java index de2382319..04d65d3cf 100644 --- a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java +++ b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java @@ -63,9 +63,9 @@ public class FlairNerAnnotator extends JCasAnnotator_ImplBase { private String pythonExecutable; @ConfigurationParameter(name = PARAM_STORE_EMBEDDINGS, mandatory = false, description = "Optional. Possible values: ALL, ENTITIES, NONE. The FLAIR SequenceTagger first computes the embeddings for each sentence and uses those as input for the actual NER algorithm. By default, the embeddings are not stored. By setting this parameter to ALL, the embeddings of all tokens of the sentence are retrieved from flair and stored in the embeddingVectors feature of each token. 
Setting the parameter to ENTITIES will restrict the embedding storage to those tokens which overlap with an entity recognized by FLAIR.") private StoreEmbeddings storeEmbeddings; - @ConfigurationParameter(name = PARAM_GPU_NUM, mandatory = false, defaultValue="0", description = "Specifies the GPU device number to be used for FLAIR. This setting can be overwritten by the Java system property 'flairner.device'.") + @ConfigurationParameter(name = PARAM_GPU_NUM, mandatory = false, defaultValue = "0", description = "Specifies the GPU device number to be used for FLAIR. This setting can be overwritten by the Java system property 'flairner.device'.") private int gpuNum; - @ConfigurationParameter(name=PARAM_COMPONENT_ID, mandatory = false, description = "Specifies the componentId feature value given to the created annotations. Defaults to 'FlairNerAnnotator'.") + @ConfigurationParameter(name = PARAM_COMPONENT_ID, mandatory = false, description = "Specifies the componentId feature value given to the created annotations. 
Defaults to 'FlairNerAnnotator'.") private String componentId; private AnnotationAdderConfiguration adderConfig; @@ -78,7 +78,7 @@ public void initialize(final UimaContext aContext) throws ResourceInitialization entityClass = (String) aContext.getConfigParameterValue(PARAM_ANNOTATION_TYPE); flairModel = (String) aContext.getConfigParameterValue(PARAM_FLAIR_MODEL); storeEmbeddings = StoreEmbeddings.valueOf(Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_STORE_EMBEDDINGS)).orElse(StoreEmbeddings.NONE.name())); - gpuNum = Optional.ofNullable((Integer)aContext.getConfigParameterValue(PARAM_GPU_NUM)).orElse(0); + gpuNum = Optional.ofNullable((Integer) aContext.getConfigParameterValue(PARAM_GPU_NUM)).orElse(0); componentId = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_COMPONENT_ID)).orElse(getClass().getSimpleName()); if (System.getProperty(GPU_NUM_SYS_PROP) != null) { try { @@ -157,21 +157,21 @@ public void initialize(final UimaContext aContext) throws ResourceInitialization */ @Override public void process(final JCas aJCas) throws AnalysisEngineProcessException { - int i = 0; - final AnnotationIndex sentIndex = aJCas.getAnnotationIndex(Sentence.class); - Map sentenceMap = new HashMap<>(); - for (Sentence sentence : sentIndex) { - if (sentence.getId() == null) - sentence.setId("s" + i++); - sentenceMap.put(sentence.getId(), sentence); - } - if ( log.isDebugEnabled()) { - if (sentenceMap.isEmpty()) - log.debug("Document {} does not have any sentences.", JCoReTools.getDocId(aJCas)); - if (!aJCas.getAnnotationIndex(Token.class).iterator().hasNext()) - log.debug("Document {} does not have any tokens", JCoReTools.getDocId(aJCas)); - } try { + int i = 0; + final AnnotationIndex sentIndex = aJCas.getAnnotationIndex(Sentence.class); + Map sentenceMap = new HashMap<>(); + for (Sentence sentence : sentIndex) { + if (sentence.getId() == null) + sentence.setId("s" + i++); + sentenceMap.put(sentence.getId(), sentence); + } + if 
(log.isDebugEnabled()) { + if (sentenceMap.isEmpty()) + log.debug("Document {} does not have any sentences.", JCoReTools.getDocId(aJCas)); + if (!aJCas.getAnnotationIndex(Token.class).iterator().hasNext()) + log.debug("Document {} does not have any tokens", JCoReTools.getDocId(aJCas)); + } JCoReOverlapAnnotationIndex intRefIndex = new JCoReOverlapAnnotationIndex<>(aJCas, InternalReference.type); final AnnotationAdderHelper helper = new AnnotationAdderHelper(); log.trace("Sending document sentences to flair for entity tagging."); @@ -206,6 +206,9 @@ public void process(final JCas aJCas) throws AnalysisEngineProcessException { final String docId = JCoReTools.getDocId(aJCas); log.error("Could not set the offsets of an annotation in document {}", docId); throw new AnalysisEngineProcessException(e); + } catch (Throwable t) { + log.error("Error in {}", this.getClass().getSimpleName(), t); + throw new AnalysisEngineProcessException(t); } } @@ -213,7 +216,7 @@ private void addTokenEmbeddings(JCas aJCas, Map sentenceMap, A final List tokenEmbeddings = taggingResponse.getTokenEmbeddings(); JCoReTreeMapAnnotationIndex tokenIndex = null; if (!tokenEmbeddings.isEmpty()) - tokenIndex = new JCoReTreeMapAnnotationIndex<>(Comparators.longOverlapComparator(),TermGenerators.longOffsetTermGenerator(), TermGenerators.longOffsetTermGenerator(), aJCas, Token.type); + tokenIndex = new JCoReTreeMapAnnotationIndex<>(Comparators.longOverlapComparator(), TermGenerators.longOffsetTermGenerator(), TermGenerators.longOffsetTermGenerator(), aJCas, Token.type); Map> originalTokenEmbeddings = new HashMap<>(); for (TokenEmbedding tokenEmbedding : tokenEmbeddings) { final Sentence sentence = sentenceMap.get(tokenEmbedding.getSentenceId()); @@ -262,7 +265,8 @@ private void addTokenEmbeddings(JCas aJCas, Map sentenceMap, A /** * Internal references can actually look like a part of a gene, e.g. "filament19" where "19" is a reference. * Exclude those spans from the gene mentions. 
- * @param a The gene annotation. + * + * @param a The gene annotation. * @param intRefIndex The reference index. */ private void excludeReferenceAnnotationSpans(Annotation a, JCoReOverlapAnnotationIndex intRefIndex) { From a7576266641b46c46d909f067d4a1bf73d623cf1 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 14 Feb 2022 17:21:17 +0100 Subject: [PATCH 125/269] Typo. --- .../annotationdefined/AnnotationDefinedFlowController.java | 4 ++-- .../desc/jcore-annotation-defined-flowcontroller.xml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowController.java b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowController.java index 4158059a3..c6c016e45 100644 --- a/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowController.java +++ b/jcore-flow-controllers/src/main/java/de/julielab/jcore/flow/annotationdefined/AnnotationDefinedFlowController.java @@ -10,10 +10,10 @@ /** *

Routes CASes through an aggregate analysis engine according to the {@link ToVisit} annotation present in the CAS.

- *

If there is not ToVisit annotation, the default (fixed) flow will be used. Thus, the fixed flow constraint + *

If there is no ToVisit annotation, the default (fixed) flow will be used. Thus, the fixed flow constraint * must be set on the aggregate engine.

*/ -@ResourceMetaData(name = "JCoRe Annotation Defined Flow Controller", description = "This flow controller relies on an annotation of type ToVisit to be present in the CAS. If there is no such annotation, the default fixed flow of the aggregate engine using this flow controller is used. Otherwise, die names of the components to pass the CAS to are taken from the annotation. If the annotation exists but defines to components to be visited by the CAS, no components are visited at all.", vendor = "JULIE Lab, Germany", version = "placeholder") +@ResourceMetaData(name = "JCoRe Annotation Defined Flow Controller", description = "This flow controller relies on an annotation of type ToVisit to be present in the CAS. If there is no such annotation, the default fixed flow of the aggregate engine using this flow controller is used. Otherwise, the names of the components to pass the CAS to are taken from the annotation. If the annotation exists but defines to components to be visited by the CAS, no components are visited at all.", vendor = "JULIE Lab, Germany", version = "placeholder") public class AnnotationDefinedFlowController extends JCasFlowController_ImplBase { @Override public Flow computeFlow(JCas jCas) throws AnalysisEngineProcessException { diff --git a/jcore-flow-controllers/src/main/resources/de/julielab/jcore/flow/annotationdefined/desc/jcore-annotation-defined-flowcontroller.xml b/jcore-flow-controllers/src/main/resources/de/julielab/jcore/flow/annotationdefined/desc/jcore-annotation-defined-flowcontroller.xml index 2babe5cd5..78ea9b35d 100644 --- a/jcore-flow-controllers/src/main/resources/de/julielab/jcore/flow/annotationdefined/desc/jcore-annotation-defined-flowcontroller.xml +++ b/jcore-flow-controllers/src/main/resources/de/julielab/jcore/flow/annotationdefined/desc/jcore-annotation-defined-flowcontroller.xml @@ -4,7 +4,7 @@ de.julielab.jcore.flow.annotationdefined.AnnotationDefinedFlowController JCoRe Annotation Defined Flow Controller - This flow 
controller relies on an annotation of type ToVisit to be present in the CAS. If there is no such annotation, the default fixed flow of the aggregate engine using this flow controller is used. Otherwise, die names of the components to pass the CAS to are taken from the annotation. If the annotation exists but defines to components to be visited by the CAS, no components are visited at all. + This flow controller relies on an annotation of type ToVisit to be present in the CAS. If there is no such annotation, the default fixed flow of the aggregate engine using this flow controller is used. Otherwise, the names of the components to pass the CAS to are taken from the annotation. If the annotation exists but defines to components to be visited by the CAS, no components are visited at all. placeholder JULIE Lab, Germany From 9794d62b40cead5c7426be8f03e4a4fc7b2b8db5 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 14 Feb 2022 17:21:35 +0100 Subject: [PATCH 126/269] Formatting, comment correction. --- .../julielab/jcore/reader/xml/XMLDBMultiplier.java | 14 ++++++++------ .../jcore/reader/xml/XMLDBMultiplierTest.java | 2 -- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java index 03c2b1160..ae7de1cef 100644 --- a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java +++ b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java @@ -62,7 +62,7 @@ public class XMLDBMultiplier extends DBMultiplier { @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT, mandatory = false, description = "For use with AnnotationDefinedFlowController. String parameter indicating the name of the " + "table where the XMI data and, thus, the hash is stored. The name must be schema qualified. 
Note that in this component, only the ToVisit annotation is created that determines which components to apply to a CAS with matching (unchanged) hash. The logic to actually control the CAS flow is contained in the AnnotationDefinedFlowController.") private String xmiStorageDataTable; - @ConfigurationParameter(name= PARAM_TABLE_DOCUMENT_SCHEMA, mandatory = false, description = "For use with AnnotationDefinedFlowController. The name of the schema that the document table - given with the "+PARAM_TABLE_DOCUMENT+" parameter - adheres to. Only the primary key part is required for hash value retrieval.") + @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT_SCHEMA, mandatory = false, description = "For use with AnnotationDefinedFlowController. The name of the schema that the document table - given with the " + PARAM_TABLE_DOCUMENT + " parameter - adheres to. Only the primary key part is required for hash value retrieval.") private String xmiStorageDataTableSchema; @ConfigurationParameter(name = PARAM_TO_VISIT_KEYS, mandatory = false, description = "For use with AnnotationDefinedFlowController. The delegate AE keys of the AEs this CAS should still applied on although the hash has not changed. Can be null or empty indicating that no component should be applied to the CAS. This is, however, the task of the AnnotationDefinedFlowController.") private String[] toVisitKeys; @@ -147,7 +147,7 @@ private void setToVisitAnnotation(JCas jCas) { String newHash = getHash(jCas); if (existingHash.equals(newHash)) { if (log.isTraceEnabled()) - log.trace("Document {} has a document text hash that equals the one present in the database. Creating a ToVisit annotation routing it only to the components with delegate keys {}.", pkString, toVisitKeys); + log.trace("Document {} has a document text hash that equals the one present in the database. 
Creating a ToVisit annotation routing it only to the components with delegate keys {}.", pkString, toVisitKeys); ToVisit toVisit = new ToVisit(jCas); if (toVisitKeys != null && toVisitKeys.length != 0) { StringArray keysArray = new StringArray(jCas, toVisitKeys.length); @@ -156,6 +156,8 @@ private void setToVisitAnnotation(JCas jCas) { } toVisit.addToIndexes(); } + } else { + log.trace("No existing hash was found for document {}", pkString); } } } @@ -212,10 +214,10 @@ private Map fetchCurrentHashesFromDatabase(RowBatch rowBatch) th while (rs.next()) { StringBuilder pkSb = new StringBuilder(); for (int i = 0; i < xmiTableSchema.getPrimaryKey().length; i++) - pkSb.append(rs.getString(i+1)).append(','); - // Remove training comma - pkSb.deleteCharAt(pkSb.length()-1); - String hash = rs.getString(xmiTableSchema.getPrimaryKey().length+1); + pkSb.append(rs.getString(i + 1)).append(','); + // Remove trailing comma + pkSb.deleteCharAt(pkSb.length() - 1); + String hash = rs.getString(xmiTableSchema.getPrimaryKey().length + 1); id2hash.put(pkSb.toString(), hash); } } catch (SQLException e) { diff --git a/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java b/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java index 3e2cd9f79..86009735d 100644 --- a/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java +++ b/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java @@ -95,7 +95,6 @@ private static void prepareSourceXMLTable(DataBaseConnector dbc, CoStoSysConnect } private static void prepareTargetXMITable(DataBaseConnector dbc, CoStoSysConnection conn) throws SQLException { - // Note that the root is "xmi" and not "xml" String documentTextFmt = "This is document text number %d"; dbc.createTable(TARGET_XMI_TABLE, "xmi_text", "Test table for hash comparison test."); dbc.assureColumnsExist(TARGET_XMI_TABLE, List.of(HASH_FIELD_NAME), "text"); @@ 
-138,7 +137,6 @@ public void testMultiplier() throws Exception { while (jCasIterator.hasNext()) { JCas newCas = jCasIterator.next(); documentTexts.add(newCas.getDocumentText()); - System.out.println(newCas.getDocumentText()); newCas.release(); } assertThat(documentTexts).containsExactly("This is document text number 0", "This is document text number 1", "This is document text number 2", "This is document text number 3", "This is document text number 4", "This is document text number 5", "This is document text number 6", "This is document text number 7", "This is document text number 8", "This is document text number 9"); From 296da6b944d0fec542111ff8994e48f36f13025c Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 14 Feb 2022 17:22:02 +0100 Subject: [PATCH 127/269] Remove unused throw directives. --- .../multiplier/pmc/PMCDBMultiplierHashComparisonTest.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierHashComparisonTest.java b/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierHashComparisonTest.java index a4f02e11a..a36155dfa 100644 --- a/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierHashComparisonTest.java +++ b/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplierHashComparisonTest.java @@ -27,7 +27,6 @@ import org.testcontainers.containers.PostgreSQLContainer; import java.io.File; -import java.io.IOException; import java.nio.file.Path; import java.sql.PreparedStatement; import java.sql.SQLException; @@ -57,7 +56,7 @@ public class PMCDBMultiplierHashComparisonTest { private static String costosysConfig; @BeforeAll - public static void setup() throws SQLException, UIMAException, IOException, ConfigurationException { + public static void setup() throws SQLException, ConfigurationException { postgres.start(); DBTestUtils.createAndSetHiddenConfig(Path.of("src", 
"test", "resources", "hiddenConfig").toString(), postgres); From 961b884e34e471ee787b91815d9eb8bfe6a4ec78 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 14 Feb 2022 17:53:00 +0100 Subject: [PATCH 128/269] Fix a bug where .nxml files were not read any more. Introduced in the course of adapting to the new PMC bulk download format. All files in there have a plain .xml extension instead of the old .nxml. --- .../java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java index 4aec4f9a2..1a4010576 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/NXMLURIIterator.java @@ -83,7 +83,7 @@ private void setFilesAndSubDirectories(File directory, boolean recursiveCall) { if ((searchRecursively || directory.equals(basePath)) && !isZipFile(directory)) { logFileSearch.debug("Identified {} as a directory, reading files and subdirectories", directory); // set the files in the directory - for (File file : directory.listFiles(f -> f.isFile() && f.getName().endsWith("xml") && !isZipFile(f) && isInWhitelist(f))) { + for (File file : directory.listFiles(f -> f.isFile() && (f.getName().contains(".xml") || f.getName().contains(".nxml")) && !isZipFile(f) && isInWhitelist(f))) { URI toURI = file.toURI(); try { uris.put(toURI); @@ -105,7 +105,7 @@ private void setFilesAndSubDirectories(File directory, boolean recursiveCall) { int numEntries = 0; while (entries.hasMoreElements()) { final ZipEntry e = entries.nextElement(); - if (!e.isDirectory() && e.getName().contains(".nxml") && isInWhitelist(new File(e.getName()))) { + if (!e.isDirectory() && (e.getName().contains(".xml") || e.getName().contains(".nxml")) && isInWhitelist(new File(e.getName()))) { final 
String urlStr = "jar:" + directory.toURI() + "!/" + e.getName(); int exclamationIndex = urlStr.indexOf('!'); final String urlEncodedStr = urlStr.substring(0, exclamationIndex + 2) + Stream.of(urlStr.substring(exclamationIndex + 2).split("/")).map(x -> URLEncoder.encode(x, UTF_8)).collect(Collectors.joining("/")); From 2c80ed2c209587f3e4923e6e4aa79f0cbdc636e6 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 14 Feb 2022 17:58:07 +0100 Subject: [PATCH 129/269] Add a PMC DB multiplier reader descriptor. Resolves #128. The descriptor is just a rebranding for the DBMultiplierReader. It's the same basic JeDIS process of reading data from the database, marking them as being in process etc. --- .../desc/jcore-pmc-db-multiplier-reader.xml | 191 ++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier-reader.xml diff --git a/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier-reader.xml b/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier-reader.xml new file mode 100644 index 000000000..7cbc31dcf --- /dev/null +++ b/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier-reader.xml @@ -0,0 +1,191 @@ + + + org.apache.uima.java + de.julielab.jcore.reader.db.DBMultiplierReader + + JCoRe Database PMC Multiplier Reader + A collection reader that receives the IDs of documents from a database table. Additional tables may + be specified which will, together with the IDs, be sent to a CAS multiplier extending the DBMultiplierReader. + The multiplier will read documents and the joined additional tables according to the list of document IDs + sent by this reader. The component leverages the corpus storage system (CoStoSys) for this purpose and is + part of the Jena Document Information System, JeDIS. 
+ + 2.6.0-SNAPSHOT + JULIE Lab Jena, Germany + + + ResetTable + If set to true and the parameter 'Table' is set to a subset table, the subset table will be + reset atthe initialization of the reader to be ready for processing of the whole subset. Do not use + when multiple readers read the same subset table. + + Boolean + false + false + + + Timestamp + PostgreSQL timestamp expression that is evaluated against the data table. The data table + schema, which must be the active data table schema in the CoStoSys configuration as always, must + specify a single timestamp field for this parameter to work. Only data rows with a timestamp value + larger than the given timestamp expression will be processed. Note that when reading from a subset + table, there may be subset rows indicated to be in process which are finally not read from the data + table. This is an implementational shortcoming and might be addressed if respective feature requests + are given through the JULIE Lab GitHub page or JCoRe issues. + + String + false + false + + + FetchIdsProactively + If set to true and when reading from a subset table, batches of document IDs will be + retrieved in a background thread while the previous batch is already in process. This is meant to + minimize waiting time for the database. Deactivate this feature if you encounter issues with + databaase connections. + + Boolean + false + true + + + AdditionalTables + An array of table names. By default, the table names will be resolved against the active + data postgres schema configured in the CoStoSys configuration file. If a name is already schema + qualified, i.e. contains a dot, the active data schema will be ignored. When reading documents from + the document data table, the additional tables will be joined onto the data table using the primary + keys of the queried documents. 
Using the table schema for the additional documents defined by the + 'AdditionalTableSchema' parameter, the columns that are marked as 'retrieve=true' in the table + schema, are returned together with the main document data. This mechanism is most prominently used + to retrieve annotation table data together with the original document text in XMI format for the + JeDIS system. + + String + true + false + + + AdditionalTableSchemas + The table schemas that corresponds to the additional tables given with the + 'AdditionalTables' parameter. If only one schema name is given, that schema must apply to all + additional tables. + + String + true + false + + + BatchSize + + Integer + false + true + + + DBDriver + Currently unused because the Hikari JDBC library should recognize the correct driver. + However, there seem to be cases where this doesn't work (HSQLDB). So we keep the parameter for + later. When this issue comes up, the driver would have to be set manually. This isn't done right + now. + + String + false + false + + + Table + The data or subset database table to read from. The name will be resolved against the + active Postgres schema defined in the CoStoSys configuration file.However, if the name contains a + schema qualification (i.e. 'schemaname.tablename), the configuration file will be ignored in this + point. + + String + false + true + + + SelectionOrder + WARNING: Potential SQL injection vulnerability. Do not let unknown users interact with your + database with this component. An SQL ORDER clause specifying in which order the documents in the + target database table should be processed. Only the clause itself must be specified, the ORDER + keyword is automatically added. + + String + false + false + + + WhereCondition + WARNING: Potential SQL injection vulnerability. Do not let unknown users interact with your + database with this component. Only used when reading data tables directly. No effect when the + 'tableName' parameter specifies a subset table. 
The parameter value should be an SQL WHERE clause + restricting the documents to be read. Only the clause itself must be specified, the WHERE keyword is + added automatically. + + String + false + false + + + Limit + + Integer + false + false + + + CostosysConfigFile + File path or classpath resource location to the CoStoSys XML configuration. This + configuration must specify the table schema of the table referred to by the 'Table' parameter as + active table schema. The active table schema is always the schema of the data table that is either + queried directly for documents or, if 'tableName' points to a subset table, indirectly through the + subset table. Make also sure that the active database connection in the configuration points to the + correct database. + + String + false + true + + + + + ResetTable + + false + + + + FetchIdsProactively + + true + + + + BatchSize + + 50 + + + + SelectionOrder + + + + + + + + + + + + + + + + true + false + true + + + \ No newline at end of file From 5fdf9ac06a70e3e8eeee5747f852ad1e6637bcb7 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 14 Feb 2022 18:05:11 +0100 Subject: [PATCH 130/269] Update the meta descriptor of the PMC DB reader to include the reader descriptor. 
--- jcore-pmc-db-reader/component.meta | 8 ++++++-- .../julielab/jcore/reader/xmi/XmiDBMultiplierReader.java | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/jcore-pmc-db-reader/component.meta b/jcore-pmc-db-reader/component.meta index c57c78fa7..0310e53ee 100644 --- a/jcore-pmc-db-reader/component.meta +++ b/jcore-pmc-db-reader/component.meta @@ -1,13 +1,17 @@ { "categories": [ - "multiplier", - "reader" + "reader", + "multiplier" ], "description": "JeDIS database reader for PMC base documents.", "descriptors": [ { "category": "multiplier", "location": "de.julielab.jcore.multiplier.pmc.desc.jcore-pmc-db-multiplier" + }, + { + "category": "reader", + "location": "de.julielab.jcore.multiplier.pmc.desc.jcore-pmc-db-multiplier-reader" } ], "exposable": true, diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierReader.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierReader.java index 185bdd1d4..60c405b2f 100644 --- a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierReader.java +++ b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplierReader.java @@ -94,7 +94,7 @@ public void initialize(UimaContext context) throws ResourceInitializationExcepti } @Override - public void getNext(JCas jCas) throws CollectionException { + public void getNext(JCas jCas) throws CollectionException, IOException { try { super.getNext(jCas); // The above call to super.getNext has created a RowBatch annotation which we retrieve here. From 3aa6cb1bf28629a735d716801342a17ebc58f610 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 14 Feb 2022 18:24:25 +0100 Subject: [PATCH 131/269] Avoid the addition of the "PMC" prefix if it is already there. 
--- .../java/de/julielab/jcore/reader/pmc/parser/FrontParser.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java index af4a2b944..124e47bef 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java @@ -110,7 +110,7 @@ else if (xPathExists(String.format(pubDateFmt, "pmc-release"))) header.setSource("PubMed Central"); header.setComponentId(PMCReader.class.getName()); - pmcid.ifPresent(id -> header.setDocId("PMC" + id)); + pmcid.ifPresent(id -> header.setDocId(id.startsWith("PMC") ? id : "PMC" + id)); pmid.ifPresent(p -> { OtherID otherID = new OtherID(nxmlDocumentParser.cas); otherID.setComponentId(PMCReader.class.getName()); From b0d95fc08f07be76c9e8ee69389c9e648b8e6113 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 15 Feb 2022 11:27:07 +0100 Subject: [PATCH 132/269] BANNER still shows concurrency issues. Added more synchronization. 
--- .../java/banner/tagging/pipe/LemmaPOS.java | 19 +++++++++++++------ .../jcore/ae/banner/BANNERAnnotator.java | 5 ++++- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java b/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java index 36e8a7cd5..41a0a8e5c 100644 --- a/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java +++ b/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java @@ -43,11 +43,13 @@ public LemmaPOS(Lemmatiser lemmatiser, Tagger posTagger) { public void setLemmatiser(Lemmatiser lemmatiser) { initResourcesMap(); getResources().lemmatiser = lemmatiser; +// System.out.println("Setting lemmatiser to " + Thread.currentThread()); } public void setPosTagger(Tagger posTagger) { initResourcesMap(); getResources().posTagger = posTagger; +// System.out.println("Setting PoS Tagger to " + Thread.currentThread()); } synchronized private void initResourcesMap() { @@ -56,12 +58,16 @@ synchronized private void initResourcesMap() { } private Resources getResources() { - return resourcesByThread.compute(Thread.currentThread(), (t, r) -> { - Resources ret = r; - if (ret == null) - ret = new Resources(); - return ret; - }); + Thread currentThread = Thread.currentThread(); + Resources resources = resourcesByThread.get(currentThread); + if (resources == null) { + resources = new Resources(); + synchronized (resourcesByThread) { +// System.out.println("Creating resources for thread " + currentThread); + resourcesByThread.put(currentThread, resources); + } + } + return resources; } @Override @@ -118,6 +124,7 @@ public String toString() { return "Resources{" + "lemmatiser=" + lemmatiser + ", posTagger=" + posTagger + + ", idHashCode= " + System.identityHashCode(this) + '}'; } } diff --git a/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java b/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java index 
b5c7e816e..9241d430f 100644 --- a/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java +++ b/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java @@ -139,7 +139,10 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { // model is deserialized multiple times, the FeatureSet#pipe field seems to be always the // exact same instance, containing a single instance of LemmaPOS (again, despite reading the model // file and deserializing it multiple times). This is why the Thread -> resources map was added. - tagger = CRFTagger.load(modelIs, lemmatiser, posTagger, dictionary); +// System.out.println("Initializing BANNER: " + Thread.currentThread() + " with lemmatiser " + lemmatiser + " and POS tagger " + posTagger); + synchronized (BANNERAnnotator.class) { + tagger = CRFTagger.load(modelIs, lemmatiser, posTagger, dictionary); + } } catch (IOException e) { log.error("Could not load the BANNER model at {}", modelFilename, e); throw new AnalysisEngineProcessException(e); From f44d24e78d5f7b626ea2f7244d9f99808e02fffa Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 15 Feb 2022 11:27:32 +0100 Subject: [PATCH 133/269] Adapt PMC component names to better variants. --- jcore-pmc-db-reader/pom.xml | 2 +- .../jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jcore-pmc-db-reader/pom.xml b/jcore-pmc-db-reader/pom.xml index 21d363909..5efb1a8b2 100644 --- a/jcore-pmc-db-reader/pom.xml +++ b/jcore-pmc-db-reader/pom.xml @@ -61,7 +61,7 @@ ${jcore-utilities-version}
- JCoRe Pubmed Central DB Reader + JCoRe PubMed Central DB Reader JULIE Lab Jena, Germany http://www.julielab.de diff --git a/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier.xml b/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier.xml index 1bf858c07..3193805bf 100644 --- a/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier.xml +++ b/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier.xml @@ -4,7 +4,7 @@ true de.julielab.jcore.multiplier.pmc.PMCDBMultiplier - JCoRe Abstract Database Multiplier + JCoRe PMC Database Multiplier A multiplier that receives document IDs to read from a database table from the DBMultiplierReader. The reader also delivers the path to the corpus storage system (CoStoSys) configuration and additional tables for joining with the main data table. This multiplier class is abstract and cannot be used directly.Extending classes must implement the next() method to actually read documents from the database and populate CASes with them. This component is a part of the Jena Document Information System, JeDIS. JULIE Lab Jena, Germany JULIE Lab Jena, Germany From 76d5190397b2e2cfdea82f7b0e11bc0079e8edcc Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 15 Feb 2022 11:31:37 +0100 Subject: [PATCH 134/269] Adapt the PMC DB reader name in component.meta, too. 
--- jcore-pmc-db-reader/component.meta | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-pmc-db-reader/component.meta b/jcore-pmc-db-reader/component.meta index 0310e53ee..667465029 100644 --- a/jcore-pmc-db-reader/component.meta +++ b/jcore-pmc-db-reader/component.meta @@ -21,5 +21,5 @@ "groupId": "de.julielab", "version": "2.6.0-SNAPSHOT" }, - "name": "JCoRe Pubmed Central DB Reader" + "name": "JCoRe PubMed Central DB Reader" } From 42906eabee103ef37a09efba0612c1bd980a1ee9 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 15 Feb 2022 12:03:07 +0100 Subject: [PATCH 135/269] Correcting the Maven module structure: The PMC DB reader belongs to JeDIS. --- jedis-parent/pom.xml | 1 + pom.xml | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/jedis-parent/pom.xml b/jedis-parent/pom.xml index 0b8807ef9..b5cbf4f94 100644 --- a/jedis-parent/pom.xml +++ b/jedis-parent/pom.xml @@ -36,6 +36,7 @@ ../jcore-xml-db-reader ../jcore-xmi-db-reader ../jcore-xmi-db-writer + ../jcore-pmc-db-reader diff --git a/pom.xml b/pom.xml index e4a6477fa..8ebd5e10c 100644 --- a/pom.xml +++ b/pom.xml @@ -299,8 +299,6 @@ jcore-jedis-integration-tests - jcore-pmc-db-reader - From f265bd22f3c39406701d1c788a8af0290d6aa4b3 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 15 Feb 2022 12:36:42 +0100 Subject: [PATCH 136/269] Import the flow controller types in the annotation defined flow controller descriptor. 
--- .../desc/jcore-annotation-defined-flowcontroller.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/jcore-flow-controllers/src/main/resources/de/julielab/jcore/flow/annotationdefined/desc/jcore-annotation-defined-flowcontroller.xml b/jcore-flow-controllers/src/main/resources/de/julielab/jcore/flow/annotationdefined/desc/jcore-annotation-defined-flowcontroller.xml index 78ea9b35d..b64a02723 100644 --- a/jcore-flow-controllers/src/main/resources/de/julielab/jcore/flow/annotationdefined/desc/jcore-annotation-defined-flowcontroller.xml +++ b/jcore-flow-controllers/src/main/resources/de/julielab/jcore/flow/annotationdefined/desc/jcore-annotation-defined-flowcontroller.xml @@ -9,6 +9,11 @@ JULIE Lab, Germany + + + + + false From b9f1ad189be2ab7a307f00ef4a7435c33d9be752 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 15 Feb 2022 12:37:22 +0100 Subject: [PATCH 137/269] Clarify the SHA hash checking parameter description for the XML database multipliers. --- .../java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java | 2 +- .../multiplier/pmc/desc/jcore-pmc-db-multiplier-reader.xml | 2 +- .../main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java b/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java index 551b8dacb..447e95929 100644 --- a/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java +++ b/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java @@ -42,7 +42,7 @@ public class PMCDBMultiplier extends DBMultiplier { private final static Logger log = LoggerFactory.getLogger(PMCDBMultiplier.class); @ConfigurationParameter(name = PARAM_OMIT_BIB_REFERENCES, mandatory = false, defaultValue = "false", description = "If set to true, references to the bibliography are omitted from the CAS text.") 
protected boolean omitBibReferences; - @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, mandatory = false, description = "For use with AnnotationDefinedFlowController. Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS skips all component except the DBCheckpointAE to mark the document as processed.") + @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, mandatory = false, description = "For use with AnnotationDefinedFlowController. Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS is directly routed to the components specified in the " + PARAM_TO_VISIT_KEYS + " parameter, skipping all other components. Note that this only works with AAEs where the first component is an 'AnnotationControlledFlow'.") private String documentItemToHash; @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT, mandatory = false, description = "For use with AnnotationDefinedFlowController. String parameter indicating the name of the " + "table where the XMI data and, thus, the hash is stored. The name must be schema qualified. Note that in this component, only the ToVisit annotation is created that determines which components to apply to a CAS with matching (unchanged) hash. 
The logic to actually control the CAS flow is contained in the AnnotationDefinedFlowController.") diff --git a/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier-reader.xml b/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier-reader.xml index 7cbc31dcf..6bfd2a7c3 100644 --- a/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier-reader.xml +++ b/jcore-pmc-db-reader/src/main/resources/de/julielab/jcore/multiplier/pmc/desc/jcore-pmc-db-multiplier-reader.xml @@ -3,7 +3,7 @@ org.apache.uima.java de.julielab.jcore.reader.db.DBMultiplierReader - JCoRe Database PMC Multiplier Reader + JCoRe PMC Database Multiplier Reader A collection reader that receives the IDs of documents from a database table. Additional tables may be specified which will, together with the IDs, be sent to a CAS multiplier extending the DBMultiplierReader. The multiplier will read documents and the joined additional tables according to the list of document IDs diff --git a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java index ae7de1cef..b429470c2 100644 --- a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java +++ b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java @@ -57,7 +57,7 @@ public class XMLDBMultiplier extends DBMultiplier { protected String[] rowMappingArray; @ConfigurationParameter(name = PARAM_MAPPING_FILE, description = XMLDBReader.DESC_MAPPING_FILE) protected String mappingFileStr; - @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, mandatory = false, description = "For use with AnnotationDefinedFlowController. Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. 
This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS skips all component except the DBCheckpointAE to mark the document as processed.") + @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, mandatory = false, description = "For use with AnnotationDefinedFlowController. Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS is directly routed to the components specified in the " + PARAM_TO_VISIT_KEYS + " parameter, skipping all other components. Note that this only works with AAEs where the first component is an 'AnnotationControlledFlow'.") private String documentItemToHash; @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT, mandatory = false, description = "For use with AnnotationDefinedFlowController. String parameter indicating the name of the " + "table where the XMI data and, thus, the hash is stored. The name must be schema qualified. Note that in this component, only the ToVisit annotation is created that determines which components to apply to a CAS with matching (unchanged) hash. The logic to actually control the CAS flow is contained in the AnnotationDefinedFlowController.") From d62512a9560253a4b58b2ce5b6b855de06e9fcbb Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 15 Feb 2022 17:03:24 +0100 Subject: [PATCH 138/269] Start writing a consumer for GNormPlus compatible BioC documents. Regards #129. 
--- jcore-gnp-bioc-writer/LICENSE | 26 ++ jcore-gnp-bioc-writer/README.md | 34 ++ jcore-gnp-bioc-writer/pom.xml | 54 +++ .../consumer/gnp/BioCCollectionWriter.java | 55 +++ .../consumer/gnp/BioCDocumentPopulator.java | 29 ++ .../consumer/gnp/GNormPlusFormatWriter.java | 75 ++++ .../jcore/consumer/gnp/desc/PLACEHOLDER | 1 + .../gnp/desc/jcore-gnp-bioc-writer.xml | 21 ++ .../gnp/GNormPlusFormatWriterTest.java | 13 + pom.xml | 341 ++++++++++++------ 10 files changed, 536 insertions(+), 113 deletions(-) create mode 100644 jcore-gnp-bioc-writer/LICENSE create mode 100644 jcore-gnp-bioc-writer/README.md create mode 100644 jcore-gnp-bioc-writer/pom.xml create mode 100644 jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCCollectionWriter.java create mode 100644 jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java create mode 100644 jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java create mode 100644 jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/PLACEHOLDER create mode 100644 jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/jcore-gnp-bioc-writer.xml create mode 100644 jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriterTest.java diff --git a/jcore-gnp-bioc-writer/LICENSE b/jcore-gnp-bioc-writer/LICENSE new file mode 100644 index 000000000..fbbd41e05 --- /dev/null +++ b/jcore-gnp-bioc-writer/LICENSE @@ -0,0 +1,26 @@ +BSD 2-Clause License + +Copyright (c) 2017, JULIE Lab +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/jcore-gnp-bioc-writer/README.md b/jcore-gnp-bioc-writer/README.md new file mode 100644 index 000000000..6b6af0a20 --- /dev/null +++ b/jcore-gnp-bioc-writer/README.md @@ -0,0 +1,34 @@ +# JCoRe GNormPlus BioC Writer + +**Descriptor Path**: +``` +de.julielab.jcore.consumer.gnp.desc.jcore-gnp-bioc-writer +``` + +Writes CAS documents into the BioC XML format used by the gene tagger and normalizer GNormPlus. + + + +**1. Parameters** + +| Parameter Name | Parameter Type | Mandatory | Multivalued | Description | +|----------------|----------------|-----------|-------------|-------------| +| param1 | UIMA-Type | Boolean | Boolean | Description | +| param2 | UIMA-Type | Boolean | Boolean | Description | + +**2. Predefined Settings** + +| Parameter Name | Parameter Syntax | Example | +|----------------|------------------|---------| +| param1 | Syntax-Description | `Example` | +| param2 | Syntax-Description | `Example` | + +**3. 
Capabilities** + +| Type | Input | Output | +|------|:-----:|:------:| +| de.julielab.jcore.types.TYPE | | `+` | +| de.julielab.jcore.types.ace.TYPE | `+` | | + + +[1] Some Literature? diff --git a/jcore-gnp-bioc-writer/pom.xml b/jcore-gnp-bioc-writer/pom.xml new file mode 100644 index 000000000..465d66c2e --- /dev/null +++ b/jcore-gnp-bioc-writer/pom.xml @@ -0,0 +1,54 @@ + + + + 4.0.0 + jcore-gnp-bioc-writer + jar + de.julielab + + + de.julielab + jcore-base + 2.6.0-SNAPSHOT + + + + + ch.qos.logback + logback-classic + test + + + org.slf4j + slf4j-api + + + de.julielab + jcore-types + ${jcore-types-version} + + + com.pengyifan.bioc + pengyifan-bioc + 1.0.3 + + + de.julielab + jcore-utilities + ${jcore-utilities-version} + + + org.junit.jupiter + junit-jupiter-engine + + + JCoRe GNormPlus BioC Writer + + JULIE Lab Jena, Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-gnp-bioc-writer + Writes CAS documents into the BioC XML format used by the gene tagger and normalizer GNormPlus. + + diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCCollectionWriter.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCCollectionWriter.java new file mode 100644 index 000000000..a0e03880a --- /dev/null +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCCollectionWriter.java @@ -0,0 +1,55 @@ +package de.julielab.jcore.consumer.gnp; + +import com.pengyifan.bioc.BioCCollection; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.xml.stream.XMLStreamException; +import java.io.File; +import java.io.IOException; + +/** + * Writes a collection of BioC documents into a single file. That file is created within a subdirectory of + * some base directory and changes over time to avoid overflowing directories.
+ */ +public class BioCCollectionWriter { + private final static Logger log = LoggerFactory.getLogger(BioCCollectionWriter.class); + private int numFilesPerDir; + private File baseDir; + private File currentDir; + private int numWrittenIntoCurrentDir; + + public BioCCollectionWriter(int numFilesPerDir, File baseDir) { + this.numFilesPerDir = numFilesPerDir; + this.baseDir = baseDir; + } + + public void writeBioCCollection(BioCCollection collection) throws XMLStreamException, IOException { + File collectionFile = null; + synchronized (BioCCollectionWriter.class) { + if (!baseDir.exists()) { + log.debug("Creating base BioC collection directory {}", baseDir); + baseDir.mkdirs(); + } + if (currentDir == null) { + int i = 0; + do { + currentDir = new File(baseDir, "bioc_collections_" + i++); + } while (currentDir.exists()); + i = 0; + do { + collectionFile = new File(currentDir, "bioc_collection_" + i++ + ".xml"); + } while (collectionFile.exists()); + } + } + + com.pengyifan.bioc.io.BioCCollectionWriter writer = new com.pengyifan.bioc.io.BioCCollectionWriter(collectionFile); + writer.writeCollection(collection); + ++numWrittenIntoCurrentDir; + // "close" the current directory if the number of files for it has been reached + if (numWrittenIntoCurrentDir >= numFilesPerDir) { + currentDir = null; + numWrittenIntoCurrentDir = 0; + } + } +} diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java new file mode 100644 index 000000000..e161ee4ad --- /dev/null +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java @@ -0,0 +1,29 @@ +package de.julielab.jcore.consumer.gnp; + +import com.pengyifan.bioc.BioCDocument; +import de.julielab.jcore.types.AbstractText; +import de.julielab.jcore.types.Section; +import de.julielab.jcore.types.Title; +import de.julielab.jcore.types.Zone; +import 
org.apache.uima.cas.text.AnnotationIndex; +import org.apache.uima.jcas.JCas; + +/** + * Extracts text passages from the CAS and adds them to a new BioCDocument. + */ +public class BioCDocumentPopulator { + public BioCDocument populate(JCas jCas) { + AnnotationIndex zoneIndex = jCas.getAnnotationIndex(Zone.type); + for (Zone z : zoneIndex) { + if (z instanceof Title) { + // only document title; other titles should be accessed via features of the zone body + } + else if (z instanceof AbstractText) { + // don't check for structured parts; for GNormPlus the only important thing is title, abstract, body + } else if (z instanceof Section) { + // handle headings + } + } + return null; + } +} diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java new file mode 100644 index 000000000..cde7c209b --- /dev/null +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java @@ -0,0 +1,75 @@ +package de.julielab.jcore.consumer.gnp; + +import com.pengyifan.bioc.BioCCollection; +import com.pengyifan.bioc.BioCDocument; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.util.Date; + +@ResourceMetaData(name = "JCoRe GNormPlus BioC Writer", description = "Writes CAS documents into the BioC XML format used by the gene tagger and normalizer GNormPlus.", vendor = "JULIE Lab 
Jena, Germany") +@TypeCapability(inputs = {}, outputs = {}) +public class GNormPlusFormatWriter extends JCasAnnotator_ImplBase { + + public static final String PARAM_NUM_DOCS_PER_FILE = "NumDocsPerFile"; + public static final String PARAM_NUM_FILES_PER_DIR = "NumFilesPerDir"; + public static final String PARAM_BASE_DIR = "BaseDirectory"; + private final static Logger log = LoggerFactory.getLogger(GNormPlusFormatWriter.class); + @ConfigurationParameter(name = PARAM_NUM_DOCS_PER_FILE, description = "The number of documents (i.e. CASes) that should be written into a single BioC XML file.") + private int numDocsPerFile; + @ConfigurationParameter(name = PARAM_NUM_FILES_PER_DIR, description = "The number of files that should be put in a directory before a new one is created.") + private int numDocsPerDir; + @ConfigurationParameter(name = PARAM_BASE_DIR, description = "The base directory into which to create new directories that contain the actual BioC collection files.") + private String baseDirectory; + + private BioCDocumentPopulator bioCDocumentPopulator; + private BioCCollectionWriter bioCCollectionWriter; + private BioCCollection currentCollection; + + /** + * This method is called a single time by the framework at component + * creation. Here, descriptor parameters are read and initial setup is done. 
+ */ + @Override + public void initialize(final UimaContext aContext) throws ResourceInitializationException { + numDocsPerFile = (int) aContext.getConfigParameterValue(PARAM_NUM_DOCS_PER_FILE); + numDocsPerDir = (int) aContext.getConfigParameterValue(PARAM_NUM_FILES_PER_DIR); + baseDirectory = (String) aContext.getConfigParameterValue(PARAM_BASE_DIR); + + bioCDocumentPopulator = new BioCDocumentPopulator(); + bioCCollectionWriter = new BioCCollectionWriter(numDocsPerDir, new File(baseDirectory)); + + currentCollection = new BioCCollection("UTF-8", "1.0", new Date().toString(), true, "JCoRe GNormPlus BioC Writer", "PubTator.key"); + } + + /** + * This method is called for each document going through the component. This + * is where the actual work happens. + */ + @Override + public void process(final JCas jCas) throws AnalysisEngineProcessException { + try { + BioCDocument doc = bioCDocumentPopulator.populate(jCas); + currentCollection.addDocument(doc); + if (currentCollection.getDocmentCount() >= numDocsPerFile) { + bioCCollectionWriter.writeBioCCollection(currentCollection); + currentCollection.clearDocuments(); + currentCollection.clearInfons(); + } + } catch (Exception e) { + log.error("Exception was raised for document {}", JCoReTools.getDocId(jCas)); + throw new AnalysisEngineProcessException(e); + } + } +} + diff --git a/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/PLACEHOLDER b/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/PLACEHOLDER new file mode 100644 index 000000000..9f6c6ddb5 --- /dev/null +++ b/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/PLACEHOLDER @@ -0,0 +1 @@ +The actual descriptor must be created by UIMA fit. 
diff --git a/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/jcore-gnp-bioc-writer.xml b/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/jcore-gnp-bioc-writer.xml new file mode 100644 index 000000000..47d89e355 --- /dev/null +++ b/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/jcore-gnp-bioc-writer.xml @@ -0,0 +1,21 @@ + + + org.apache.uima.java + true + GNormPlusFormatWriter + + JCoRe GNormPlus BioC Writer + + 2.3.0-SNAPSHOT + JULIE Lab Jena, Germany + + + + + + true + true + false + + + diff --git a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriterTest.java b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriterTest.java new file mode 100644 index 000000000..1f6b31b3e --- /dev/null +++ b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriterTest.java @@ -0,0 +1,13 @@ + +package de.julielab.jcore.consumer.gnp; + + + + +/** + * Unit tests for jcore-gnp-bioc-writer. + * + */ +public class GNormPlusFormatWriterTest{ +// TODO +} diff --git a/pom.xml b/pom.xml index 8ebd5e10c..995fec529 100644 --- a/pom.xml +++ b/pom.xml @@ -1,321 +1,436 @@ - + + 4.0.0 - + + - + + de.julielab - + + jcore-parent - + + 2.5.2-SNAPSHOT - + + - + + jcore-base - + + pom - + + JCoRe Base - + + The POM for the JCoRe Base projects. 
- + + 2.6.0-SNAPSHOT - + + - + + JULIE Lab, Germany - + + http://www.julielab.de - + + - + + - + + - + + BSD-2-Clause - + + https://opensource.org/licenses/BSD-2-Clause - + + - + + - + + https://github.com/JULIELab/jcore-base - + + - + + - + + org.apache.uima - + + uimaj-core - + + ${uima-version} - + + - + + - + + org.apache.uima - + + uimafit-core - + + ${uimafit-version} - + + - + + - + + - + + jcore-annotation-adder-ae - + + jcore-ace-reader - + + jcore-acronym-ae - + + jcore-acronym-writer - + + jcore-banner-ae - + + jcore-bc2gm-reader - + + jcore-bc2gmformat-writer - + + jcore-biolemmatizer-ae - + + jcore-bionlpformat-consumer - + + jcore-bionlpformat-reader - + + jcore-biosem-ae - + + jcore-conll-consumer - + + jcore-coordination-baseline-ae - + + jcore-cord19-reader - + + jcore-coreference-writer - + + jcore-ct-reader - + + jcore-db-checkpoint-ae - + + jcore-descriptor-creator - + + jcore-dta-reader - + + jcore-ec-code-ae - + + jcore-elasticsearch-consumer - + + jcore-embedding-writer - + + jcore-event-flattener-ae - + + jcore-feature-value-replacement-ae - + + jcore-file-reader - + + jcore-flair-ner-ae - + + jcore-flair-token-embedding-ae - + + jcore-flow-controllers - + + jcore-gnp-bioc-writer jcore-iexml-consumer - + + jcore-iexml-reader - + + jcore-ign-reader - + + jcore-iob-consumer - + + jcore-jnet-ae - + + jcore-jpos-ae - + + jcore-jsbd-ae - + + jcore-jtbd-ae - + + jcore-julielab-entity-evaluator-consumer - + + jcore-likelihood-assignment-ae - + + jcore-likelihood-detection-ae - + + jcore-line-multiplier - + + jcore-lingpipegazetteer-ae - + + jcore-lingpipe-porterstemmer-ae - + + jcore-lingscope-ae - + + jcore-linnaeus-species-ae - + + jcore-mantra-xml-types - + + jcore-medxn-ae - + + jcore-msdoc-reader - + + jcore-mstparser-ae - + + jcore-muc7-reader - + + jcore-mutationfinder-ae - + + jcore-neo4j-relations-consumer - + + jcore-opennlp-chunk-ae - + + jcore-opennlp-parser-ae - + + jcore-opennlp-postag-ae - + + jcore-opennlp-sentence-ae - + + 
jcore-opennlp-token-ae - + + jcore-ppd-writer - + + jcore-pmc-reader - + + jcore-pubtator-reader - + + jcore-stanford-lemmatizer-ae - + + jcore-topic-indexing-ae - + + jcore-topics-writer - + + jcore-txt-consumer - + + jcore-types - + + jcore-utilities - + + jcore-xml-mapper - + + jcore-xml-reader - + + jcore-xmi-reader - + + jcore-xmi-writer - + + jedis-parent - + + jcore-jedis-integration-tests - + + + jcore-gnp-bioc-writer + - + + - + + scm:git:https://github.com/JULIELab/jcore-base - + + scm:git:https://github.com/JULIELab/jcore-base - + + scm:git:https://github.com/JULIELab/jcore-base - + + - + + From 1af3f7223c2454aeb398701082fe0e5d3c1fdeef Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 16 Feb 2022 10:41:40 +0100 Subject: [PATCH 139/269] Implement the BioCDocumentPopulator. --- .../consumer/gnp/BioCDocumentPopulator.java | 67 ++++++++++++++++--- .../consumer/gnp/GNormPlusFormatWriter.java | 15 ++++- 2 files changed, 70 insertions(+), 12 deletions(-) diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java index e161ee4ad..e1a096c1c 100644 --- a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java @@ -1,29 +1,76 @@ package de.julielab.jcore.consumer.gnp; import com.pengyifan.bioc.BioCDocument; -import de.julielab.jcore.types.AbstractText; -import de.julielab.jcore.types.Section; -import de.julielab.jcore.types.Title; -import de.julielab.jcore.types.Zone; +import com.pengyifan.bioc.BioCPassage; +import de.julielab.jcore.types.*; +import de.julielab.jcore.utility.JCoReTools; import org.apache.uima.cas.text.AnnotationIndex; import org.apache.uima.jcas.JCas; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Extracts text passages from the CAS and adds them to a 
new BioCDocument. */ public class BioCDocumentPopulator { + private final static Logger log = LoggerFactory.getLogger(BioCDocumentPopulator.class); + public BioCDocument populate(JCas jCas) { + BioCDocument doc = new BioCDocument(JCoReTools.getDocId(jCas)); AnnotationIndex zoneIndex = jCas.getAnnotationIndex(Zone.type); for (Zone z : zoneIndex) { if (z instanceof Title) { - // only document title; other titles should be accessed via features of the zone body - } - else if (z instanceof AbstractText) { + Title t = (Title) z; + String titleType; + switch (t.getTitleType()) { + case "document": + titleType = "title"; + break; + case "section": + titleType = "section_title"; + break; + case "figure": + titleType = "figure_title"; + break; + case "table": + titleType = "table_title"; + break; + default: + log.debug("Unhandled title type {}", t.getTitleType()); + titleType = "other_title"; + break; + } + BioCPassage p = getPassageForAnnotation(t); + p.putInfon("type", titleType); + doc.addPassage(p); + } else if (z instanceof AbstractText) { // don't check for structured parts; for GNormPlus the only important thing is title, abstract, body - } else if (z instanceof Section) { - // handle headings + AbstractText at = (AbstractText) z; + BioCPassage p = getPassageForAnnotation(at); + p.putInfon("type", "abstract"); + doc.addPassage(p); + } else if (z instanceof Paragraph) { + Paragraph pa = (Paragraph) z; + BioCPassage p = getPassageForAnnotation(pa); + p.putInfon("type", "paragraph"); + doc.addPassage(p); + } else { + log.debug("Unhandled Zone: {}", z); } } - return null; + return doc; + } + + /** + * Creates a BioCPassage with offset and text corresponding to the passed annotation a. + * + * @param a The annotation to create a BioCPassage for. + * @return A BioCPassage corresponding to a in offset and text. 
+ */ + private BioCPassage getPassageForAnnotation(Annotation a) { + BioCPassage p = new BioCPassage(); + p.setOffset(a.getBegin()); + p.setText(a.getCoveredText()); + return p; } } diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java index cde7c209b..0aa125cc0 100644 --- a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java @@ -10,7 +10,6 @@ import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,7 +40,7 @@ public class GNormPlusFormatWriter extends JCasAnnotator_ImplBase { * creation. Here, descriptor parameters are read and initial setup is done. 
*/ @Override - public void initialize(final UimaContext aContext) throws ResourceInitializationException { + public void initialize(final UimaContext aContext) { numDocsPerFile = (int) aContext.getConfigParameterValue(PARAM_NUM_DOCS_PER_FILE); numDocsPerDir = (int) aContext.getConfigParameterValue(PARAM_NUM_FILES_PER_DIR); baseDirectory = (String) aContext.getConfigParameterValue(PARAM_BASE_DIR); @@ -71,5 +70,17 @@ public void process(final JCas jCas) throws AnalysisEngineProcessException { throw new AnalysisEngineProcessException(e); } } + + @Override + public void collectionProcessComplete() throws AnalysisEngineProcessException { + super.collectionProcessComplete(); + try { + if (currentCollection.getDocmentCount() != 0) + bioCCollectionWriter.writeBioCCollection(currentCollection); + } catch (Exception e) { + log.error("Could not write final batch of BioCDocuments.", e); + throw new AnalysisEngineProcessException(e); + } + } } From bbc96482a72e0990238476f9d41017a7cc2f8065 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 16 Feb 2022 12:48:43 +0100 Subject: [PATCH 140/269] Add the title type `abstract` to abstract titles. 
--- .../julielab/jcore/reader/pmc/resources/elementproperties.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties.yml b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties.yml index 230bbf929..321ddf287 100644 --- a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties.yml +++ b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties.yml @@ -7,6 +7,9 @@ title: default-feature-values: titleType: other paths: + - path: abstract/title + default-feature-values: + titleType: abstract - path: sec/title type: de.julielab.jcore.types.SectionTitle default-feature-values: From 6be8f9c6257db550788281562b09769e211f8782 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 16 Feb 2022 16:08:22 +0100 Subject: [PATCH 141/269] Implement tests for the GNormPlus BioC writer. Writing into BioC documents does not. They are also distributed into files and directories as intended. --- jcore-gnp-bioc-writer/pom.xml | 4 + .../consumer/gnp/BioCCollectionWriter.java | 35 ++++---- .../consumer/gnp/BioCDocumentPopulator.java | 11 ++- .../consumer/gnp/GNormPlusFormatWriter.java | 4 +- .../gnp/BioCDocumentPopulatorTest.java | 41 +++++++++ .../gnp/GNormPlusFormatWriterTest.java | 77 ++++++++++++++++- .../consumer/gnp/TestDocumentGenerator.java | 85 +++++++++++++++++++ 7 files changed, 233 insertions(+), 24 deletions(-) create mode 100644 jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java create mode 100644 jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/TestDocumentGenerator.java diff --git a/jcore-gnp-bioc-writer/pom.xml b/jcore-gnp-bioc-writer/pom.xml index 465d66c2e..4381dfd93 100644 --- a/jcore-gnp-bioc-writer/pom.xml +++ b/jcore-gnp-bioc-writer/pom.xml @@ -42,6 +42,10 @@ org.junit.jupiter junit-jupiter-engine
+ + org.assertj + assertj-core + JCoRe GNormPlus BioC Writer diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCCollectionWriter.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCCollectionWriter.java index a0e03880a..785976d1a 100644 --- a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCCollectionWriter.java +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCCollectionWriter.java @@ -5,8 +5,9 @@ import org.slf4j.LoggerFactory; import javax.xml.stream.XMLStreamException; -import java.io.File; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; /** * Writes a collection of BioC documents into a single file. That file is created within a subdirectory of @@ -15,34 +16,36 @@ public class BioCCollectionWriter { private final static Logger log = LoggerFactory.getLogger(BioCCollectionWriter.class); private int numFilesPerDir; - private File baseDir; - private File currentDir; + private Path baseDir; + private Path currentDir; private int numWrittenIntoCurrentDir; - public BioCCollectionWriter(int numFilesPerDir, File baseDir) { + public BioCCollectionWriter(int numFilesPerDir, Path baseDir) { this.numFilesPerDir = numFilesPerDir; this.baseDir = baseDir; } public void writeBioCCollection(BioCCollection collection) throws XMLStreamException, IOException { - File collectionFile = null; + Path collectionFile = null; synchronized (BioCCollectionWriter.class) { - if (!baseDir.exists()) { - log.debug("Creating base BioC collection directory {}", baseDir); - baseDir.mkdirs(); - } + // currentDir is either null at the very beginning or after a batch of documents have been written if (currentDir == null) { int i = 0; do { - currentDir = new File(baseDir, "bioc_collections_" + i++); - } while (currentDir.exists()); - i = 0; - do { - collectionFile = new File(currentDir, "bioc_collection_" + i++ + ".xml"); - } while (collectionFile.exists()); 
+ currentDir = Path.of(baseDir.toString(), "bioc_collections_" + i++); + } while (Files.exists(currentDir)); + } + int i = 0; + do { + collectionFile = Path.of(currentDir.toString(), "bioc_collection_" + i++ + ".xml"); + } while (Files.exists(collectionFile)); + if (!Files.exists(collectionFile.getParent())) { + log.debug("Creating base BioC collection directory {}", baseDir); + Files.createDirectories(collectionFile.getParent()); } } - + if (collectionFile == null) + throw new IllegalStateException("No file for the next collection was constructed. This is a programming error."); com.pengyifan.bioc.io.BioCCollectionWriter writer = new com.pengyifan.bioc.io.BioCCollectionWriter(collectionFile); writer.writeCollection(collection); ++numWrittenIntoCurrentDir; diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java index e1a096c1c..7dd246876 100644 --- a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java @@ -22,6 +22,8 @@ public BioCDocument populate(JCas jCas) { if (z instanceof Title) { Title t = (Title) z; String titleType; + if (t.getTitleType() == null) + throw new IllegalArgumentException("The titleType feature was not set for " + t); switch (t.getTitleType()) { case "document": titleType = "title"; @@ -54,8 +56,13 @@ public BioCDocument populate(JCas jCas) { BioCPassage p = getPassageForAnnotation(pa); p.putInfon("type", "paragraph"); doc.addPassage(p); - } else { - log.debug("Unhandled Zone: {}", z); + } else if (z instanceof Caption) { + Caption c = (Caption) z; + BioCPassage p = getPassageForAnnotation(c); + if (c.getCaptionType() == null) + throw new IllegalArgumentException("The captionType feature is null for " + c); + p.putInfon("type", c.getCaptionType()); + 
doc.addPassage(p); } } return doc; diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java index 0aa125cc0..24f016a69 100644 --- a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java @@ -13,7 +13,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.File; +import java.nio.file.Path; import java.util.Date; @ResourceMetaData(name = "JCoRe GNormPlus BioC Writer", description = "Writes CAS documents into the BioC XML format used by the gene tagger and normalizer GNormPlus.", vendor = "JULIE Lab Jena, Germany") @@ -46,7 +46,7 @@ public void initialize(final UimaContext aContext) { baseDirectory = (String) aContext.getConfigParameterValue(PARAM_BASE_DIR); bioCDocumentPopulator = new BioCDocumentPopulator(); - bioCCollectionWriter = new BioCCollectionWriter(numDocsPerDir, new File(baseDirectory)); + bioCCollectionWriter = new BioCCollectionWriter(numDocsPerDir, Path.of(baseDirectory)); currentCollection = new BioCCollection("UTF-8", "1.0", new Date().toString(), true, "JCoRe GNormPlus BioC Writer", "PubTator.key"); } diff --git a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java new file mode 100644 index 000000000..8f831bbf0 --- /dev/null +++ b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java @@ -0,0 +1,41 @@ +package de.julielab.jcore.consumer.gnp; + +import com.pengyifan.bioc.BioCCollection; +import com.pengyifan.bioc.BioCDocument; +import com.pengyifan.bioc.io.BioCCollectionWriter; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.Test; + +import 
java.io.ByteArrayOutputStream; +import java.nio.charset.StandardCharsets; +import java.util.Date; + +import static org.assertj.core.api.Assertions.assertThat; +class BioCDocumentPopulatorTest { + @Test + public void populate() throws Exception { + BioCDocumentPopulator populator = new BioCDocumentPopulator(); + JCas jCas = TestDocumentGenerator.prepareCas(1); + BioCDocument biocDoc = populator.populate(jCas); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + BioCCollection collection = new BioCCollection("UTF-8", "1.0", (new Date()).toString(), true, "jUnit Test", "PubTator.key"); + collection.addDocument(biocDoc); + BioCCollectionWriter collectionWriter = new BioCCollectionWriter(baos); + collectionWriter.writeCollection(collection); + String resultXml = baos.toString(StandardCharsets.UTF_8); + // Just check that the test text contents are there that are used in TestDocumentGenerator and that + // there are no duplicates + assertThat(resultXml).containsOnlyOnce("This is the title of document 1."); + assertThat(resultXml).containsOnlyOnce("title"); + // The abstract should be one single string + assertThat(resultXml).containsOnlyOnce("BACKGROUND This abstract section belongs to document 1.\nRESULTS There are certainly some results reported by document 1."); + assertThat(resultXml).containsOnlyOnce("INTRODUCTION"); + assertThat(resultXml).containsOnlyOnce("section_title"); + assertThat(resultXml).contains("paragraph"); + assertThat(resultXml).containsOnlyOnce("This is section 1, paragraph 1 of document 1."); + assertThat(resultXml).containsOnlyOnce("This is a second paragraph in the first section."); + assertThat(resultXml).containsOnlyOnce("table_title"); + assertThat(resultXml).containsOnlyOnce("Tab1."); + assertThat(resultXml).containsOnlyOnce("This is the table1 caption."); + } +} \ No newline at end of file diff --git a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriterTest.java 
b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriterTest.java index 1f6b31b3e..16a3ec233 100644 --- a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriterTest.java +++ b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriterTest.java @@ -1,13 +1,82 @@ - package de.julielab.jcore.consumer.gnp; +import com.pengyifan.bioc.BioCCollection; +import com.pengyifan.bioc.io.BioCCollectionReader; +import org.apache.commons.io.FileUtils; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import static org.assertj.core.api.Assertions.assertThat; /** * Unit tests for jcore-gnp-bioc-writer. 
- * */ -public class GNormPlusFormatWriterTest{ -// TODO +public class GNormPlusFormatWriterTest { + + private static final Path BASEDIR = Path.of("src", "test", "resources", "testoutput"); + + @AfterAll + public static void cleanFinally() { + FileUtils.deleteQuietly(BASEDIR.toFile()); + } + + @BeforeEach + public void cleanOutput() { + FileUtils.deleteQuietly(BASEDIR.toFile()); + } + + private AnalysisEngine getWriterInstance(int docsPerFile, int filesPerDir) throws ResourceInitializationException { + return AnalysisEngineFactory.createEngine(GNormPlusFormatWriter.class, GNormPlusFormatWriter.PARAM_BASE_DIR, BASEDIR.toString(), GNormPlusFormatWriter.PARAM_NUM_DOCS_PER_FILE, docsPerFile, GNormPlusFormatWriter.PARAM_NUM_FILES_PER_DIR, filesPerDir); + } + + @Test + public void process1() throws Exception { + // write a single document + JCas jCas = TestDocumentGenerator.prepareCas(1); + AnalysisEngine writer = getWriterInstance(1, 1); + writer.process(jCas); + writer.collectionProcessComplete(); + + assertThat(Path.of(BASEDIR.toString(), "bioc_collections_0", "bioc_collection_0.xml")).exists().isNotEmptyFile(); + } + + @Test + public void process2() throws Exception { + // write a single document + JCas jCas = TestDocumentGenerator.createTestJCas(); + AnalysisEngine writer = getWriterInstance(2, 3); + for (int i = 0; i < 15; ++i) { + TestDocumentGenerator.prepareCas(jCas, i); + writer.process(jCas); + jCas.reset(); + } + writer.collectionProcessComplete(); + + assertThat(Files.list(BASEDIR)).hasSize(3); + for (int i : List.of(0, 1, 2)) { + List fileIndices = i < 2 ? 
List.of(0, 1, 2) : List.of(0,1); + for (int j : fileIndices) { + assertThat(Path.of(BASEDIR.toString(), "bioc_collections_"+i, "bioc_collection_"+j+".xml")).exists().isNotEmptyFile(); + } + } + // there should only be two files in the last directory + assertThat(Path.of(BASEDIR.toString(), "bioc_collections_2", "bioc_collection_2.xml")).doesNotExist(); + + // the last file should only contain a single document + BioCCollectionReader reader = new BioCCollectionReader(Path.of(BASEDIR.toString(), "bioc_collections_2", "bioc_collection_1.xml")); + BioCCollection lastCollection = reader.readCollection(); + assertThat(lastCollection.getDocmentCount()).isEqualTo(1); + + } + } diff --git a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/TestDocumentGenerator.java b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/TestDocumentGenerator.java new file mode 100644 index 000000000..da5e83a6f --- /dev/null +++ b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/TestDocumentGenerator.java @@ -0,0 +1,85 @@ +package de.julielab.jcore.consumer.gnp; + +import de.julielab.jcore.types.*; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.uima.UIMAException; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; + +public class TestDocumentGenerator { + + public static JCas createTestJCas() throws UIMAException { + return JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); + } + + public static JCas prepareCas(int docId) throws UIMAException { + JCas jCas = createTestJCas(); + return prepareCas(jCas, docId); + } + + public static JCas prepareCas(JCas jCas, int docId) { + Header h = new de.julielab.jcore.types.pubmed.Header(jCas); + h.setDocId(String.valueOf(docId)); + h.addToIndexes(); + + StringBuilder sb = new StringBuilder(); + String ls = System.getProperty("line.separator"); + int 
currentBegin = sb.length(); + sb.append("This is the title of document ").append(docId).append("."); + Title t = new Title(jCas, currentBegin, sb.length()); + t.setTitleType("document"); + t.addToIndexes(); + currentBegin = sb.length(); + sb.append("BACKGROUND This abstract section belongs to document ").append(docId).append("."); + AbstractSectionHeading ash1 = new AbstractSectionHeading(jCas, currentBegin, currentBegin + 10); + ash1.setTitleType("abstract"); + AbstractSection as1 = new AbstractSection(jCas, currentBegin, sb.length()); + as1.setAbstractSectionHeading(ash1); + currentBegin = sb.length(); + sb.append(ls); + sb.append("RESULTS There are certainly some results reported by document ").append(docId).append("."); + AbstractSectionHeading ash2 = new AbstractSectionHeading(jCas, currentBegin, currentBegin + 7); + ash2.setTitleType("abstract"); + AbstractSection as2 = new AbstractSection(jCas, currentBegin, sb.length()); + as2.setAbstractSectionHeading(ash2); + AbstractText at = new AbstractText(jCas, as1.getBegin(), as2.getEnd()); + at.setStructuredAbstractParts(JCoReTools.addToFSArray(JCoReTools.addToFSArray(null, as1), as2)); + at.addToIndexes(); + sb.append(ls); + currentBegin = sb.length(); + sb.append("INTRODUCTION This is section 1, paragraph 1 of document ").append(docId).append("."); + SectionTitle st1 = new SectionTitle(jCas, currentBegin, currentBegin + 12); + st1.setTitleType("section"); + Section s1 = new Section(jCas, currentBegin, sb.length()); + st1.addToIndexes(); + s1.setSectionHeading(st1); + s1.addToIndexes(); + // paragraphs do not include the heading + Paragraph p11 = new Paragraph(jCas, s1.getBegin() + 13, s1.getEnd()); + p11.addToIndexes(); + currentBegin = sb.length(); + sb.append("This is a second paragraph in the first section."); + Paragraph p12 = new Paragraph(jCas, currentBegin, sb.length()); + p12.addToIndexes(); + currentBegin = sb.length(); + int objectBegin = sb.length(); + sb.append("Let this be table content."); + 
currentBegin = sb.length(); + sb.append("Tab1."); + Title tabTitle = new Title(jCas, currentBegin, sb.length()); + tabTitle.setTitleType("table"); + tabTitle.addToIndexes(); + currentBegin = sb.length(); + sb.append("This is the table1 caption."); + Caption tCap = new Caption(jCas, currentBegin, sb.length()); + tCap.setCaptionType("table"); + tCap.addToIndexes(); + Table tab = new Table(jCas, objectBegin, sb.length()); + tab.setObjectTitle(tabTitle); + tab.setObjectCaption(tCap); + tab.addToIndexes(); + tab.addToIndexes(); + jCas.setDocumentText(sb.toString()); + return jCas; + } +} From 2b6a9c2c015b9f9d145371a8865e45b46cd2d352 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 16 Feb 2022 16:17:33 +0100 Subject: [PATCH 142/269] Add descriptor and component.meta to the GNP BioC writer. Resolves #129. --- jcore-gnp-bioc-writer/component.meta | 20 ++++++ jcore-gnp-bioc-writer/pom.xml | 4 ++ .../jcore/consumer/gnp/desc/PLACEHOLDER | 1 - .../gnp/desc/jcore-gnp-bioc-writer.xml | 71 ++++++++++++++----- 4 files changed, 76 insertions(+), 20 deletions(-) create mode 100644 jcore-gnp-bioc-writer/component.meta delete mode 100644 jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/PLACEHOLDER diff --git a/jcore-gnp-bioc-writer/component.meta b/jcore-gnp-bioc-writer/component.meta new file mode 100644 index 000000000..78c499835 --- /dev/null +++ b/jcore-gnp-bioc-writer/component.meta @@ -0,0 +1,20 @@ +{ + "categories": [ + "consumer" + ], + "description": "Writes CAS documents into the BioC XML format used by the gene tagger and normalizer GNormPlus.", + "descriptors": [ + { + "category": "consumer", + "location": "de.julielab.jcore.consumer.gnp.desc.jcore-gnp-bioc-writer" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-gnp-bioc-writer", + "groupId": "de.julielab", + "version": "2.6.0-SNAPSHOT" + }, + "name": "JCoRe GNormPlus BioC Writer" +} diff --git a/jcore-gnp-bioc-writer/pom.xml 
b/jcore-gnp-bioc-writer/pom.xml index 4381dfd93..93aa158ea 100644 --- a/jcore-gnp-bioc-writer/pom.xml +++ b/jcore-gnp-bioc-writer/pom.xml @@ -46,6 +46,10 @@ org.assertj assertj-core + + de.julielab + jcore-descriptor-creator + JCoRe GNormPlus BioC Writer diff --git a/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/PLACEHOLDER b/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/PLACEHOLDER deleted file mode 100644 index 9f6c6ddb5..000000000 --- a/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/PLACEHOLDER +++ /dev/null @@ -1 +0,0 @@ -The actual descriptor must be created by UIMA fit. diff --git a/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/jcore-gnp-bioc-writer.xml b/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/jcore-gnp-bioc-writer.xml index 47d89e355..524f590ea 100644 --- a/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/jcore-gnp-bioc-writer.xml +++ b/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/jcore-gnp-bioc-writer.xml @@ -1,21 +1,54 @@ - org.apache.uima.java - true - GNormPlusFormatWriter - - JCoRe GNormPlus BioC Writer - - 2.3.0-SNAPSHOT - JULIE Lab Jena, Germany - - - - - - true - true - false - - - + org.apache.uima.java + true + de.julielab.jcore.consumer.gnp.GNormPlusFormatWriter + + JCoRe GNormPlus BioC Writer + Writes CAS documents into the BioC XML format used by the gene tagger and normalizer GNormPlus. + JULIE Lab Jena, Germany + + + NumDocsPerFile + The number of documents (i.e. CASes) that should be written into a single BioC XML file. + Integer + false + true + + + NumFilesPerDir + The number of files that should be put in a directory before a new one is created. + Integer + false + true + + + BaseDirectory + The base directory into which to create new directories that contain the actual BioC collection files. 
+ String + false + true + + + + + + + + + + + + + + + + + + + true + true + false + + + \ No newline at end of file From c535e78199407a35d084989bfa8d1a63a2c88831 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 16 Feb 2022 16:21:27 +0100 Subject: [PATCH 143/269] Remove duplicate module entry for GNP BioC Writer. --- pom.xml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 995fec529..662cf49ef 100644 --- a/pom.xml +++ b/pom.xml @@ -406,8 +406,7 @@ jcore-jedis-integration-tests - jcore-gnp-bioc-writer - + From b0e1be9d3d7a4e15fb78b61c84ae8f6df06d6f84 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 17 Feb 2022 10:06:52 +0100 Subject: [PATCH 144/269] Adapt output more to the GNormPlus corpus format. Which currently means to put the structures abstract section labels into the text (BACKGROUND, RESULTS etc). Also, sort out empty titles (which are the structured abstract headings in our current handling of PubMed). --- .../consumer/gnp/BioCCollectionWriter.java | 2 +- .../consumer/gnp/BioCDocumentPopulator.java | 26 ++++++++++++++++--- .../gnp/BioCDocumentPopulatorTest.java | 2 +- .../consumer/gnp/TestDocumentGenerator.java | 12 +++++---- .../xmi/desc/jcore-xmi-db-multiplier.xml | 1 + 5 files changed, 32 insertions(+), 11 deletions(-) diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCCollectionWriter.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCCollectionWriter.java index 785976d1a..df5b12587 100644 --- a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCCollectionWriter.java +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCCollectionWriter.java @@ -26,7 +26,7 @@ public BioCCollectionWriter(int numFilesPerDir, Path baseDir) { } public void writeBioCCollection(BioCCollection collection) throws XMLStreamException, IOException { - Path collectionFile = null; + Path collectionFile; synchronized 
(BioCCollectionWriter.class) { // currentDir is either null at the very beginning or after a batch of documents have been written if (currentDir == null) { diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java index 7dd246876..96120276c 100644 --- a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java @@ -19,6 +19,8 @@ public BioCDocument populate(JCas jCas) { BioCDocument doc = new BioCDocument(JCoReTools.getDocId(jCas)); AnnotationIndex zoneIndex = jCas.getAnnotationIndex(Zone.type); for (Zone z : zoneIndex) { + if (z.getEnd() - z.getBegin() <= 0) + continue; if (z instanceof Title) { Title t = (Title) z; String titleType; @@ -46,11 +48,27 @@ public BioCDocument populate(JCas jCas) { p.putInfon("type", titleType); doc.addPassage(p); } else if (z instanceof AbstractText) { - // don't check for structured parts; for GNormPlus the only important thing is title, abstract, body AbstractText at = (AbstractText) z; - BioCPassage p = getPassageForAnnotation(at); - p.putInfon("type", "abstract"); - doc.addPassage(p); + if (at.getStructuredAbstractParts() != null && at.getStructuredAbstractParts().size() > 0) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < at.getStructuredAbstractParts().size() && at.getStructuredAbstractParts(i) != null; ++i) { + AbstractSection abstractPart = at.getStructuredAbstractParts(i); + String sectionLabel = ((AbstractSectionHeading) abstractPart.getAbstractSectionHeading()).getLabel(); + sb.append(sectionLabel).append(": "); + sb.append(abstractPart.getCoveredText()); + if (i < at.getStructuredAbstractParts().size() - 1 && at.getStructuredAbstractParts(i+1) != null) + sb.append(" "); + } + BioCPassage p = new BioCPassage(); + 
p.setOffset(at.getBegin()); + p.setText(sb.toString()); + p.putInfon("type", "abstract"); + doc.addPassage(p); + } else { + BioCPassage p = getPassageForAnnotation(at); + p.putInfon("type", "abstract"); + doc.addPassage(p); + } } else if (z instanceof Paragraph) { Paragraph pa = (Paragraph) z; BioCPassage p = getPassageForAnnotation(pa); diff --git a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java index 8f831bbf0..9f085bc0b 100644 --- a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java +++ b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java @@ -28,7 +28,7 @@ public void populate() throws Exception { assertThat(resultXml).containsOnlyOnce("This is the title of document 1."); assertThat(resultXml).containsOnlyOnce("title"); // The abstract should be one single string - assertThat(resultXml).containsOnlyOnce("BACKGROUND This abstract section belongs to document 1.\nRESULTS There are certainly some results reported by document 1."); + assertThat(resultXml).containsOnlyOnce("BACKGROUND: This abstract section belongs to document 1. 
RESULTS: There are certainly some results reported by document 1."); assertThat(resultXml).containsOnlyOnce("INTRODUCTION"); assertThat(resultXml).containsOnlyOnce("section_title"); assertThat(resultXml).contains("paragraph"); diff --git a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/TestDocumentGenerator.java b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/TestDocumentGenerator.java index da5e83a6f..55ca81a02 100644 --- a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/TestDocumentGenerator.java +++ b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/TestDocumentGenerator.java @@ -30,15 +30,17 @@ public static JCas prepareCas(JCas jCas, int docId) { t.setTitleType("document"); t.addToIndexes(); currentBegin = sb.length(); - sb.append("BACKGROUND This abstract section belongs to document ").append(docId).append("."); - AbstractSectionHeading ash1 = new AbstractSectionHeading(jCas, currentBegin, currentBegin + 10); + sb.append("This abstract section belongs to document ").append(docId).append("."); + AbstractSectionHeading ash1 = new AbstractSectionHeading(jCas); + ash1.setLabel("BACKGROUND"); ash1.setTitleType("abstract"); AbstractSection as1 = new AbstractSection(jCas, currentBegin, sb.length()); as1.setAbstractSectionHeading(ash1); - currentBegin = sb.length(); sb.append(ls); - sb.append("RESULTS There are certainly some results reported by document ").append(docId).append("."); - AbstractSectionHeading ash2 = new AbstractSectionHeading(jCas, currentBegin, currentBegin + 7); + currentBegin = sb.length(); + sb.append("There are certainly some results reported by document ").append(docId).append("."); + AbstractSectionHeading ash2 = new AbstractSectionHeading(jCas); + ash2.setLabel("RESULTS"); ash2.setTitleType("abstract"); AbstractSection as2 = new AbstractSection(jCas, currentBegin, sb.length()); as2.setAbstractSectionHeading(ash2); diff --git 
a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml index c124b4804..bd4929ad1 100644 --- a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml +++ b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml @@ -29,6 +29,7 @@ + From 3a6b74ed4552c346e454e3f7f47573f0a1eefc2c Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 17 Feb 2022 11:46:00 +0100 Subject: [PATCH 145/269] Revoke the last change regarding structured abstracts. The issue was actually the newlines in the abstract text, not the omitted headings. --- .../consumer/gnp/BioCDocumentPopulator.java | 25 +++---------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java index 96120276c..488f42613 100644 --- a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java @@ -49,26 +49,9 @@ public BioCDocument populate(JCas jCas) { doc.addPassage(p); } else if (z instanceof AbstractText) { AbstractText at = (AbstractText) z; - if (at.getStructuredAbstractParts() != null && at.getStructuredAbstractParts().size() > 0) { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < at.getStructuredAbstractParts().size() && at.getStructuredAbstractParts(i) != null; ++i) { - AbstractSection abstractPart = at.getStructuredAbstractParts(i); - String sectionLabel = ((AbstractSectionHeading) abstractPart.getAbstractSectionHeading()).getLabel(); - sb.append(sectionLabel).append(": "); - sb.append(abstractPart.getCoveredText()); - if (i < 
at.getStructuredAbstractParts().size() - 1 && at.getStructuredAbstractParts(i+1) != null) - sb.append(" "); - } - BioCPassage p = new BioCPassage(); - p.setOffset(at.getBegin()); - p.setText(sb.toString()); - p.putInfon("type", "abstract"); - doc.addPassage(p); - } else { - BioCPassage p = getPassageForAnnotation(at); - p.putInfon("type", "abstract"); - doc.addPassage(p); - } + BioCPassage p = getPassageForAnnotation(at); + p.putInfon("type", "abstract"); + doc.addPassage(p); } else if (z instanceof Paragraph) { Paragraph pa = (Paragraph) z; BioCPassage p = getPassageForAnnotation(pa); @@ -95,7 +78,7 @@ public BioCDocument populate(JCas jCas) { private BioCPassage getPassageForAnnotation(Annotation a) { BioCPassage p = new BioCPassage(); p.setOffset(a.getBegin()); - p.setText(a.getCoveredText()); + p.setText(a.getCoveredText().replaceAll("\n", " ")); return p; } } From ef27b1d5b174e091c2ff8a4c75317a296f812eac Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 17 Feb 2022 18:19:37 +0100 Subject: [PATCH 146/269] Add a mechanism to avoid mirror subset reset for updated JeDIS document whose text hasn't changed; resolves #130. The XMLDBMultiplier already had the "ToVisit" mechanism for the AnnotationDefinedFlowController that could skip the majority of a pipeline when the document hash in the database was the same as the hash for the updated document text. However, we needed to update the base documents and, thus, needed to not skip the XMI writer. But by default, that would cause the mirror subsets to reset. Now, the XMLDBMultiplier fills the new feature of the DBProcessingMetaData annotation named "isDocumentHashUnchanged". If it is set to be unchanged, the XMIDBWriter will not reset the mirror subsets for that document. 
--- .../jcore-document-meta-extension-types.xml | 5 + .../jcore/consumer/xmi/XMIDBWriter.java | 53 ++-- .../jcore/consumer/xmi/XmiDataInserter.java | 28 ++- .../jcore/consumer/xmi/XmiDBWriterTest.java | 229 ++++++++++++++---- .../jcore/reader/xml/XMLDBMultiplier.java | 3 +- 5 files changed, 245 insertions(+), 73 deletions(-) diff --git a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-document-meta-extension-types.xml b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-document-meta-extension-types.xml index 115927024..200ff0383 100644 --- a/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-document-meta-extension-types.xml +++ b/jcore-types/src/main/resources/de/julielab/jcore/types/extensions/jcore-document-meta-extension-types.xml @@ -25,6 +25,11 @@ This feature is used by the DBCheckpointAE. It allows components in the pipeline to prevent a document to be marked as being finished with processing. This can be used to indicate issues with specific documents which will require reprocessing. uima.cas.Boolean + + isDocumentHashUnchanged + For use by the XMIDBWriter. Used to prohibit that mirror subsets reset to 'not processed' for this document when there was no change in the document text. That allows to update the base document without indicating that a reprocessing is required. This is useful when the document is updated by the distributor (e.g. PubMed) but the text contents have not changed. 
+ uima.cas.Boolean + diff --git a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java index b9594dda3..8a085cf8b 100644 --- a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java +++ b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java @@ -29,6 +29,7 @@ import de.julielab.jcore.types.Header; import de.julielab.jcore.types.XmiMetaData; import de.julielab.jcore.types.ext.DBProcessingMetaData; +import de.julielab.jcore.utility.JCoReTools; import de.julielab.xml.*; import de.julielab.xml.binary.BinaryJeDISNodeEncoder; import de.julielab.xml.binary.BinaryStorageAnalysisResult; @@ -250,6 +251,7 @@ public class XMIDBWriter extends JCasAnnotator_ImplBase { @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, mandatory = false, description = "Possible values: document_text. If this parameter is set to a valid value, the SHA256 hash for the given value will be calculated, base64 encoded and added to each document as a new column in the document table. The column will be named after the parameter value, suffixed by '_sha256'.") private String documentItemToHash; private Map shaMap; + private Set mirrorResetIds; private String mappingCacheKey; private DocumentReleaseCheckpoint docReleaseCheckpoint; private List currentDocumentIdBatch; @@ -288,8 +290,8 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept // The deletion of obsolete annotations should only be active when the base document is stored because then, old annotations won't be valid any more. 
deleteObsolete &= storeBaseDocument; baseDocumentAnnotationTypes = Arrays.stream( - Optional.ofNullable((String[]) aContext.getConfigParameterValue(PARAM_BASE_DOCUMENT_ANNOTATION_TYPES)) - .orElse(new String[0])) + Optional.ofNullable((String[]) aContext.getConfigParameterValue(PARAM_BASE_DOCUMENT_ANNOTATION_TYPES)) + .orElse(new String[0])) .collect(Collectors.toSet()); attributeSize = (Integer) aContext.getConfigParameterValue(PARAM_ATTRIBUTE_SIZE); writeBatchSize = Optional.ofNullable((Integer) aContext.getConfigParameterValue(PARAM_WRITE_BATCH_SIZE)).orElse(50); @@ -423,6 +425,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept if (useBinaryFormat) { this.binaryEncoder = new BinaryJeDISNodeEncoder(); } + mirrorResetIds = new HashSet<>(); log.info(XMIDBWriter.class.getName() + " initialized."); log.info("Effective document table name: {}", effectiveDocTableName); @@ -509,7 +512,13 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { } catch (IllegalArgumentException e) { // Do nothing; this is not the work item CAS } - DocumentId docId = getDocumentId(aJCas); + Collection metaDatas = JCasUtil.select(aJCas, DBProcessingMetaData.class); + if (metaDatas.size() > 1) + throw new AnalysisEngineProcessException(new IllegalArgumentException( + "There is more than one type of DBProcessingMetaData in document " + JCoReTools.getDocId(aJCas))); + Optional metaData = metaDatas.stream().findAny(); + DocumentId docId = getDocumentId(aJCas, metaData); + setMirrorResetStateForDocId(docId, metaData); if (docId == null) { log.warn("The current document does not have a document ID. 
It is omitted from database import."); return; @@ -518,12 +527,8 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { currentDocumentIdBatch.add(docId); if (subsetTable == null) { - Collection metaData = JCasUtil.select(aJCas, DBProcessingMetaData.class); if (!metaData.isEmpty()) { - if (metaData.size() > 1) - throw new AnalysisEngineProcessException(new IllegalArgumentException( - "There is more than one type of DBProcessingMetaData in document " + docId)); - subsetTable = metaData.stream().findAny().get().getSubsetTable(); + subsetTable = metaData.get().getSubsetTable(); if (subsetTable != null && storeBaseDocument) { // Check if we are about to read from a mirror subset and to update the base document. This is not allowed @@ -563,6 +568,19 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { } } + private void setMirrorResetStateForDocId(DocumentId docId, Optional metaData) { + if (metaData.isPresent()) { + // mirror subset reset is only necessary if we store the base document in any way; + // additionally, we check if the document text hash key is reported to by different to its already + // existing database entry. Only then the mirror subsets should be reset for this document. 
+ if (storeBaseDocument && !metaData.get().getIsDocumentHashUnchanged()) + mirrorResetIds.add(docId); + } else { + // default: reset the mirror tables + mirrorResetIds.add(docId); + } + } + private void handleAddhash(JCas aJCas, DocumentId docId) { if (documentItemToHash != null) { final String documentText = aJCas.getDocumentText(); @@ -836,15 +854,14 @@ private Map convertModuleLabelsToColumnNames(Map< return convertedMap; } - private DocumentId getDocumentId(JCas aJCas) { + private DocumentId getDocumentId(JCas aJCas, Optional metaData) { DocumentId docId = null; - try { - DBProcessingMetaData dbProcessingMetaData = JCasUtil.selectSingle(aJCas, DBProcessingMetaData.class); - docId = new DocumentId(dbProcessingMetaData); - } catch (IllegalArgumentException e) { - // it seems there is not DBProcessingMetaData we could get a complex primary key from. The document ID + if (metaData.isPresent()) { + docId = new DocumentId(metaData.get()); + } else { + // it seems there is no DBProcessingMetaData we could get a complex primary key from. The document ID // will have to do. - log.trace("Could not find the primary key in the DBProcessingMetaData due to exception: {}. Using the document ID as primary key.", e.getMessage()); + log.trace("Could not find the primary key in the DBProcessingMetaData because no meta data annotation is set. 
Using the document ID as primary key."); } if (docId == null) { AnnotationIndex headerIndex = aJCas.getAnnotationIndex(Header.type); @@ -1005,7 +1022,7 @@ public void batchProcessComplete() throws AnalysisEngineProcessException { final boolean readyToSendData = processXmiBuffer(); if (readyToSendData) { if (!(featuresToMapDryRun && useBinaryFormat)) - annotationInserter.sendXmiDataToDatabase(effectiveDocTableName, annotationModules, subsetTable, storeBaseDocument, deleteObsolete, shaMap); + annotationInserter.sendXmiDataToDatabase(effectiveDocTableName, annotationModules, subsetTable, mirrorResetIds, deleteObsolete, shaMap); else log.info("The dry run to see details about features to be mapped in the binary format is activated. No contents are written into the database."); log.trace("Clearing {} annotation modules", annotationModules.size()); @@ -1015,6 +1032,7 @@ public void batchProcessComplete() throws AnalysisEngineProcessException { if (docReleaseCheckpoint != null) docReleaseCheckpoint.release(jedisSyncKey, currentDocumentIdBatch.stream()); currentDocumentIdBatch.clear(); + mirrorResetIds.clear(); } } catch (XmiDataInsertionException e) { throw new AnalysisEngineProcessException(e); @@ -1034,7 +1052,7 @@ public void collectionProcessComplete() throws AnalysisEngineProcessException { try { processXmiBuffer(); if (!(featuresToMapDryRun && useBinaryFormat)) - annotationInserter.sendXmiDataToDatabase(effectiveDocTableName, annotationModules, subsetTable, storeBaseDocument, deleteObsolete, shaMap); + annotationInserter.sendXmiDataToDatabase(effectiveDocTableName, annotationModules, subsetTable, mirrorResetIds, deleteObsolete, shaMap); else log.info("The dry run to see details about features to be mapped in the binary format is activated. 
No contents are written into the database."); annotationModules.clear(); @@ -1043,6 +1061,7 @@ public void collectionProcessComplete() throws AnalysisEngineProcessException { if (docReleaseCheckpoint != null) docReleaseCheckpoint.release(jedisSyncKey, currentDocumentIdBatch.stream()); currentDocumentIdBatch.clear(); + mirrorResetIds.clear(); } catch (XmiDataInsertionException e) { throw new AnalysisEngineProcessException(e); } diff --git a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java index 1a75f474e..390e27e67 100644 --- a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java +++ b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java @@ -18,6 +18,7 @@ import java.sql.SQLException; import java.util.*; import java.util.function.Function; +import java.util.function.Predicate; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -59,23 +60,33 @@ public XmiDataInserter(Set annotationModuleColumnNames, * will be a primary key constraint violation, i.e. duplicates). 
* * @param annotationModules - * @param storeBaseDocument + * @param mirrorResetIds * @param deleteObsolete * @param shaMap * @throws XmiDataInsertionException * @throws AnalysisEngineProcessException */ - public void sendXmiDataToDatabase(String xmiTableName, List annotationModules, String subsetTableName, Boolean storeBaseDocument, Boolean deleteObsolete, Map shaMap) throws XmiDataInsertionException { + public void sendXmiDataToDatabase(String xmiTableName, List annotationModules, String subsetTableName, Set mirrorResetIds, Boolean deleteObsolete, Map shaMap) throws XmiDataInsertionException { log.trace("Sending {} XMI data items", annotationModules.size()); final Map> dataByDoc = annotationModules.stream().collect(Collectors.groupingBy(XmiData::getDocId)); // Collect all document IDs we want to add something for into the database. This can be annotations or the hash. final Set documentIdsWithData = shaMap != null ? Sets.union(dataByDoc.keySet(), shaMap.keySet()) : dataByDoc.keySet(); log.trace("There are {} documents with values to be updated in the database.", documentIdsWithData.size()); class RowIterator implements Iterator> { + /** + * An iterator that always returns only rows for a subset of document IDs. Either the ones that need mirror subsets to be reset or those for which mirror subsets should not be reset. + * @param returnDocumentsWithMirrorReset + */ + public RowIterator(boolean returnDocumentsWithMirrorReset) { + Predicate mirrorResetFilterPredicate = docId -> mirrorResetIds.contains(docId); + if (!returnDocumentsWithMirrorReset) + mirrorResetFilterPredicate = Predicate.not(mirrorResetFilterPredicate); + docIdIterator = Stream.concat(documentIdsWithData.stream(), processedDocumentIds.stream()).filter(mirrorResetFilterPredicate).distinct().iterator(); + } // Add documents that have been processed but no data. We need to do this to override potentially existing // annotation values with null to remove them. 
- private Iterator docIdIterator = Stream.concat(documentIdsWithData.stream(), processedDocumentIds.stream()).distinct().iterator(); + private Iterator docIdIterator; private FieldConfig fieldConfig = dbc.getFieldConfiguration(schemaDocument); private List> fields = fieldConfig.getFields(); @@ -169,12 +180,15 @@ public void remove() { // This is the private in-line defined class from above. All values are already contained in the class // definition. - RowIterator iterator = new RowIterator(); + RowIterator iterator = new RowIterator(true); try { if (updateMode) { - log.debug("Updating {} XMI CAS data in database table '{}'.", - annotationModules.size(), xmiTableName); - dbc.updateFromRowIterator(iterator, xmiTableName, false, storeBaseDocument, schemaDocument); + log.debug("Updating {} XMI CAS data in database table '{}' for documents with mirror subset resets.", + mirrorResetIds.size(), xmiTableName); + dbc.updateFromRowIterator(iterator, xmiTableName, false, true, schemaDocument); + log.debug("Updating {} XMI CAS data in database table '{}' for documents without mirror subset resets.", + annotationModules.size()-mirrorResetIds.size(), xmiTableName); + dbc.updateFromRowIterator(new RowIterator(false), xmiTableName, false, false, schemaDocument); } else { log.debug("Inserting {} XMI CAS data into database table '{}'.", annotationModules.size(), xmiTableName); diff --git a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java index 68150ad75..fc93a2138 100644 --- a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java +++ b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java @@ -4,6 +4,7 @@ import de.julielab.costosys.dbconnection.DataBaseConnector; import de.julielab.jcore.db.test.DBTestUtils; import de.julielab.jcore.types.*; +import 
de.julielab.jcore.types.ext.DBProcessingMetaData; import de.julielab.xml.XmiSplitConstants; import org.apache.commons.configuration2.ex.ConfigurationException; import org.apache.uima.UIMAException; @@ -11,19 +12,23 @@ import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.StringArray; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.InvalidXMLException; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; import org.testcontainers.containers.PostgreSQLContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.sql.ResultSet; import java.sql.SQLException; import java.util.List; -import java.util.Map; -import java.util.Set; +import java.util.*; import java.util.stream.Collectors; import static org.assertj.core.api.Assertions.assertThat; @@ -32,7 +37,7 @@ @Testcontainers public class XmiDBWriterTest { @Container - public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:"+DataBaseConnector.POSTGRES_VERSION); + public static PostgreSQLContainer postgres = new PostgreSQLContainer("postgres:" + DataBaseConnector.POSTGRES_VERSION); private static String costosysConfig; private static DataBaseConnector dbc; @@ -58,48 +63,6 @@ public static JCas getJCasWithRequiredTypes() throws UIMAException { "de.julielab.jcore.types.jcore-xmi-splitter-types"); } - @Test - public void testXmiDBWriterSplitAnnotations() throws Exception { - - AnalysisEngine xmiWriter = AnalysisEngineFactory.createEngine("de.julielab.jcore.consumer.xmi.desc.jcore-xmi-db-writer", - XMIDBWriter.PARAM_ANNOS_TO_STORE, new String[]{Token.class.getCanonicalName(), 
Sentence.class.getCanonicalName()}, - XMIDBWriter.PARAM_COSTOSYS_CONFIG, costosysConfig, - XMIDBWriter.PARAM_STORE_ALL, false, - XMIDBWriter.PARAM_STORE_BASE_DOCUMENT, true, - XMIDBWriter.PARAM_TABLE_DOCUMENT, "_data.documents2", - XMIDBWriter.PARAM_DO_GZIP, false, - XMIDBWriter.PARAM_STORE_RECURSIVELY, true, - XMIDBWriter.PARAM_UPDATE_MODE, true, - XMIDBWriter.PARAM_BASE_DOCUMENT_ANNOTATION_TYPES, new String[]{MeshHeading.class.getCanonicalName(), AbstractText.class.getCanonicalName(), Title.class.getCanonicalName(), de.julielab.jcore.types.pubmed.Header.class.getCanonicalName()} - ); - JCas jCas = getJCasWithRequiredTypes(); - final Header header = new Header(jCas); - header.setDocId("789"); - header.addToIndexes(); - jCas.setDocumentText("This is a sentence. This is another one."); - new Sentence(jCas, 0, 19).addToIndexes(); - new Sentence(jCas, 20, 40).addToIndexes(); - // Of course, these token offsets are wrong, but it doesn't matter to the test - new Token(jCas, 0, 19).addToIndexes(); - new Token(jCas, 20, 40).addToIndexes(); - assertThatCode(() -> xmiWriter.process(jCas)).doesNotThrowAnyException(); - jCas.reset(); - xmiWriter.collectionProcessComplete(); - - dbc = DBTestUtils.getDataBaseConnector(postgres); - try (CoStoSysConnection ignored = dbc.obtainOrReserveConnection()) { - assertThat(dbc.tableExists("_data.documents2")).isTrue(); - - assertThat(dbc.getTableColumnNames("_data.documents2")).contains("de_julielab_jcore_types_token", "de_julielab_jcore_types_sentence"); - assertThat(dbc.isEmpty("_data.documents2", XmiSplitConstants.BASE_DOC_COLUMN)).isFalse(); - assertThat(dbc.isEmpty("_data.documents2", XmiDataInserter.FIELD_MAX_XMI_ID)).isFalse(); - assertThat(dbc.isEmpty("_data.documents2", "sofa_mapping")).isFalse(); - assertThat(dbc.isEmpty("_data.documents2", "de_julielab_jcore_types_token")).isFalse(); - assertThat(dbc.isEmpty("_data.documents2", "de_julielab_jcore_types_sentence")).isFalse(); - - } - } - @Test public void 
testXmiDBWriterSplitAnnotationsSpecifyAnnotationSchemas() throws Exception { @@ -148,7 +111,7 @@ public void testXmiDBWriterSplitAnnotationsSpecifyAnnotationSchemas() throws Exc public void testXmiDBWriterSplitAnnotationsDefaultAnnotationSchemas() throws Exception { AnalysisEngine xmiWriter = AnalysisEngineFactory.createEngine("de.julielab.jcore.consumer.xmi.desc.jcore-xmi-db-writer", - XMIDBWriter.PARAM_ANNOS_TO_STORE, new String[]{ Token.class.getCanonicalName(), Sentence.class.getCanonicalName()}, + XMIDBWriter.PARAM_ANNOS_TO_STORE, new String[]{Token.class.getCanonicalName(), Sentence.class.getCanonicalName()}, XMIDBWriter.PARAM_ANNO_DEFAULT_QUALIFIER, "testschema", XMIDBWriter.PARAM_COSTOSYS_CONFIG, costosysConfig, XMIDBWriter.PARAM_STORE_ALL, false, @@ -216,9 +179,179 @@ public void testXmiSubtypeStorage() throws Exception { assertThat(dbc.tableExists("_data.documents3")).isTrue(); ResultSet rs = ignored.createStatement().executeQuery("SELECT " + XmiSplitConstants.BASE_DOC_COLUMN + " FROM " + "_data.documents3"); assertThat(rs.next()).isTrue(); - String documentString = rs.getString(1); - System.out.println(documentString); +// String documentString = rs.getString(1); +// System.out.println(documentString); + + } + } + + @Nested + class WriteWithMirrorSubsets { + /** + * This test checks that the XMI is split as intended and distributed into database table columns as annotation modules. 
+ * @throws Exception + */ + @Test + public void testXmiDBWriterSplitAnnotations() throws Exception { + + AnalysisEngine xmiWriter = getXmiWriterForDocuments2(); + JCas jCas = getJCasWithRequiredTypes(); + prepareDocument1(jCas); + assertThatCode(() -> xmiWriter.process(jCas)).doesNotThrowAnyException(); + jCas.reset(); + prepareDocument2(jCas); + assertThatCode(() -> xmiWriter.process(jCas)).doesNotThrowAnyException(); + xmiWriter.collectionProcessComplete(); + + dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setActiveTableSchema("medline_2017"); + try (CoStoSysConnection ignored = dbc.obtainOrReserveConnection()) { + assertThat(dbc.tableExists("_data.documents2")).isTrue(); + + assertThat(dbc.getTableColumnNames("_data.documents2")).contains("de_julielab_jcore_types_token", "de_julielab_jcore_types_sentence"); + assertThat(dbc.isEmpty("_data.documents2", XmiSplitConstants.BASE_DOC_COLUMN)).isFalse(); + assertThat(dbc.isEmpty("_data.documents2", XmiDataInserter.FIELD_MAX_XMI_ID)).isFalse(); + assertThat(dbc.isEmpty("_data.documents2", "sofa_mapping")).isFalse(); + assertThat(dbc.isEmpty("_data.documents2", "de_julielab_jcore_types_token")).isFalse(); + assertThat(dbc.isEmpty("_data.documents2", "de_julielab_jcore_types_sentence")).isFalse(); + + } + + // create a subset for nested tests and set its only entry to "processed" + try (CoStoSysConnection ignored = dbc.obtainOrReserveConnection()) { + dbc.createSubsetTable("_data._data_mirror", "_data.documents2", 1, "Test subset", "medline_2017"); + dbc.initMirrorSubset("_data._data_mirror", "_data.documents2", true, "medline_2017"); + List idsList = new ArrayList<>(); + idsList.add(new byte[][]{"789".getBytes(StandardCharsets.UTF_8)}); + idsList.add(new byte[][]{"890".getBytes(StandardCharsets.UTF_8)}); + dbc.setProcessed("_data._data_mirror", idsList); + assertThat(dbc.status("_data._data_mirror", EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED)).isProcessed).isEqualTo(2); + } + } + + /** + * 
Produces the test XMI writer for this nested test group. It stores the base document which should cause + * mirror subsets to reset the "is processed" status to false for the written documents. + * @return The XMI writer for testing. + * @throws InvalidXMLException + * @throws IOException + * @throws ResourceInitializationException + */ + private AnalysisEngine getXmiWriterForDocuments2() throws InvalidXMLException, IOException, ResourceInitializationException { + return AnalysisEngineFactory.createEngine("de.julielab.jcore.consumer.xmi.desc.jcore-xmi-db-writer", + XMIDBWriter.PARAM_ANNOS_TO_STORE, new String[]{Token.class.getCanonicalName(), Sentence.class.getCanonicalName()}, + XMIDBWriter.PARAM_COSTOSYS_CONFIG, costosysConfig, + XMIDBWriter.PARAM_STORE_ALL, false, + XMIDBWriter.PARAM_STORE_BASE_DOCUMENT, true, + XMIDBWriter.PARAM_TABLE_DOCUMENT, "_data.documents2", + XMIDBWriter.PARAM_DO_GZIP, false, + XMIDBWriter.PARAM_STORE_RECURSIVELY, true, + XMIDBWriter.PARAM_UPDATE_MODE, true, + XMIDBWriter.PARAM_BASE_DOCUMENT_ANNOTATION_TYPES, new String[]{MeshHeading.class.getCanonicalName(), AbstractText.class.getCanonicalName(), Title.class.getCanonicalName(), de.julielab.jcore.types.pubmed.Header.class.getCanonicalName()} + ); + } + + /** + * Prepares the first of two documents used in these nested tests. + * @param jCas The CAS to populate with the test data. + */ + private void prepareDocument1(JCas jCas) { + final Header header = new Header(jCas); + header.setDocId("789"); + header.addToIndexes(); + jCas.setDocumentText("This is a sentence. This is another one."); + new Sentence(jCas, 0, 19).addToIndexes(); + new Sentence(jCas, 20, 40).addToIndexes(); + // Of course, these token offsets are wrong, but it doesn't matter to the test + new Token(jCas, 0, 19).addToIndexes(); + new Token(jCas, 20, 40).addToIndexes(); + } + + /** + * Prepares the second of two documents used in these nested tests. + * @param jCas The CAS to populate with the test data. 
+ */ + private void prepareDocument2(JCas jCas) { + final Header header2 = new Header(jCas); + header2.setDocId("890"); + header2.addToIndexes(); + jCas.setDocumentText("Sentence of document 2."); + new Sentence(jCas, 0, jCas.getDocumentText().length()).addToIndexes(); + } + + /** + * Default case: mirror subsets should be reset after writing the base document + */ + @Nested + class CheckMirrorSubsetIsReset { + @Test + public void testMirrorSubsetReset() throws Exception { + AnalysisEngine xmiWriter = getXmiWriterForDocuments2(); + JCas jCas = getJCasWithRequiredTypes(); + prepareDocument1(jCas); + assertThatCode(() -> xmiWriter.process(jCas)).doesNotThrowAnyException(); + jCas.reset(); + prepareDocument2(jCas); + assertThatCode(() -> xmiWriter.process(jCas)).doesNotThrowAnyException(); + xmiWriter.collectionProcessComplete(); + + dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setActiveTableSchema("medline_2017"); + + // check that the subset table has been reset + try (CoStoSysConnection ignored = dbc.obtainOrReserveConnection()) { + assertThat(dbc.status("_data._data_mirror", EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED)).isProcessed).isEqualTo(0); + assertThat(dbc.status("_data._data_mirror", EnumSet.of(DataBaseConnector.StatusElement.TOTAL)).total).isEqualTo(2); + // set it again to processed for the next test + List idsList = new ArrayList<>(); + idsList.add(new byte[][]{"789".getBytes(StandardCharsets.UTF_8)}); + idsList.add(new byte[][]{"890".getBytes(StandardCharsets.UTF_8)}); + dbc.setProcessed("_data._data_mirror", idsList); + assertThat(dbc.status("_data._data_mirror", EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED)).isProcessed).isEqualTo(2); + } + } + } + + /** + * The interesting test case: Given a DBProcessingMetaData annotation that specifies that the document + * text hasn't changed between a former document version in the database and the newly written version, + * the mirror subsets should not be reset to "is not 
processed" for the given document. + */ + @Nested + class CheckMirrorSubsetIsNotReset { + @Test + public void testMirrorSubsetNotReset() throws Exception { + // precondition check: the mirror subset is currently processed + // this main test will be to ensure that the mirror subset stays this way + try (CoStoSysConnection ignored = dbc.obtainOrReserveConnection()) { + assertThat(dbc.status("_data._data_mirror", EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED)).isProcessed).isEqualTo(2); + assertThat(dbc.status("_data._data_mirror", EnumSet.of(DataBaseConnector.StatusElement.TOTAL)).total).isEqualTo(2); + } + AnalysisEngine xmiWriter = getXmiWriterForDocuments2(); + JCas jCas = getJCasWithRequiredTypes(); + prepareDocument1(jCas); + // This is the important part: tell the writer not to reset mirror subsets for this document + DBProcessingMetaData processingMetaData = new DBProcessingMetaData(jCas); + processingMetaData.setIsDocumentHashUnchanged(true); + StringArray pk = new StringArray(jCas, 1); + pk.set(0, "789"); + processingMetaData.setPrimaryKey(pk); + processingMetaData.addToIndexes(); + assertThatCode(() -> xmiWriter.process(jCas)).doesNotThrowAnyException(); + jCas.reset(); + prepareDocument2(jCas); + assertThatCode(() -> xmiWriter.process(jCas)).doesNotThrowAnyException(); + xmiWriter.collectionProcessComplete(); + + dbc = DBTestUtils.getDataBaseConnector(postgres); + dbc.setActiveTableSchema("medline_2017"); + // check that the subset table has NOT been reset for document 789 but for the other + try (CoStoSysConnection ignored = dbc.obtainOrReserveConnection()) { + assertThat(dbc.status("_data._data_mirror", EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED)).isProcessed).isEqualTo(1); + assertThat(dbc.status("_data._data_mirror", EnumSet.of(DataBaseConnector.StatusElement.TOTAL)).total).isEqualTo(2); + } + } } } } diff --git a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java 
b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java index b429470c2..f3c3d7790 100644 --- a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java +++ b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java @@ -57,7 +57,7 @@ public class XMLDBMultiplier extends DBMultiplier { protected String[] rowMappingArray; @ConfigurationParameter(name = PARAM_MAPPING_FILE, description = XMLDBReader.DESC_MAPPING_FILE) protected String mappingFileStr; - @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, mandatory = false, description = "For use with AnnotationDefinedFlowController. Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS is directly routed to the components specified in the " + PARAM_TO_VISIT_KEYS + " parameter, skipping all other components. Note that this only works with AAEs where the first component is an 'AnnotationControlledFlow'.") + @ConfigurationParameter(name = PARAM_ADD_SHA_HASH, mandatory = false, description = "For use with AnnotationDefinedFlowController and XMIDBWriter. Possible values: document_text, defaults to 'document_text' and thus doesn't need to be specified manually at the moment. This parameter needs to match the value for the same parameter given to the XMIDBWriter in this pipeline. Then, a comparison between the existing hash in the database and the new hash of the CAS read in this pipeline can be made. In case the hashes match, the CAS is directly routed to the components specified in the " + PARAM_TO_VISIT_KEYS + " parameter, skipping all other components. 
Note that this only works with AAEs where the first component is an 'AnnotationControlledFlow'. Additionally, the DBProcessingMetaData#hasDocumentHashChanged is set. This can be used by the XMIDBWriter to omit the reset of mirror subsets when updating the base document when the actual CAS text stayed the same.") private String documentItemToHash; @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT, mandatory = false, description = "For use with AnnotationDefinedFlowController. String parameter indicating the name of the " + "table where the XMI data and, thus, the hash is stored. The name must be schema qualified. Note that in this component, only the ToVisit annotation is created that determines which components to apply to a CAS with matching (unchanged) hash. The logic to actually control the CAS flow is contained in the AnnotationDefinedFlowController.") @@ -148,6 +148,7 @@ private void setToVisitAnnotation(JCas jCas) { if (existingHash.equals(newHash)) { if (log.isTraceEnabled()) log.trace("Document {} has a document text hash that equals the one present in the database. Creating a ToVisit annotation routing it only to the components with delegate keys {}.", pkString, toVisitKeys); + dbProcessingMetaData.setIsDocumentHashUnchanged(true); ToVisit toVisit = new ToVisit(jCas); if (toVisitKeys != null && toVisitKeys.length != 0) { StringArray keysArray = new StringArray(jCas, toVisitKeys.length); From cf3f2e01b22e22a7c7eaa9e1405dc36e171b2a47 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 18 Feb 2022 16:19:36 +0100 Subject: [PATCH 147/269] Write GNormPlus BioC XML reader (#131). In progress; already written base classes and tests. 
--- jcore-gnp-bioc-reader/BioC.dtd | 158 ++++++++ jcore-gnp-bioc-reader/LICENSE | 26 ++ jcore-gnp-bioc-reader/README.md | 34 ++ jcore-gnp-bioc-reader/pom.xml | 58 +++ .../jcore/reader/BioCCasPopulator.java | 153 ++++++++ .../GNormPlusFormatMultiplierReader.java | 94 +++++ .../jcore/reader/MissingInfonException.java | 22 ++ .../de/julielab/jcore/reader/desc/PLACEHOLDER | 4 + .../reader/desc/jcore-bnp-bioc-reader.xml | 20 + .../jcore/reader/BioCCasPopulatorTest.java | 62 ++++ .../GNormPlusFormatMultiplierReaderTest.java | 69 ++++ .../test-input-path/bioc_collection_3.xml | 1 + .../subdir1/bioc_collection_0.xml | 2 + .../subdir1/bioc_collection_1.xml | 2 + .../subdir2/bioc_collection_2.xml | 2 + pom.xml | 342 ++++++++++++------ 16 files changed, 936 insertions(+), 113 deletions(-) create mode 100644 jcore-gnp-bioc-reader/BioC.dtd create mode 100644 jcore-gnp-bioc-reader/LICENSE create mode 100644 jcore-gnp-bioc-reader/README.md create mode 100644 jcore-gnp-bioc-reader/pom.xml create mode 100644 jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java create mode 100644 jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReader.java create mode 100644 jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/MissingInfonException.java create mode 100644 jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/PLACEHOLDER create mode 100644 jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-reader.xml create mode 100644 jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java create mode 100644 jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReaderTest.java create mode 100644 jcore-gnp-bioc-reader/src/test/resources/test-input-path/bioc_collection_3.xml create mode 100644 jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_0.xml create mode 100644 
jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_1.xml create mode 100644 jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir2/bioc_collection_2.xml diff --git a/jcore-gnp-bioc-reader/BioC.dtd b/jcore-gnp-bioc-reader/BioC.dtd new file mode 100644 index 000000000..8bd0d55ca --- /dev/null +++ b/jcore-gnp-bioc-reader/BioC.dtd @@ -0,0 +1,158 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/jcore-gnp-bioc-reader/LICENSE b/jcore-gnp-bioc-reader/LICENSE new file mode 100644 index 000000000..fbbd41e05 --- /dev/null +++ b/jcore-gnp-bioc-reader/LICENSE @@ -0,0 +1,26 @@ +BSD 2-Clause License + +Copyright (c) 2017, JULIE Lab +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ diff --git a/jcore-gnp-bioc-reader/README.md b/jcore-gnp-bioc-reader/README.md new file mode 100644 index 000000000..7947f772a --- /dev/null +++ b/jcore-gnp-bioc-reader/README.md @@ -0,0 +1,34 @@ +# JCoRe GNormPlus BioC Reader + +**Descriptor Path**: +``` +de.julielab.jcore.reader.desc.jcore-bnp-bioc-reader +``` + +A reader for the BioC format used by GNormPlus. Reads the text and the annotations, both species and genes. + + + +**1. Parameters** + +| Parameter Name | Parameter Type | Mandatory | Multivalued | Description | +|----------------|----------------|-----------|-------------|-------------| +| param1 | UIMA-Type | Boolean | Boolean | Description | +| param2 | UIMA-Type | Boolean | Boolean | Description | + +**2. Predefined Settings** + +| Parameter Name | Parameter Syntax | Example | +|----------------|------------------|---------| +| param1 | Syntax-Description | `Example` | +| param2 | Syntax-Description | `Example` | + +**3. Capabilities** + +| Type | Input | Output | +|------|:-----:|:------:| +| de.julielab.jcore.types.TYPE | | `+` | +| de.julielab.jcore.types.ace.TYPE | `+` | | + + +[1] Some Literature? diff --git a/jcore-gnp-bioc-reader/pom.xml b/jcore-gnp-bioc-reader/pom.xml new file mode 100644 index 000000000..86008eabd --- /dev/null +++ b/jcore-gnp-bioc-reader/pom.xml @@ -0,0 +1,58 @@ + + + + 4.0.0 + jcore-bnp-bioc-reader + jar + de.julielab + + + de.julielab + jcore-base + 2.6.0-SNAPSHOT + + + + + ch.qos.logback + logback-classic + test + + + org.slf4j + slf4j-api + + + de.julielab + jcore-descriptor-creator + + + com.pengyifan.bioc + pengyifan-bioc + 1.0.3 + + + de.julielab + jcore-types + ${jcore-types-version} + + + org.junit.jupiter + junit-jupiter-engine + + + org.assertj + assertj-core + + + JCoRe GNormPlus BioC Reader + + JULIE Lab Jena, Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-bnp-bioc-reader + A reader for the BioC format used by GNormPlus. 
Reads the text and the annotations, both species and + genes. + + diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java new file mode 100644 index 000000000..4af6d0342 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java @@ -0,0 +1,153 @@ +package de.julielab.jcore.reader; + +import com.pengyifan.bioc.*; +import com.pengyifan.bioc.io.BioCCollectionReader; +import de.julielab.jcore.types.*; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.xml.stream.XMLStreamException; +import java.io.IOException; +import java.nio.file.Path; +import java.util.Collection; +import java.util.Iterator; +import java.util.Optional; +import java.util.stream.Stream; + +/** + * Reads a BioC collection from file and adds the species and gene annotations from its documents to JCas instances. 
+ */ +public class BioCCasPopulator { + + private final static Logger log = LoggerFactory.getLogger(BioCCasPopulator.class); + private final BioCCollection bioCCollection; + private int pos; + + public BioCCasPopulator(Path biocCollectionPath) throws XMLStreamException, IOException { + try (BioCCollectionReader bioCCollectionReader = new BioCCollectionReader(biocCollectionPath)) { + bioCCollection = bioCCollectionReader.readCollection(); + } + pos = 0; + } + + public void populateWithNextDocument(JCas jCas) throws XMLStreamException, IOException { + BioCDocument document = bioCCollection.getDocument(pos++); + setDocumentText(jCas, document); + Iterator allAnnotations = Stream.concat(document.getAnnotations().stream(), document.getPassages().stream().map(BioCPassage::getAnnotations).flatMap(Collection::stream)).iterator(); + for (BioCAnnotation annotation : (Iterable)() ->allAnnotations) { + Optional type = annotation.getInfon("type"); + if (!type.isPresent()) + throw new IllegalArgumentException("BioCDocument " + document.getID() + " has an annotation that does not specify its type: " + annotation); + try { + switch (type.get()) { + case "Gene": + addGeneAnnotation(annotation, jCas); + break; + case "Species": + addSpeciesAnnotation(annotation, jCas); + break; + } + } catch (MissingInfonException e) { + throw new IllegalArgumentException("BioCDocument " + document.getID() + " has an annotation issue; see cause exception.", e); + } + } + } + + private void setDocumentText(JCas jCas, BioCDocument document) { + StringBuilder sb = new StringBuilder(); + // iterate over the passages and create the complete document text from their individual text elements + for (BioCPassage passage : document.getPassages()) { + int offset = passage.getOffset(); + // The offset of the passage must match its starting position in the StringBuilder or the annotation + // offsets won't match. We might need to fill up the StringBuilder to reach the given offset. 
+ while (sb.length() < offset) + sb.append(" "); + if (passage.getText().isPresent()) { + sb.append(passage.getText().get()); + Optional<String> type = passage.getInfon("type"); + if (type.isPresent()) { + int passageEnd = offset + passage.getText().get().length(); + Zone passageAnnotation; + // The values in this switch are basically determined by the values created in the BioCDocumentPopulator in the jcore-gnp-bioc-writer project. + switch (type.get()) { + case "title": + passageAnnotation = new Title(jCas, offset, passageEnd); + ((Title) passageAnnotation).setTitleType("document"); + break; + case "section_title": + passageAnnotation = new Title(jCas, offset, passageEnd); + ((Title) passageAnnotation).setTitleType("section"); + break; + case "figure_title": + passageAnnotation = new Title(jCas, offset, passageEnd); + ((Title) passageAnnotation).setTitleType("figure"); + break; + case "table_title": + passageAnnotation = new Title(jCas, offset, passageEnd); + ((Title) passageAnnotation).setTitleType("table"); + break; + case "abstract": + passageAnnotation = new AbstractText(jCas, offset, passageEnd); + break; + case "paragraph": + passageAnnotation = new Paragraph(jCas, offset, passageEnd); + break; + case "figure": + case "table": + // for figures and tables we have actually no means to distinguish between captions and the actual object; mainly because the actual objects have so far not been part of the CAS documents; thus, this can only be a caption until the objects themselves are added + passageAnnotation = new Caption(jCas, offset, passageEnd); + ((Caption) passageAnnotation).setCaptionType(type.get()); break; // stop here: falling through to 'default' would replace the Caption with a plain Zone + default: + log.debug("Unhandled passage type {}", type.get()); + passageAnnotation = new Zone(jCas, offset, passageEnd); + break; + } + passageAnnotation.addToIndexes(); + } + } + } + jCas.setDocumentText(sb.toString()); + } + + private void addSpeciesAnnotation(BioCAnnotation annotation, JCas jCas) throws MissingInfonException { + Optional<String> taxId =
annotation.getInfon("NCBI Taxonomy"); + if (!taxId.isPresent()) + throw new MissingInfonException("Species annotation does not specify its taxonomy ID: " + annotation); + // the "total location" is the span from the minimum location value to the maximum location value; + // for GNormPlus, there are no discontinuing annotations anyway + BioCLocation location = annotation.getTotalLocation(); + Organism organism = new Organism(jCas, location.getOffset(), location.getOffset() + location.getLength()); + ResourceEntry resourceEntry = new ResourceEntry(jCas, organism.getBegin(), organism.getEnd()); + resourceEntry.setSource("NCBI Taxonomy"); + resourceEntry.setComponentId(GNormPlusFormatMultiplierReader.class.getCanonicalName()); + resourceEntry.setEntryId(taxId.get()); + FSArray resourceEntryList = new FSArray(jCas, 1); + resourceEntryList.set(0, resourceEntry); + organism.setResourceEntryList(resourceEntryList); + organism.addToIndexes(); + } + + private void addGeneAnnotation(BioCAnnotation annotation, JCas jCas) throws MissingInfonException { + Optional<String> geneId = annotation.getInfon("NCBI Gene"); + if (!geneId.isPresent()) + throw new MissingInfonException("Gene annotation does not specify its gene ID: " + annotation); + // the "total location" is the span from the minimum location value to the maximum location value; + // for GNormPlus, there are no discontinuing annotations anyway + BioCLocation location = annotation.getTotalLocation(); + Gene gene = new Gene(jCas, location.getOffset(), location.getOffset() + location.getLength()); + ResourceEntry resourceEntry = new ResourceEntry(jCas, gene.getBegin(), gene.getEnd()); + resourceEntry.setSource("NCBI Gene"); + resourceEntry.setComponentId(GNormPlusFormatMultiplierReader.class.getCanonicalName()); + resourceEntry.setEntryId(geneId.get()); + FSArray resourceEntryList = new FSArray(jCas, 1); + resourceEntryList.set(0, resourceEntry); + gene.setResourceEntryList(resourceEntryList); + gene.addToIndexes(); + } + + public
int documentsLeftInCollection() { + return bioCCollection.getDocmentCount() - pos; + } +} diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReader.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReader.java new file mode 100644 index 000000000..dc04596e4 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReader.java @@ -0,0 +1,94 @@ +package de.julielab.jcore.reader; + +import de.julielab.jcore.types.casmultiplier.JCoReURI; +import org.apache.uima.UimaContext; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Progress; +import org.apache.uima.util.ProgressImpl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URI; +import java.nio.file.FileVisitOption; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Iterator; +import java.util.Optional; +import java.util.stream.Stream; + +@ResourceMetaData(name = "JCoRe GNormPlus Format Multiplier Reader", description = "A reader for the BioC XML format used by GNormPlus. Requires the matching multiplier.") +public class GNormPlusFormatMultiplierReader extends JCasCollectionReader_ImplBase { + + public static final String PARAM_INPUT_PATH = "InputPath"; + public static final String PARAM_RECURSIVE = "Recursive"; + public static final String PARAM_BATCH_SIZE = "BatchSize"; + private final static Logger log = LoggerFactory.getLogger(GNormPlusFormatMultiplierReader.class); + @ConfigurationParameter(name = PARAM_INPUT_PATH, description = "Path to a directory or file to be read. 
In case of a directory, all files ending in .xml will be read.") + private String inputPathString; + @ConfigurationParameter(name = PARAM_RECURSIVE, mandatory = false, defaultValue = "true", description = "Whether to read also the subdirectories of the input directory, if the input path points to a directory.") + private boolean recursive; + @ConfigurationParameter(name = PARAM_BATCH_SIZE, mandatory = false, defaultValue = "20", description = "The number of XML file URI references to send to the CAS multipliers in each work assignment. Defaults to 20.") + private int batchSize; + private Iterator fileIterator; + private int completed; + + + /** + * This method is called a single time by the framework at component + * creation. Here, descriptor parameters are read and initial setup is done. + */ + @Override + public void initialize(UimaContext context) throws ResourceInitializationException { + super.initialize(context); + inputPathString = (String) context.getConfigParameterValue(PARAM_INPUT_PATH); + recursive = Optional.of((boolean) context.getConfigParameterValue(PARAM_RECURSIVE)).orElse(true); + try { + Path inputPath = Path.of(inputPathString); + Stream pathStream; + if (recursive) + pathStream = Files.walk(inputPath, FileVisitOption.FOLLOW_LINKS); + else + pathStream = Files.list(inputPath); + pathStream = pathStream.filter(p -> p.toString().endsWith(".xml")); + fileIterator = pathStream.iterator(); + } catch (IOException e) { + log.error("Could not read the files of inputPath {}", inputPathString, e); + throw new ResourceInitializationException(e); + } + completed = 0; + } + + @Override + public void getNext(JCas jCas) throws CollectionException { + for (int i = 0; i < batchSize && fileIterator.hasNext(); i++) { + URI uri = fileIterator.next().toUri(); + try { + JCoReURI fileType = new JCoReURI(jCas); + fileType.setUri(uri.toString()); + fileType.addToIndexes(); + } catch (Exception e) { + log.error("Exception with URI: " + uri, e); + throw new 
CollectionException(e); + } + completed++; + } + } + + + @Override + public Progress[] getProgress() { + return new Progress[]{new ProgressImpl(completed, -1, "documents")}; + } + + @Override + public boolean hasNext() { + return fileIterator.hasNext(); + } + +} diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/MissingInfonException.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/MissingInfonException.java new file mode 100644 index 000000000..59277495c --- /dev/null +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/MissingInfonException.java @@ -0,0 +1,22 @@ +package de.julielab.jcore.reader; + +public class MissingInfonException extends Exception { + public MissingInfonException() { + } + + public MissingInfonException(String message) { + super(message); + } + + public MissingInfonException(String message, Throwable cause) { + super(message, cause); + } + + public MissingInfonException(Throwable cause) { + super(cause); + } + + public MissingInfonException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/PLACEHOLDER b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/PLACEHOLDER new file mode 100644 index 000000000..e4b0b196a --- /dev/null +++ b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/PLACEHOLDER @@ -0,0 +1,4 @@ +The actual descriptor must be created by UIMA fit. +For this purpose, use UIMAfit annotations to annotate the reader component class. +Then employ the jcore-descriptor-creator's main method to build the descriptor from the reader class. +The jcore-descriptor-creator is already on the classpath as a Maven dependency. 
diff --git a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-reader.xml b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-reader.xml new file mode 100644 index 000000000..9ce0d444f --- /dev/null +++ b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-reader.xml @@ -0,0 +1,20 @@ + + + org.apache.uima.java + GNormPlusFormatMultiplierReader + + JCoRe GNormPlus BioC Reader + This is only a placeholder descriptor. Please use UIMAfit to annotate the component parameters. Then employ the jcore-descriptor-creator's main method to build the descriptor from the reader class GNormPlusFormatMultiplierReader. The jcore-descriptor-creator is already on the classpath as a Maven dependency. + 2.3.0-SNAPSHOT + JULIE Lab Jena, Germany + + + + + + true + true + true + + + diff --git a/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java new file mode 100644 index 000000000..dddbb8704 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java @@ -0,0 +1,62 @@ +package de.julielab.jcore.reader; + +import de.julielab.jcore.types.*; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.assertj.core.api.Condition; +import org.junit.jupiter.api.Test; + +import java.nio.file.Path; +import java.util.Collection; + +import static org.assertj.core.api.Assertions.assertThat; + +class BioCCasPopulatorTest { + + private JCas getJCas() throws Exception { + return JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-semantics-biology-types"); + } + + @Test + public void populateWithNextDocument() throws Exception { + BioCCasPopulator bioCCasPopulator = new 
BioCCasPopulator(Path.of("src", "test", "resources", "test-input-path", "bioc_collection_3.xml")); + assertThat(bioCCasPopulator.documentsLeftInCollection()).isEqualTo(2); + JCas jCas = getJCas(); + bioCCasPopulator.populateWithNextDocument(jCas); + + assertThat(jCas.getDocumentText()).startsWith("Langerin").endsWith("antigen-processing pathway."); + Title title = JCasUtil.selectSingle(jCas, Title.class); + assertThat(title).extracting(Title::getTitleType).isEqualTo("document"); + assertThat(title).extracting(Title::getCoveredText).isEqualTo("Langerin, a novel C-type lectin specific to Langerhans cells, is an endocytic receptor that induces the formation of Birbeck granules."); + AbstractText abstractText = JCasUtil.selectSingle(jCas, AbstractText.class); + assertThat(abstractText).extracting(AbstractText::getCoveredText).is(new Condition<>(s -> s.startsWith("We have identified"), "Abstract has an unexpected beginning")); + // this document does not have organisms, we check those for the second document in the collection below + Collection<Gene> genes = JCasUtil.select(jCas, Gene.class); + assertThat(genes).hasSize(7); + for (Gene o : genes) { + assertThat(o.getResourceEntryList()).isNotNull(); + assertThat(o.getResourceEntryList()).hasSize(1); + assertThat(o.getResourceEntryList(0)).extracting(ResourceEntry::getComponentId).isEqualTo(GNormPlusFormatMultiplierReader.class.getCanonicalName()); + assertThat(o.getResourceEntryList(0)).extracting(ResourceEntry::getSource).isEqualTo("NCBI Gene"); + assertThat(o.getResourceEntryList(0)).extracting(ResourceEntry::getEntryId).isNotNull(); + } + assertThat(genes).extracting(Gene::getCoveredText).contains("Langerin"); + + assertThat(bioCCasPopulator.documentsLeftInCollection()).isEqualTo(1); + jCas.reset(); + bioCCasPopulator.populateWithNextDocument(jCas); + assertThat(jCas.getDocumentText()).startsWith("BCAR1, a human homologue"); + + Collection<Organism> organisms = JCasUtil.select(jCas, Organism.class); +
assertThat(organisms).isNotEmpty(); + for (Organism o : organisms) { + assertThat(o.getResourceEntryList()).isNotNull(); + assertThat(o.getResourceEntryList()).hasSize(1); + assertThat(o.getResourceEntryList(0)).extracting(ResourceEntry::getComponentId).isEqualTo(GNormPlusFormatMultiplierReader.class.getCanonicalName()); + assertThat(o.getResourceEntryList(0)).extracting(ResourceEntry::getSource).isEqualTo("NCBI Taxonomy"); + assertThat(o.getResourceEntryList(0)).extracting(ResourceEntry::getEntryId).isNotNull(); + } + assertThat(organisms).extracting(Organism::getCoveredText).contains("human", "patients", "rat", "retrovirus", "ZR-75-1"); + } +} \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReaderTest.java b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReaderTest.java new file mode 100644 index 000000000..b2ad2190e --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReaderTest.java @@ -0,0 +1,69 @@ + +package de.julielab.jcore.reader; + + +import de.julielab.jcore.types.casmultiplier.JCoReURI; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.factory.CollectionReaderFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.Test; + +import java.nio.file.Path; +import java.util.Collection; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Unit tests for jcore-bnp-bioc-reader. 
+ * @author + * + */ +public class GNormPlusFormatMultiplierReaderTest{ + + private JCas getCas() throws Exception { + return JCasFactory.createJCas("de.julielab.jcore.types.casmultiplier.jcore-uri-multiplier-types"); + } + @Test + public void testReader() throws Exception { + CollectionReader reader = CollectionReaderFactory.createReader(GNormPlusFormatMultiplierReader.class, GNormPlusFormatMultiplierReader.PARAM_INPUT_PATH, Path.of("src", "test", "resources", "test-input-path").toString()); + assertThat(reader.hasNext()).isTrue(); + JCas jCas = getCas(); + reader.getNext(jCas.getCas()); + Collection<JCoReURI> uris = JCasUtil.select(jCas, JCoReURI.class); + assertThat(uris).extracting(JCoReURI::getUri).map(Path::of).map(Path::getFileName).map(Path::toString).containsExactlyInAnyOrder("bioc_collection_2.xml", "bioc_collection_3.xml", "bioc_collection_0.xml", "bioc_collection_1.xml"); + assertThat(reader.hasNext()).isFalse(); + } + + @Test + public void testReader2() throws Exception { + // check that the non-recursive mode also works + CollectionReader reader = CollectionReaderFactory.createReader(GNormPlusFormatMultiplierReader.class, GNormPlusFormatMultiplierReader.PARAM_INPUT_PATH, Path.of("src", "test", "resources", "test-input-path").toString(), GNormPlusFormatMultiplierReader.PARAM_RECURSIVE, false); + assertThat(reader.hasNext()).isTrue(); // a bare assertThat(...) verifies nothing without a terminal assertion + JCas jCas = getCas(); + reader.getNext(jCas.getCas()); + Collection<JCoReURI> uris = JCasUtil.select(jCas, JCoReURI.class); + assertThat(uris).extracting(JCoReURI::getUri).map(Path::of).map(Path::getFileName).map(Path::toString).containsExactlyInAnyOrder("bioc_collection_3.xml"); + assertThat(reader.hasNext()).isFalse(); + } + + @Test + public void testReader3() throws Exception { + // check that the batch size parameter works as intended + CollectionReader reader = CollectionReaderFactory.createReader(GNormPlusFormatMultiplierReader.class, GNormPlusFormatMultiplierReader.PARAM_INPUT_PATH, Path.of("src", "test", "resources",
"test-input-path").toString(), GNormPlusFormatMultiplierReader.PARAM_BATCH_SIZE, 2); + assertThat(reader.hasNext()).isTrue(); + JCas jCas = getCas(); + reader.getNext(jCas.getCas()); + Collection uris = JCasUtil.select(jCas, JCoReURI.class); + assertThat(uris).hasSize(2); + assertThat(reader.hasNext()).isTrue(); + jCas.reset(); + // there should another batch available + reader.getNext(jCas.getCas()); + Collection uris2 = JCasUtil.select(jCas, JCoReURI.class); + assertThat(uris2).hasSize(2); + // now the reader should be exhausted + assertThat(reader.hasNext()).isFalse(); + } +} diff --git a/jcore-gnp-bioc-reader/src/test/resources/test-input-path/bioc_collection_3.xml b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/bioc_collection_3.xml new file mode 100644 index 000000000..a874a1823 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/bioc_collection_3.xml @@ -0,0 +1 @@ +JCoRe GNormPlus BioC WriterFri Feb 18 13:55:36 CET 2022PubTator.key10661407title0Langerin, a novel C-type lectin specific to Langerhans cells, is an endocytic receptor that induces the formation of Birbeck granules.50489GeneLangerinabstract135We have identified a type II Ca2+-dependent lectin displaying mannose-binding specificity, exclusively expressed by Langerhans cells (LC), and named Langerin. LC are uniquely characterized by Birbeck granules (BG), which are organelles consisting of superimposed and zippered membranes. Here, we have shown that Langerin is constitutively associated with BG and that antibody to Langerin is internalized into these structures. Remarkably, transfection of Langerin cDNA into fibroblasts created a compact network of membrane structures with typical features of BG. Langerin is thus a potent inducer of membrane superimposition and zippering leading to BG formation. 
Our data suggest that induction of BG is a consequence of the antigen-capture function of Langerin, allowing routing into these organelles and providing access to a nonclassical antigen-processing pathway.50489GeneLangerin50489GeneLangerin50489GeneLangerin50489GeneLangerin50489GeneLangerin50489GeneLangerin10639512title0BCAR1, a human homologue of the adapter protein p130Cas, and antiestrogen resistance in breast cancer cells.9564GeneBCAR19564Genep130Cas9606Specieshumanabstract109Treatment of breast cancer with the antiestrogen tamoxifen is effective in approximately one half of the patients with estrogen receptor-positive disease, but tumors recur frequently because of the development of metastases that are resistant to tamoxifen. We have previously shown that mutagenesis of human estrogen-dependent ZR-75-1 breast cancer cells by insertion of a defective retrovirus genome caused the cells to become antiestrogen resistant. In this study, we isolated and characterized the crucial gene at the breast cancer antiestrogen resistance 1 (BCAR1) locus. Transfer of the BCAR1 locus from retrovirus-mutated, antiestrogen-resistant cells to estrogen-dependent ZR-75-1 cells by cell fusion conferred an antiestrogen-resistant phenotype on the recipient cells. The complete coding sequence of BCAR1 was isolated by use of exon-trapping and complementary DNA (cDNA) library screening. Sequence analysis of human BCAR1 cDNA predicted a protein of 870 amino acids that was strongly homologous to rat p130Cas-adapter protein. Genomic analysis revealed that BCAR1 consists of seven exons and is located at chromosome 16q23.1. BCAR1 transcripts were detected in multiple human tissues and were similar in size to transcripts produced by retrovirus-mutated ZR-75-1 cells. Transfection of BCAR1 cDNA into ZR-75-1 cells again resulted in sustained cell proliferation in the presence of antiestrogens, confirming that BCAR1 was the responsible gene in the locus. 
Overexpression of the BCAR1 gene confers antiestrogen resistance on human ZR-75-1 breast cancer cells. Overexpression of BCAR1 in retrovirus-mutated cells appears to result from activation of the gene's promoter. The isolation and characterization of this gene open new avenues to elucidating mechanisms by which the growth of human breast cancer becomes independent of estrogen.9564Genebreast cancer antiestrogen resistance 19564GeneBCAR19564GeneBCAR19564GeneBCAR19564GeneBCAR125414Genep130Cas-adapter protein9564GeneBCAR19564GeneBCAR19564GeneBCAR19564GeneBCAR19564GeneBCAR19564GeneBCAR19606Speciespatients9606Specieshuman31931Speciesretrovirus9606Specieshuman10116Speciesrat9606Specieshuman31931Speciesretrovirus9606Specieshuman31931Speciesretrovirus9606Specieshuman9606SpeciesZR-75-19606SpeciesZR-75-19606SpeciesZR-75-19606SpeciesZR-75-19606SpeciesZR-75-1 \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_0.xml b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_0.xml new file mode 100644 index 000000000..a2f9b537c --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_0.xml @@ -0,0 +1,2 @@ +JCoRe GNormPlus BioC WriterFri Feb 18 13:55:36 CET 2022PubTator.key1378843title0Cloning and expression of a cell surface receptor for advanced glycosylation end products of proteins.abstract103Advanced glycosylation end products of proteins (AGEs) are nonenzymatically glycosylated proteins which accumulate in vascular tissue in aging and at an accelerated rate in diabetes. A approximately 35-kDa polypeptide with a unique NH2-terminal sequence has been isolated from bovine lung and found to be present on the surface of endothelial cells where it mediates the binding of AGEs (receptor for advanced glycosylation end product or RAGE). 
Using an oligonucleotide probe based on the amino-terminal sequence of RAGE, an apparently full-length cDNA of 1.5 kilobases was isolated from a bovine lung cDNA library. This cDNA encoded a 394 amino acid mature protein comprised of the following putative domains: an extracellular domain of 332 amino acids, a single hydrophobic membrane spanning domain of 19 amino acids, and a carboxyl-terminal domain of 43 amino acids. A partial clone encoding the human counterpart of RAGE, isolated from a human lung library, was found to be approximately 90% homologous to the bovine molecule. Based on computer analysis of the amino acid sequence of RAGE and comparison with databases, RAGE is a new member of the immunoglobulin superfamily of cell surface molecules and shares significant homology with MUC 18, NCAM, and the cytoplasmic domain of CD20. Expression of the RAGE cDNA in 293 cells allowed them to bind 125I-AGE-albumin in a saturable and dose-dependent manner (Kd approximately 100 nM), blocked by antibody to RAGE. Western blots of 293 cells transfected with RAGE cDNA probed with anti-RAGE IgG demonstrated expression of immunoreactive protein compared to its absence in mock-transfected cells. These results suggest that RAGE functions as a cell surface receptor for AGEs, which could potentially mediate cellular effects of this class of glycosylated proteins.280986GeneRAGE280986GeneRAGE177GeneRAGE280986GeneRAGE280986GeneRAGE505653GeneCD20280986GeneRAGE280986GeneRAGE280986GeneRAGE280986GeneRAGE280986GeneRAGE9913Speciesbovine9913Speciesbovine9606Specieshuman9606Specieshuman9913Speciesbovine10896916title0Alpha(2) adrenoceptors regulate proliferation of human intestinal epithelial cells.150GeneAlpha(2) adrenoceptors9606Specieshumanabstract84Previous studies on rodents have suggested that catecholamines stimulate proliferation of the intestinal epithelium through activation of alpha(2) adrenoceptors located on crypt cells. 
The occurrence of this effect awaits demonstration in humans and the molecular mechanisms involved have not yet been elucidated. Here, we examined the effect of alpha(2) agonists on a clone of Caco2 cells expressing the human alpha(2A) adrenoceptor. Cells were transfected with a bicistronic plasmid containing the alpha2C10 and neomycin phosphotransferase genes. G418 resistant clones were assayed for receptor expression using radioligand binding. Receptor functionality was assessed by testing its ability to couple Gi proteins and to inhibit cAMP production. Mitogen activated protein kinase (MAPK) phosphorylation was followed by western blot, and cell proliferation was estimated by measuring protein and DNA content. Permanent transfection of Caco2 cells allowed us to obtain a clone (Caco2-3B) expressing alpha(2A) adrenoceptors at a density similar to that found in normal human intestinal epithelium. Caco2-3B retained morphological features and brush border enzyme expression characteristic of enterocytic differentiation. The receptor was coupled to Gi2/Gi3 proteins and its stimulation caused marked diminution of forskolin induced cAMP production. Treatment of Caco2-3B with UK14304 (alpha(2) agonist) induced a rapid increase in the phosphorylation state of MAPK, extracellular regulated protein kinase 1 (Erk1), and 2 (Erk2). This event was totally abolished in pertussis toxin treated cells and in the presence of kinase inhibitors (genistein or PD98059). It was unaffected by protein kinase C downregulation but correlated with a transient increase in Shc tyrosine phosphorylation. Finally, sustained exposure of Caco2-3B to UK14304 resulted in modest but significant acceleration of cell proliferation. None of these effects was observed in the parental cell line Caco2. 
The results obtained in the present study support a regulatory role for alpha(2) adrenoceptors in intestinal cell proliferation.150Genealpha(2) adrenoceptors150Genealpha(2A) adrenoceptor150Genealpha2C105595;5594;5595GeneMAPK5595;5594;5595GeneMAPK5595Geneextracellular regulated protein kinase 15595GeneErk15594GeneErk26464GeneShc150Genealpha(2) adrenoceptors9606Specieshumans9606Specieshuman9606Specieshuman9606SpeciesCaco29606SpeciesCaco2 \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_1.xml b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_1.xml new file mode 100644 index 000000000..b2144e781 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_1.xml @@ -0,0 +1,2 @@ +JCoRe GNormPlus BioC WriterFri Feb 18 13:55:36 CET 2022PubTator.key10880510title0Human TREK2, a 2P domain mechano-sensitive K+ channel with multiple regulations by polyunsaturated fatty acids, lysophospholipids, and Gs, Gi, and Gq protein-coupled receptors.54207GeneTREK29606SpeciesHumanabstract177Mechano-sensitive and fatty acid-activated K(+) belong to the structural class of K(+) channel with two pore domains. Here, we report the isolation and the characterization of a novel member of this family. This channel, called TREK2, is closely related to TREK1 (78% of homology). Its gene is located on chromosome 14q31. TREK2 is abundantly expressed in pancreas and kidney and to a lower level in brain, testis, colon, and small intestine. In the central nervous system, TREK2 has a widespread distribution with the highest levels of expression in cerebellum, occipital lobe, putamen, and thalamus. In transfected cells, TREK2 produces rapidly activating and non-inactivating outward rectifier K(+) currents. The single-channel conductance is 100 picosiemens at +40 mV in 150 mm K(+). 
The currents can be strongly stimulated by polyunsaturated fatty acid such as arachidonic, docosahexaenoic, and linoleic acids and by lysophosphatidylcholine. The channel is also activated by acidification of the intracellular medium. TREK2 is blocked by application of intracellular cAMP. As with TREK1, TREK2 is activated by the volatile general anesthetics chloroform, halothane, and isoflurane and by the neuroprotective agent riluzole. TREK2 can be positively or negatively regulated by a variety of neurotransmitter receptors. Stimulation of the G(s)-coupled receptor 5HT4sR or the G(q)-coupled receptor mGluR1 inhibits channel activity, whereas activation of the G(i)-coupled receptor mGluR2 increases TREK2 currents. These multiple types of regulations suggest that TREK2 plays an important role as a target of neurotransmitter action.54207GeneTREK23776GeneTREK154207GeneTREK254207GeneTREK254207GeneTREK254207GeneTREK23776GeneTREK154207GeneTREK254207GeneTREK23360Gene5HT4sR2911GenemGluR114800GenemGluR254207GeneTREK254207GeneTREK210803599title0Enhanced growth of MCF-7 breast cancer cells overexpressing parathyroid hormone-related peptide.5744Geneparathyroid hormone-related peptide9606SpeciesMCF-7abstract97PTH-related peptide (PTHrP) is a secreted protein produced by breast cancer cells both in vivo and in vitro. Because of its structural similarity to PTH at the amino terminus, the two proteins interact with a common cell surface receptor, the PTH/PTHrP receptor. When overproduced by tumor cells, PTHrP enters the circulation, giving rise to the common paraneoplastic syndrome of humoral hypercalcemia of malignancy. Although initially discovered in malignancies, PTHrP is now known to be produced by most cells and tissues in the body. It acts as an autocrine and paracrine mediator of cell proliferation and differentiation, effects which are mediated via the PTH/PTHrP receptor. 
Recent evidence also has shown that, directly after translation, PTHrP is able to enter the nucleus and/or nucleolus and influence cell cycle progression and apoptosis. In this study, we have either overproduced PTHrP or inhibited endogenous PTHrP production in the breast cancer cell line, MCF-7. Overexpression of PTHrP was associated with an increase in mitogenesis, whereas inhibiting endogenous PTHrP production resulted in decreased cell proliferation. The overexpressed peptide targeted to the perinuclear space. In contrast, PTHrP interaction with the cell surface PTH/PTHrP receptor resulted in decreased cell proliferation in the same cell line. This latter effect is dependent on interaction with the receptor, in that exogenously added PTHrP moieties known not to interact with the receptor had no effect on cell growth. Furthermore, neutralization of added peptide with an anti-PTHrP antiserum completely abolished the growth inhibitory effects. In contrast, this antibody has no effect on the increased proliferation rate of the MCF-7 transfectants that overexpress PTHrP, compared with control cells. The net effect of autocrine/paracrine and intracrine effects of PTHrP in MCF-7 cells overproducing the peptide is accelerated cell growth. 
These findings have critical implications regarding the role of PTHrP in breast cancer, and they suggest that controlling PTHrP production in breast cancer may be useful therapeutically.5744GenePTH-related peptide5744GenePTHrP5741GenePTH5745GenePTH/PTHrP receptor5744GenePTHrP5744GenePTHrP5745GenePTH/PTHrP receptor5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5745GenePTH/PTHrP receptor5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP9606SpeciesMCF-7 \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir2/bioc_collection_2.xml b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir2/bioc_collection_2.xml new file mode 100644 index 000000000..2ed9fa4f7 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir2/bioc_collection_2.xml @@ -0,0 +1,2 @@ +JCoRe GNormPlus BioC WriterFri Feb 18 13:55:36 CET 2022PubTator.key10722742title0Mdm2 is a RING finger-dependent ubiquitin protein ligase for itself and p53.4193GeneMdm27157Genep53abstract77Mdm2 has been shown to regulate p53 stability by targeting the p53 protein for proteasomal degradation. We now report that Mdm2 is a ubiquitin protein ligase (E3) for p53 and that its activity is dependent on its RING finger. Furthermore, we show that Mdm2 mediates its own ubiquitination in a RING finger-dependent manner, which requires no eukaryotic proteins other than ubiquitin-activating enzyme (E1) and an ubiquitin-conjugating enzyme (E2). It is apparent, therefore, that Mdm2 manifests an intrinsic capacity to mediate ubiquitination. Mutation of putative zinc coordination residues abrogated this activity, as did chelation of divalent cations. After cation chelation, the full activity could be restored by addition of zinc. 
We further demonstrate that the degradation of p53 and Mdm2 in cells requires additional potential zinc-coordinating residues beyond those required for the intrinsic activity of Mdm2 in vitro. Replacement of the Mdm2 RING with that of another protein (Praja1) reconstituted ubiquitination and proteasomal degradation of Mdm2. However, this RING was ineffective in ubiquitination and proteasomal targeting of p53, suggesting that there may be specificity at the level of the RING in the recognition of heterologous substrates.4193GeneMdm27157Genep537157Genep534193GeneMdm27157Genep534193GeneMdm27318Geneubiquitin-activating enzyme (E1)4193GeneMdm27157Genep534193GeneMdm24193GeneMdm24193GeneMdm264219GenePraja14193GeneMdm27157Genep531770008title0Structural analysis and expression of human desmoglein: a cadherin-like component of the desmosome.1828;281131Genedesmoglein1000Genecadherin9606Specieshumanabstract100Desmosomes are adhesive cell junctions found in great abundance in tissues that experience mechanical stress. The transmembrane desmosomal glycoproteins have been proposed to play a role in cell adhesion; desmoglein I (DGI) is a major member of this class of desmosomal molecules. However, evidence supporting a role for DGI in cell adhesion or in the plaque is lacking. In order to begin to understand DGI function we have identified human cDNA clones encoding the entire mature polypeptide of 1000 amino acids. Our data suggest that like the bovine DGI molecule human DGI is highly related to the calcium-dependent class of cell adhesion molecules known as cadherins. Four related extracellular domains located in the amino-terminal domain of the molecule contain putative calcium binding sites originally identified in the cadherins. The highest degree of similarity between human N-cadherin and human DGI, and likewise between bovine DGI and human DGI, is greatest in the most amino-terminal extracellular domain. 
This suggests a conserved functional role for the extracellular domains, perhaps in calcium-mediated cell adhesion. The cytoplasmic portion of the molecule contains a cadherin-like region and, like bovine DGI, a carboxy-terminal tail that is not present in the cadherins, comprising three additional domains. One of these contains a novel repeating motif of 29 +/- 1 residues, first identified in bovine DGI. Each of the highly homologous repeating units is likely to consist of two beta-strands and two turns with special characteristics. Five amino acids that are identical in bovine and human DGI lie in the second of the two predicted beta-strands, and intriguingly contain putative target sites for protein kinase C. On the basis of structural analysis, a model predicting the disposition of human DGI domains in the desmosome is proposed. Northern analysis suggests that unlike bovine epidermis, which expresses a single mRNA of reported size approximately 7.6 kb, human foreskin and cultured keratinocytes display a complex pattern with bands of approximately 7.2, 4.0 and 3.0 kb. Each of these cross-hybridizing mRNAs is coordinately expressed in normal human keratinocytes in response to long-term culture and increased calcium.1828Genedesmoglein I1828GeneDGI1828GeneDGI1828GeneDGI281131GeneDGI1828GeneDGI1000GeneN-cadherin1828GeneDGI281131GeneDGI1828GeneDGI281131GeneDGI281131GeneDGI1828GeneDGI9606Specieshuman9913Speciesbovine9606Specieshuman9606Specieshuman9606Specieshuman9913Speciesbovine9606Specieshuman9913Speciesbovine9913Speciesbovine9913Speciesbovine9606Specieshuman9606Specieshuman9913Speciesbovine9606Specieshuman9606Specieshuman1828GeneDGI \ No newline at end of file diff --git a/pom.xml b/pom.xml index 662cf49ef..8f0ad13d9 100644 --- a/pom.xml +++ b/pom.xml @@ -1,435 +1,551 @@ - + + 4.0.0 - + + - + + de.julielab - + + jcore-parent - + + 2.5.2-SNAPSHOT - + + - + + jcore-base - + + pom - + + JCoRe Base - + + The POM for the JCoRe Base projects. 
- + + 2.6.0-SNAPSHOT - + + - + + JULIE Lab, Germany - + + http://www.julielab.de - + + - + + - + + - + + BSD-2-Clause - + + https://opensource.org/licenses/BSD-2-Clause - + + - + + - + + https://github.com/JULIELab/jcore-base - + + - + + - + + org.apache.uima - + + uimaj-core - + + ${uima-version} - + + - + + - + + org.apache.uima - + + uimafit-core - + + ${uimafit-version} - + + - + + - + + - + + jcore-annotation-adder-ae - + + jcore-ace-reader - + + jcore-acronym-ae - + + jcore-acronym-writer - + + jcore-banner-ae - + + jcore-bc2gm-reader - + + jcore-bc2gmformat-writer - + + jcore-biolemmatizer-ae - + + jcore-bionlpformat-consumer - + + jcore-bionlpformat-reader - + + jcore-biosem-ae - + + jcore-conll-consumer - + + jcore-coordination-baseline-ae - + + jcore-cord19-reader - + + jcore-coreference-writer - + + jcore-ct-reader - + + jcore-db-checkpoint-ae - + + jcore-descriptor-creator - + + jcore-dta-reader - + + jcore-ec-code-ae - + + jcore-elasticsearch-consumer - + + jcore-embedding-writer - + + jcore-event-flattener-ae - + + jcore-feature-value-replacement-ae - + + jcore-file-reader - + + jcore-flair-ner-ae - + + jcore-flair-token-embedding-ae - + + jcore-flow-controllers - + + jcore-gnp-bioc-writer + jcore-iexml-consumer - + + jcore-iexml-reader - + + jcore-ign-reader - + + jcore-iob-consumer - + + jcore-jnet-ae - + + jcore-jpos-ae - + + jcore-jsbd-ae - + + jcore-jtbd-ae - + + jcore-julielab-entity-evaluator-consumer - + + jcore-likelihood-assignment-ae - + + jcore-likelihood-detection-ae - + + jcore-line-multiplier - + + jcore-lingpipegazetteer-ae - + + jcore-lingpipe-porterstemmer-ae - + + jcore-lingscope-ae - + + jcore-linnaeus-species-ae - + + jcore-mantra-xml-types - + + jcore-medxn-ae - + + jcore-msdoc-reader - + + jcore-mstparser-ae - + + jcore-muc7-reader - + + jcore-mutationfinder-ae - + + jcore-neo4j-relations-consumer - + + jcore-opennlp-chunk-ae - + + jcore-opennlp-parser-ae - + + jcore-opennlp-postag-ae - + + jcore-opennlp-sentence-ae - + + 
jcore-opennlp-token-ae - + + jcore-ppd-writer - + + jcore-pmc-reader - + + jcore-pubtator-reader - + + jcore-stanford-lemmatizer-ae - + + jcore-topic-indexing-ae - + + jcore-topics-writer - + + jcore-txt-consumer - + + jcore-types - + + jcore-utilities - + + jcore-xml-mapper - + + jcore-xml-reader - + + jcore-xmi-reader - + + jcore-xmi-writer - + + jedis-parent - + + jcore-jedis-integration-tests - + + + jcore-bnp-bioc-reader + - + + - + + scm:git:https://github.com/JULIELab/jcore-base - + + scm:git:https://github.com/JULIELab/jcore-base - + + scm:git:https://github.com/JULIELab/jcore-base - + + - + + From 618c10305fe239b173592d2984157a31a7ece5f2 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 18 Feb 2022 16:49:05 +0100 Subject: [PATCH 148/269] Write the multiplier and add a test. Tests works, everything looks good. --- jcore-gnp-bioc-reader/pom.xml | 6 ++ .../jcore/reader/BioCCasPopulator.java | 7 ++ .../reader/GNormPlusFormatMultiplier.java | 65 +++++++++++++++++++ .../de/julielab/jcore/reader/desc/PLACEHOLDER | 4 -- .../reader/desc/jcore-bnp-bioc-reader.xml | 20 ------ .../jcore/reader/BioCCasPopulatorTest.java | 2 +- .../reader/GNormPlusFormatMultiplierTest.java | 43 ++++++++++++ 7 files changed, 122 insertions(+), 25 deletions(-) create mode 100644 jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java delete mode 100644 jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/PLACEHOLDER delete mode 100644 jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-reader.xml create mode 100644 jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierTest.java diff --git a/jcore-gnp-bioc-reader/pom.xml b/jcore-gnp-bioc-reader/pom.xml index 86008eabd..4ca0c48b5 100644 --- a/jcore-gnp-bioc-reader/pom.xml +++ b/jcore-gnp-bioc-reader/pom.xml @@ -45,6 +45,12 @@ org.assertj assertj-core + + de.julielab + jcore-utilities + ${jcore-utilities-version} + test + 
JCoRe GNormPlus BioC Reader diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java index 4af6d0342..bfd4474e0 100644 --- a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java @@ -34,6 +34,7 @@ public BioCCasPopulator(Path biocCollectionPath) throws XMLStreamException, IOEx public void populateWithNextDocument(JCas jCas) throws XMLStreamException, IOException { BioCDocument document = bioCCollection.getDocument(pos++); + setDocumentId(jCas, document); setDocumentText(jCas, document); Iterator allAnnotations = Stream.concat(document.getAnnotations().stream(), document.getPassages().stream().map(BioCPassage::getAnnotations).flatMap(Collection::stream)).iterator(); for (BioCAnnotation annotation : (Iterable)() ->allAnnotations) { @@ -55,6 +56,12 @@ public void populateWithNextDocument(JCas jCas) throws XMLStreamException, IOExc } } + private void setDocumentId(JCas jCas, BioCDocument document) { + Header h = new Header(jCas); + h.setDocId(document.getID()); + h.addToIndexes(); + } + private void setDocumentText(JCas jCas, BioCDocument document) { StringBuilder sb = new StringBuilder(); // iterate over the passages and create the complete document text from their individual text elements diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java new file mode 100644 index 000000000..5e7d71580 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java @@ -0,0 +1,65 @@ +package de.julielab.jcore.reader; + +import de.julielab.jcore.types.casmultiplier.JCoReURI; +import org.apache.uima.analysis_component.JCasMultiplier_ImplBase; +import 
org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.AbstractCas; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.URI; +import java.nio.file.Path; +import java.util.Collection; +import java.util.Iterator; + +@ResourceMetaData(name="GNormPlusFormatMultiplier", description = "Multiplier for GNormPlusFormatMultiplierReader. Takes URIs pointing to BioC collection files that contain annotations created by GNormPlus. For each such file, reads all documents and returns CASes for them until all documents in all collections have been read into a CAS.") +public class GNormPlusFormatMultiplier extends JCasMultiplier_ImplBase { + private final static Logger log = LoggerFactory.getLogger(GNormPlusFormatMultiplier.class); + private Iterator currentUriBatch; + private BioCCasPopulator casPopulator; + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + try { + Collection jcoreUris = JCasUtil.select(jCas, JCoReURI.class); + if (log.isDebugEnabled()) + log.debug("Received batch of {} BioC XML URIs", jcoreUris.size()); + currentUriBatch = jcoreUris.stream().map(JCoReURI::getUri).map(URI::create).iterator(); + } catch (Throwable e) { + log.error("Unexpected error", e); + throw new AnalysisEngineProcessException(e); + } + } + + @Override + public boolean hasNext() throws AnalysisEngineProcessException { + if ((casPopulator == null || casPopulator.documentsLeftInCollection() == 0) && currentUriBatch.hasNext()) { + URI nextUri = currentUriBatch.next(); + try { + casPopulator = new BioCCasPopulator(Path.of(nextUri)); + } catch (Exception e) { + log.error("Could not read from {}", nextUri, e); + throw new AnalysisEngineProcessException(e); + } + } + return casPopulator != null && casPopulator.documentsLeftInCollection() > 0; + } + + @Override + public 
AbstractCas next() throws AnalysisEngineProcessException { + if (hasNext()) { + JCas cas = getEmptyJCas(); + try { + casPopulator.populateWithNextDocument(cas); + return cas; + } catch (Exception e) { + log.error("Could not populate CAS with the next BioC document.", e); + throw new AnalysisEngineProcessException(e); + } + } + return null; + } +} diff --git a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/PLACEHOLDER b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/PLACEHOLDER deleted file mode 100644 index e4b0b196a..000000000 --- a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/PLACEHOLDER +++ /dev/null @@ -1,4 +0,0 @@ -The actual descriptor must be created by UIMA fit. -For this purpose, use UIMAfit annotations to annotate the reader component class. -Then employ the jcore-descriptor-creator's main method to build the descriptor from the reader class. -The jcore-descriptor-creator is already on the classpath as a Maven dependency. diff --git a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-reader.xml b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-reader.xml deleted file mode 100644 index 9ce0d444f..000000000 --- a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-reader.xml +++ /dev/null @@ -1,20 +0,0 @@ - - - org.apache.uima.java - GNormPlusFormatMultiplierReader - - JCoRe GNormPlus BioC Reader - This is only a placeholder descriptor. Please use UIMAfit to annotate the component parameters. Then employ the jcore-descriptor-creator's main method to build the descriptor from the reader class GNormPlusFormatMultiplierReader. The jcore-descriptor-creator is already on the classpath as a Maven dependency. 
- 2.3.0-SNAPSHOT - JULIE Lab Jena, Germany - - - - - - true - true - true - - - diff --git a/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java index dddbb8704..acea59b54 100644 --- a/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java +++ b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java @@ -15,7 +15,7 @@ class BioCCasPopulatorTest { private JCas getJCas() throws Exception { - return JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-semantics-biology-types"); + return JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); } @Test diff --git a/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierTest.java b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierTest.java new file mode 100644 index 000000000..a38744b34 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierTest.java @@ -0,0 +1,43 @@ +package de.julielab.jcore.reader; + +import de.julielab.jcore.types.casmultiplier.JCoReURI; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.JCasIterator; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.Test; + +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; +class GNormPlusFormatMultiplierTest { + private JCas getCas() throws Exception { + return 
JCasFactory.createJCas("de.julielab.jcore.types.casmultiplier.jcore-uri-multiplier-types"); + } + + @Test + void process() throws Exception { + JCas cas = getCas(); + JCoReURI jCoReURI = new JCoReURI(cas); + jCoReURI.setUri(Path.of("src", "test", "resources", "test-input-path", "subdir1", "bioc_collection_0.xml").toUri().toString()); + jCoReURI.addToIndexes(); + + JCoReURI jCoReURI2 = new JCoReURI(cas); + jCoReURI2.setUri(Path.of("src", "test", "resources", "test-input-path", "subdir2", "bioc_collection_2.xml").toUri().toString()); + jCoReURI2.addToIndexes(); + + AnalysisEngine multiplier = AnalysisEngineFactory.createEngine(GNormPlusFormatMultiplier.class); + JCasIterator jCasIterator = multiplier.processAndOutputNewCASes(cas); + List docIds = new ArrayList<>(); + while (jCasIterator.hasNext()) { + JCas multiplierCas = jCasIterator.next(); + docIds.add(JCoReTools.getDocId(multiplierCas)); + multiplierCas.release(); + } + assertThat(docIds).containsExactlyInAnyOrder("1378843", "10896916", "10722742", "1770008"); + } +} \ No newline at end of file From 489bd9bd0608ba4d28f10b14267722ac292d64e4 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 18 Feb 2022 16:50:11 +0100 Subject: [PATCH 149/269] Add the GNP reader descriptors. Resolves #131. 
--- jcore-gnp-bioc-reader/component.meta | 25 ++++++++ .../desc/jcore-bnp-bioc-multiplier-reader.xml | 58 +++++++++++++++++++ .../reader/desc/jcore-bnp-bioc-multiplier.xml | 26 +++++++++ 3 files changed, 109 insertions(+) create mode 100644 jcore-gnp-bioc-reader/component.meta create mode 100644 jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier-reader.xml create mode 100644 jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml diff --git a/jcore-gnp-bioc-reader/component.meta b/jcore-gnp-bioc-reader/component.meta new file mode 100644 index 000000000..c3b3f6e0a --- /dev/null +++ b/jcore-gnp-bioc-reader/component.meta @@ -0,0 +1,25 @@ +{ + "categories": [ + "ae", + "reader" + ], + "description": "A reader for the BioC format used by GNormPlus. Reads the text and the annotations, both species and genes.", + "descriptors": [ + { + "category": "ae", + "location": "de.julielab.jcore.reader.desc.jcore-bnp-bioc-multiplier" + }, + { + "category": "reader", + "location": "de.julielab.jcore.reader.desc.jcore-bnp-bioc-multiplier-reader" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-bnp-bioc-reader", + "groupId": "de.julielab", + "version": "2.6.0-SNAPSHOT" + }, + "name": "JCoRe GNormPlus BioC Reader" +} diff --git a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier-reader.xml b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier-reader.xml new file mode 100644 index 000000000..7081ae596 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier-reader.xml @@ -0,0 +1,58 @@ + + + org.apache.uima.java + de.julielab.jcore.reader.GNormPlusFormatMultiplierReader + + JCoRe GNormPlus Format Multiplier Reader + A reader for the BioC XML format used by GNormPlus. Requires the matching multiplier. 
+ + + InputPath + Path to a directory or file to be read. In case of a directory, all files ending in .xml will be read. + String + false + true + + + Recursive + Whether to read also the subdirectories of the input directory, if the input path points to a directory. + Boolean + false + false + + + BatchSize + The number of XML file URI references to send to the CAS multipliers in each work assignment. Defaults to 20. + Integer + false + false + + + + + Recursive + + true + + + + BatchSize + + 20 + + + + + + + + + + + + true + false + true + + + \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml new file mode 100644 index 000000000..2b64be30b --- /dev/null +++ b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml @@ -0,0 +1,26 @@ + + + org.apache.uima.java + true + de.julielab.jcore.reader.GNormPlusFormatMultiplier + + GNormPlusFormatMultiplier + Multiplier for GNormPlusFormatMultiplierReader. Takes URIs pointing to BioC collection files that contain annotations created by GNormPlus. For each such file, reads all documents and returns CASes for them until all documents in all collections have been read into a CAS. + + + + + + + + + + + + + true + true + false + + + \ No newline at end of file From 833a275b876757ca2f476eaa8c2e06c5a8e211a6 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 18 Feb 2022 16:58:48 +0100 Subject: [PATCH 150/269] Correct the module structure of the parent pom. 
--- pom.xml | 859 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 429 insertions(+), 430 deletions(-) diff --git a/pom.xml b/pom.xml index 8f0ad13d9..84fad31a2 100644 --- a/pom.xml +++ b/pom.xml @@ -1,550 +1,549 @@ - - - - + + + + 4.0.0 - - - - + + + + - - - - + + + + de.julielab - - - - + + + + jcore-parent - - - - + + + + 2.5.2-SNAPSHOT - - - - + + + + - - - - + + + + jcore-base - - - - + + + + pom - - - - + + + + JCoRe Base - - - - + + + + The POM for the JCoRe Base projects. - - - - + + + + 2.6.0-SNAPSHOT - - - - + + + + - - - - + + + + JULIE Lab, Germany - - - - + + + + http://www.julielab.de - - - - + + + + - - - - + + + + - - - - + + + + - - - - + + + + BSD-2-Clause - - - - + + + + https://opensource.org/licenses/BSD-2-Clause - - - - + + + + - - - - + + + + - - - - + + + + https://github.com/JULIELab/jcore-base - - - - + + + + - - - - + + + + - - - - + + + + org.apache.uima - - - - + + + + uimaj-core - - - - + + + + ${uima-version} - - - - + + + + - - - - + + + + - - - - + + + + org.apache.uima - - - - + + + + uimafit-core - - - - + + + + ${uimafit-version} - - - - + + + + - - - - + + + + - - - - + + + + - - - + + + jcore-annotation-adder-ae - - - + + + jcore-ace-reader - - - - + + + + jcore-acronym-ae - - - + + + jcore-acronym-writer - - - - + + + + jcore-banner-ae - - - + + + jcore-bc2gm-reader - - - + + + jcore-bc2gmformat-writer - - - + + + jcore-biolemmatizer-ae - - - - + + + + jcore-bionlpformat-consumer - - - - + + + + jcore-bionlpformat-reader - - - - + + + + jcore-biosem-ae - - - - + + + + jcore-conll-consumer - - - - + + + + jcore-coordination-baseline-ae - - - + + + jcore-cord19-reader - - - + + + jcore-coreference-writer - - - + + + jcore-ct-reader - - - + + + jcore-db-checkpoint-ae - - - + + + jcore-descriptor-creator - - - + + + jcore-dta-reader - - - - + + + + jcore-ec-code-ae - - - - + + + + jcore-elasticsearch-consumer - - - - + + + + jcore-embedding-writer - - - - + + + + jcore-event-flattener-ae - - - - + + + + 
jcore-feature-value-replacement-ae - - - - + + + + jcore-file-reader - - - - + + + + jcore-flair-ner-ae - - - + + + jcore-flair-token-embedding-ae - - - + + + jcore-flow-controllers - - + + jcore-gnp-bioc-reader + jcore-gnp-bioc-writer - - + + jcore-iexml-consumer - - - - + + + + jcore-iexml-reader - - - - + + + + jcore-ign-reader - - - - + + + + jcore-iob-consumer - - - - + + + + jcore-jnet-ae - - - - + + + + jcore-jpos-ae - - - - + + + + jcore-jsbd-ae - - - - + + + + jcore-jtbd-ae - - - - + + + + jcore-julielab-entity-evaluator-consumer - - - - + + + + jcore-likelihood-assignment-ae - - - - + + + + jcore-likelihood-detection-ae - - - + + + jcore-line-multiplier - - - + + + jcore-lingpipegazetteer-ae - - - - + + + + jcore-lingpipe-porterstemmer-ae - - - - + + + + jcore-lingscope-ae - - - - + + + + jcore-linnaeus-species-ae - - - - + + + + jcore-mantra-xml-types - - - - + + + + jcore-medxn-ae - - - - + + + + jcore-msdoc-reader - - - - + + + + jcore-mstparser-ae - - - - + + + + jcore-muc7-reader - - - - + + + + jcore-mutationfinder-ae - - - + + + jcore-neo4j-relations-consumer - - - - + + + + jcore-opennlp-chunk-ae - - - - + + + + jcore-opennlp-parser-ae - - - - + + + + jcore-opennlp-postag-ae - - - - + + + + jcore-opennlp-sentence-ae - - - - + + + + jcore-opennlp-token-ae - - - + + + jcore-ppd-writer - - - + + + jcore-pmc-reader - - - - + + + + jcore-pubtator-reader - - - - + + + + jcore-stanford-lemmatizer-ae - - - - + + + + jcore-topic-indexing-ae - - - - + + + + jcore-topics-writer - - - - + + + + jcore-txt-consumer - - - - + + + + jcore-types - - - - + + + + jcore-utilities - - - - + + + + jcore-xml-mapper - - - - + + + + jcore-xml-reader - - - - + + + + jcore-xmi-reader - - - - + + + + jcore-xmi-writer - - - - + + + + jedis-parent - - - + + + jcore-jedis-integration-tests - - - - jcore-bnp-bioc-reader - + + + - - - - + + + + - - - - + + + + scm:git:https://github.com/JULIELab/jcore-base - - - - + + + + scm:git:https://github.com/JULIELab/jcore-base - - - - + + 
+ + scm:git:https://github.com/JULIELab/jcore-base - - - - + + + + - + From 62269ae0b3b91bca4f051a5174ae81a6def047da Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 18 Feb 2022 17:08:36 +0100 Subject: [PATCH 151/269] Remove the absolut path to BioC.dtd from the test document. Intellij must have put it in there automatically to resolve the DTD. --- .../resources/test-input-path/subdir1/bioc_collection_0.xml | 2 +- .../resources/test-input-path/subdir1/bioc_collection_1.xml | 2 +- .../resources/test-input-path/subdir2/bioc_collection_2.xml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_0.xml b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_0.xml index a2f9b537c..9c1283a15 100644 --- a/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_0.xml +++ b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_0.xml @@ -1,2 +1,2 @@ JCoRe GNormPlus BioC WriterFri Feb 18 13:55:36 CET 2022PubTator.key1378843title0Cloning and expression of a cell surface receptor for advanced glycosylation end products of proteins.abstract103Advanced glycosylation end products of proteins (AGEs) are nonenzymatically glycosylated proteins which accumulate in vascular tissue in aging and at an accelerated rate in diabetes. A approximately 35-kDa polypeptide with a unique NH2-terminal sequence has been isolated from bovine lung and found to be present on the surface of endothelial cells where it mediates the binding of AGEs (receptor for advanced glycosylation end product or RAGE). Using an oligonucleotide probe based on the amino-terminal sequence of RAGE, an apparently full-length cDNA of 1.5 kilobases was isolated from a bovine lung cDNA library. 
This cDNA encoded a 394 amino acid mature protein comprised of the following putative domains: an extracellular domain of 332 amino acids, a single hydrophobic membrane spanning domain of 19 amino acids, and a carboxyl-terminal domain of 43 amino acids. A partial clone encoding the human counterpart of RAGE, isolated from a human lung library, was found to be approximately 90% homologous to the bovine molecule. Based on computer analysis of the amino acid sequence of RAGE and comparison with databases, RAGE is a new member of the immunoglobulin superfamily of cell surface molecules and shares significant homology with MUC 18, NCAM, and the cytoplasmic domain of CD20. Expression of the RAGE cDNA in 293 cells allowed them to bind 125I-AGE-albumin in a saturable and dose-dependent manner (Kd approximately 100 nM), blocked by antibody to RAGE. Western blots of 293 cells transfected with RAGE cDNA probed with anti-RAGE IgG demonstrated expression of immunoreactive protein compared to its absence in mock-transfected cells. These results suggest that RAGE functions as a cell surface receptor for AGEs, which could potentially mediate cellular effects of this class of glycosylated proteins.280986GeneRAGE280986GeneRAGE177GeneRAGE280986GeneRAGE280986GeneRAGE505653GeneCD20280986GeneRAGE280986GeneRAGE280986GeneRAGE280986GeneRAGE280986GeneRAGE9913Speciesbovine9913Speciesbovine9606Specieshuman9606Specieshuman9913Speciesbovine10896916title0Alpha(2) adrenoceptors regulate proliferation of human intestinal epithelial cells.150GeneAlpha(2) adrenoceptors9606Specieshumanabstract84Previous studies on rodents have suggested that catecholamines stimulate proliferation of the intestinal epithelium through activation of alpha(2) adrenoceptors located on crypt cells. The occurrence of this effect awaits demonstration in humans and the molecular mechanisms involved have not yet been elucidated. 
Here, we examined the effect of alpha(2) agonists on a clone of Caco2 cells expressing the human alpha(2A) adrenoceptor. Cells were transfected with a bicistronic plasmid containing the alpha2C10 and neomycin phosphotransferase genes. G418 resistant clones were assayed for receptor expression using radioligand binding. Receptor functionality was assessed by testing its ability to couple Gi proteins and to inhibit cAMP production. Mitogen activated protein kinase (MAPK) phosphorylation was followed by western blot, and cell proliferation was estimated by measuring protein and DNA content. Permanent transfection of Caco2 cells allowed us to obtain a clone (Caco2-3B) expressing alpha(2A) adrenoceptors at a density similar to that found in normal human intestinal epithelium. Caco2-3B retained morphological features and brush border enzyme expression characteristic of enterocytic differentiation. The receptor was coupled to Gi2/Gi3 proteins and its stimulation caused marked diminution of forskolin induced cAMP production. Treatment of Caco2-3B with UK14304 (alpha(2) agonist) induced a rapid increase in the phosphorylation state of MAPK, extracellular regulated protein kinase 1 (Erk1), and 2 (Erk2). This event was totally abolished in pertussis toxin treated cells and in the presence of kinase inhibitors (genistein or PD98059). It was unaffected by protein kinase C downregulation but correlated with a transient increase in Shc tyrosine phosphorylation. Finally, sustained exposure of Caco2-3B to UK14304 resulted in modest but significant acceleration of cell proliferation. None of these effects was observed in the parental cell line Caco2. 
The results obtained in the present study support a regulatory role for alpha(2) adrenoceptors in intestinal cell proliferation.150Genealpha(2) adrenoceptors150Genealpha(2A) adrenoceptor150Genealpha2C105595;5594;5595GeneMAPK5595;5594;5595GeneMAPK5595Geneextracellular regulated protein kinase 15595GeneErk15594GeneErk26464GeneShc150Genealpha(2) adrenoceptors9606Specieshumans9606Specieshuman9606Specieshuman9606SpeciesCaco29606SpeciesCaco2 \ No newline at end of file + "BioC.dtd">JCoRe GNormPlus BioC WriterFri Feb 18 13:55:36 CET 2022PubTator.key1378843title0Cloning and expression of a cell surface receptor for advanced glycosylation end products of proteins.abstract103Advanced glycosylation end products of proteins (AGEs) are nonenzymatically glycosylated proteins which accumulate in vascular tissue in aging and at an accelerated rate in diabetes. A approximately 35-kDa polypeptide with a unique NH2-terminal sequence has been isolated from bovine lung and found to be present on the surface of endothelial cells where it mediates the binding of AGEs (receptor for advanced glycosylation end product or RAGE). Using an oligonucleotide probe based on the amino-terminal sequence of RAGE, an apparently full-length cDNA of 1.5 kilobases was isolated from a bovine lung cDNA library. This cDNA encoded a 394 amino acid mature protein comprised of the following putative domains: an extracellular domain of 332 amino acids, a single hydrophobic membrane spanning domain of 19 amino acids, and a carboxyl-terminal domain of 43 amino acids. A partial clone encoding the human counterpart of RAGE, isolated from a human lung library, was found to be approximately 90% homologous to the bovine molecule. Based on computer analysis of the amino acid sequence of RAGE and comparison with databases, RAGE is a new member of the immunoglobulin superfamily of cell surface molecules and shares significant homology with MUC 18, NCAM, and the cytoplasmic domain of CD20. 
Expression of the RAGE cDNA in 293 cells allowed them to bind 125I-AGE-albumin in a saturable and dose-dependent manner (Kd approximately 100 nM), blocked by antibody to RAGE. Western blots of 293 cells transfected with RAGE cDNA probed with anti-RAGE IgG demonstrated expression of immunoreactive protein compared to its absence in mock-transfected cells. These results suggest that RAGE functions as a cell surface receptor for AGEs, which could potentially mediate cellular effects of this class of glycosylated proteins.280986GeneRAGE280986GeneRAGE177GeneRAGE280986GeneRAGE280986GeneRAGE505653GeneCD20280986GeneRAGE280986GeneRAGE280986GeneRAGE280986GeneRAGE280986GeneRAGE9913Speciesbovine9913Speciesbovine9606Specieshuman9606Specieshuman9913Speciesbovine10896916title0Alpha(2) adrenoceptors regulate proliferation of human intestinal epithelial cells.150GeneAlpha(2) adrenoceptors9606Specieshumanabstract84Previous studies on rodents have suggested that catecholamines stimulate proliferation of the intestinal epithelium through activation of alpha(2) adrenoceptors located on crypt cells. The occurrence of this effect awaits demonstration in humans and the molecular mechanisms involved have not yet been elucidated. Here, we examined the effect of alpha(2) agonists on a clone of Caco2 cells expressing the human alpha(2A) adrenoceptor. Cells were transfected with a bicistronic plasmid containing the alpha2C10 and neomycin phosphotransferase genes. G418 resistant clones were assayed for receptor expression using radioligand binding. Receptor functionality was assessed by testing its ability to couple Gi proteins and to inhibit cAMP production. Mitogen activated protein kinase (MAPK) phosphorylation was followed by western blot, and cell proliferation was estimated by measuring protein and DNA content. 
Permanent transfection of Caco2 cells allowed us to obtain a clone (Caco2-3B) expressing alpha(2A) adrenoceptors at a density similar to that found in normal human intestinal epithelium. Caco2-3B retained morphological features and brush border enzyme expression characteristic of enterocytic differentiation. The receptor was coupled to Gi2/Gi3 proteins and its stimulation caused marked diminution of forskolin induced cAMP production. Treatment of Caco2-3B with UK14304 (alpha(2) agonist) induced a rapid increase in the phosphorylation state of MAPK, extracellular regulated protein kinase 1 (Erk1), and 2 (Erk2). This event was totally abolished in pertussis toxin treated cells and in the presence of kinase inhibitors (genistein or PD98059). It was unaffected by protein kinase C downregulation but correlated with a transient increase in Shc tyrosine phosphorylation. Finally, sustained exposure of Caco2-3B to UK14304 resulted in modest but significant acceleration of cell proliferation. None of these effects was observed in the parental cell line Caco2. 
The results obtained in the present study support a regulatory role for alpha(2) adrenoceptors in intestinal cell proliferation.150Genealpha(2) adrenoceptors150Genealpha(2A) adrenoceptor150Genealpha2C105595;5594;5595GeneMAPK5595;5594;5595GeneMAPK5595Geneextracellular regulated protein kinase 15595GeneErk15594GeneErk26464GeneShc150Genealpha(2) adrenoceptors9606Specieshumans9606Specieshuman9606Specieshuman9606SpeciesCaco29606SpeciesCaco2 \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_1.xml b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_1.xml index b2144e781..6676e8d34 100644 --- a/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_1.xml +++ b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir1/bioc_collection_1.xml @@ -1,2 +1,2 @@ JCoRe GNormPlus BioC WriterFri Feb 18 13:55:36 CET 2022PubTator.key10880510title0Human TREK2, a 2P domain mechano-sensitive K+ channel with multiple regulations by polyunsaturated fatty acids, lysophospholipids, and Gs, Gi, and Gq protein-coupled receptors.54207GeneTREK29606SpeciesHumanabstract177Mechano-sensitive and fatty acid-activated K(+) belong to the structural class of K(+) channel with two pore domains. Here, we report the isolation and the characterization of a novel member of this family. This channel, called TREK2, is closely related to TREK1 (78% of homology). Its gene is located on chromosome 14q31. TREK2 is abundantly expressed in pancreas and kidney and to a lower level in brain, testis, colon, and small intestine. In the central nervous system, TREK2 has a widespread distribution with the highest levels of expression in cerebellum, occipital lobe, putamen, and thalamus. In transfected cells, TREK2 produces rapidly activating and non-inactivating outward rectifier K(+) currents. The single-channel conductance is 100 picosiemens at +40 mV in 150 mm K(+). 
The currents can be strongly stimulated by polyunsaturated fatty acid such as arachidonic, docosahexaenoic, and linoleic acids and by lysophosphatidylcholine. The channel is also activated by acidification of the intracellular medium. TREK2 is blocked by application of intracellular cAMP. As with TREK1, TREK2 is activated by the volatile general anesthetics chloroform, halothane, and isoflurane and by the neuroprotective agent riluzole. TREK2 can be positively or negatively regulated by a variety of neurotransmitter receptors. Stimulation of the G(s)-coupled receptor 5HT4sR or the G(q)-coupled receptor mGluR1 inhibits channel activity, whereas activation of the G(i)-coupled receptor mGluR2 increases TREK2 currents. These multiple types of regulations suggest that TREK2 plays an important role as a target of neurotransmitter action.54207GeneTREK23776GeneTREK154207GeneTREK254207GeneTREK254207GeneTREK254207GeneTREK23776GeneTREK154207GeneTREK254207GeneTREK23360Gene5HT4sR2911GenemGluR114800GenemGluR254207GeneTREK254207GeneTREK210803599title0Enhanced growth of MCF-7 breast cancer cells overexpressing parathyroid hormone-related peptide.5744Geneparathyroid hormone-related peptide9606SpeciesMCF-7abstract97PTH-related peptide (PTHrP) is a secreted protein produced by breast cancer cells both in vivo and in vitro. Because of its structural similarity to PTH at the amino terminus, the two proteins interact with a common cell surface receptor, the PTH/PTHrP receptor. When overproduced by tumor cells, PTHrP enters the circulation, giving rise to the common paraneoplastic syndrome of humoral hypercalcemia of malignancy. Although initially discovered in malignancies, PTHrP is now known to be produced by most cells and tissues in the body. It acts as an autocrine and paracrine mediator of cell proliferation and differentiation, effects which are mediated via the PTH/PTHrP receptor. 
Recent evidence also has shown that, directly after translation, PTHrP is able to enter the nucleus and/or nucleolus and influence cell cycle progression and apoptosis. In this study, we have either overproduced PTHrP or inhibited endogenous PTHrP production in the breast cancer cell line, MCF-7. Overexpression of PTHrP was associated with an increase in mitogenesis, whereas inhibiting endogenous PTHrP production resulted in decreased cell proliferation. The overexpressed peptide targeted to the perinuclear space. In contrast, PTHrP interaction with the cell surface PTH/PTHrP receptor resulted in decreased cell proliferation in the same cell line. This latter effect is dependent on interaction with the receptor, in that exogenously added PTHrP moieties known not to interact with the receptor had no effect on cell growth. Furthermore, neutralization of added peptide with an anti-PTHrP antiserum completely abolished the growth inhibitory effects. In contrast, this antibody has no effect on the increased proliferation rate of the MCF-7 transfectants that overexpress PTHrP, compared with control cells. The net effect of autocrine/paracrine and intracrine effects of PTHrP in MCF-7 cells overproducing the peptide is accelerated cell growth. 
These findings have critical implications regarding the role of PTHrP in breast cancer, and they suggest that controlling PTHrP production in breast cancer may be useful therapeutically.5744GenePTH-related peptide5744GenePTHrP5741GenePTH5745GenePTH/PTHrP receptor5744GenePTHrP5744GenePTHrP5745GenePTH/PTHrP receptor5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5745GenePTH/PTHrP receptor5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP9606SpeciesMCF-7 \ No newline at end of file + "BioC.dtd">JCoRe GNormPlus BioC WriterFri Feb 18 13:55:36 CET 2022PubTator.key10880510title0Human TREK2, a 2P domain mechano-sensitive K+ channel with multiple regulations by polyunsaturated fatty acids, lysophospholipids, and Gs, Gi, and Gq protein-coupled receptors.54207GeneTREK29606SpeciesHumanabstract177Mechano-sensitive and fatty acid-activated K(+) belong to the structural class of K(+) channel with two pore domains. Here, we report the isolation and the characterization of a novel member of this family. This channel, called TREK2, is closely related to TREK1 (78% of homology). Its gene is located on chromosome 14q31. TREK2 is abundantly expressed in pancreas and kidney and to a lower level in brain, testis, colon, and small intestine. In the central nervous system, TREK2 has a widespread distribution with the highest levels of expression in cerebellum, occipital lobe, putamen, and thalamus. In transfected cells, TREK2 produces rapidly activating and non-inactivating outward rectifier K(+) currents. The single-channel conductance is 100 picosiemens at +40 mV in 150 mm K(+). The currents can be strongly stimulated by polyunsaturated fatty acid such as arachidonic, docosahexaenoic, and linoleic acids and by lysophosphatidylcholine. The channel is also activated by acidification of the intracellular medium. TREK2 is blocked by application of intracellular cAMP. 
As with TREK1, TREK2 is activated by the volatile general anesthetics chloroform, halothane, and isoflurane and by the neuroprotective agent riluzole. TREK2 can be positively or negatively regulated by a variety of neurotransmitter receptors. Stimulation of the G(s)-coupled receptor 5HT4sR or the G(q)-coupled receptor mGluR1 inhibits channel activity, whereas activation of the G(i)-coupled receptor mGluR2 increases TREK2 currents. These multiple types of regulations suggest that TREK2 plays an important role as a target of neurotransmitter action.54207GeneTREK23776GeneTREK154207GeneTREK254207GeneTREK254207GeneTREK254207GeneTREK23776GeneTREK154207GeneTREK254207GeneTREK23360Gene5HT4sR2911GenemGluR114800GenemGluR254207GeneTREK254207GeneTREK210803599title0Enhanced growth of MCF-7 breast cancer cells overexpressing parathyroid hormone-related peptide.5744Geneparathyroid hormone-related peptide9606SpeciesMCF-7abstract97PTH-related peptide (PTHrP) is a secreted protein produced by breast cancer cells both in vivo and in vitro. Because of its structural similarity to PTH at the amino terminus, the two proteins interact with a common cell surface receptor, the PTH/PTHrP receptor. When overproduced by tumor cells, PTHrP enters the circulation, giving rise to the common paraneoplastic syndrome of humoral hypercalcemia of malignancy. Although initially discovered in malignancies, PTHrP is now known to be produced by most cells and tissues in the body. It acts as an autocrine and paracrine mediator of cell proliferation and differentiation, effects which are mediated via the PTH/PTHrP receptor. Recent evidence also has shown that, directly after translation, PTHrP is able to enter the nucleus and/or nucleolus and influence cell cycle progression and apoptosis. In this study, we have either overproduced PTHrP or inhibited endogenous PTHrP production in the breast cancer cell line, MCF-7. 
Overexpression of PTHrP was associated with an increase in mitogenesis, whereas inhibiting endogenous PTHrP production resulted in decreased cell proliferation. The overexpressed peptide targeted to the perinuclear space. In contrast, PTHrP interaction with the cell surface PTH/PTHrP receptor resulted in decreased cell proliferation in the same cell line. This latter effect is dependent on interaction with the receptor, in that exogenously added PTHrP moieties known not to interact with the receptor had no effect on cell growth. Furthermore, neutralization of added peptide with an anti-PTHrP antiserum completely abolished the growth inhibitory effects. In contrast, this antibody has no effect on the increased proliferation rate of the MCF-7 transfectants that overexpress PTHrP, compared with control cells. The net effect of autocrine/paracrine and intracrine effects of PTHrP in MCF-7 cells overproducing the peptide is accelerated cell growth. These findings have critical implications regarding the role of PTHrP in breast cancer, and they suggest that controlling PTHrP production in breast cancer may be useful therapeutically.5744GenePTH-related peptide5744GenePTHrP5741GenePTH5745GenePTH/PTHrP receptor5744GenePTHrP5744GenePTHrP5745GenePTH/PTHrP receptor5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5745GenePTH/PTHrP receptor5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP5744GenePTHrP9606SpeciesMCF-7 \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir2/bioc_collection_2.xml b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir2/bioc_collection_2.xml index 2ed9fa4f7..dc8927c84 100644 --- a/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir2/bioc_collection_2.xml +++ b/jcore-gnp-bioc-reader/src/test/resources/test-input-path/subdir2/bioc_collection_2.xml @@ -1,2 +1,2 @@ JCoRe GNormPlus BioC WriterFri Feb 18 13:55:36 CET 
2022PubTator.key10722742title0Mdm2 is a RING finger-dependent ubiquitin protein ligase for itself and p53.4193GeneMdm27157Genep53abstract77Mdm2 has been shown to regulate p53 stability by targeting the p53 protein for proteasomal degradation. We now report that Mdm2 is a ubiquitin protein ligase (E3) for p53 and that its activity is dependent on its RING finger. Furthermore, we show that Mdm2 mediates its own ubiquitination in a RING finger-dependent manner, which requires no eukaryotic proteins other than ubiquitin-activating enzyme (E1) and an ubiquitin-conjugating enzyme (E2). It is apparent, therefore, that Mdm2 manifests an intrinsic capacity to mediate ubiquitination. Mutation of putative zinc coordination residues abrogated this activity, as did chelation of divalent cations. After cation chelation, the full activity could be restored by addition of zinc. We further demonstrate that the degradation of p53 and Mdm2 in cells requires additional potential zinc-coordinating residues beyond those required for the intrinsic activity of Mdm2 in vitro. Replacement of the Mdm2 RING with that of another protein (Praja1) reconstituted ubiquitination and proteasomal degradation of Mdm2. However, this RING was ineffective in ubiquitination and proteasomal targeting of p53, suggesting that there may be specificity at the level of the RING in the recognition of heterologous substrates.4193GeneMdm27157Genep537157Genep534193GeneMdm27157Genep534193GeneMdm27318Geneubiquitin-activating enzyme (E1)4193GeneMdm27157Genep534193GeneMdm24193GeneMdm24193GeneMdm264219GenePraja14193GeneMdm27157Genep531770008title0Structural analysis and expression of human desmoglein: a cadherin-like component of the desmosome.1828;281131Genedesmoglein1000Genecadherin9606Specieshumanabstract100Desmosomes are adhesive cell junctions found in great abundance in tissues that experience mechanical stress. 
The transmembrane desmosomal glycoproteins have been proposed to play a role in cell adhesion; desmoglein I (DGI) is a major member of this class of desmosomal molecules. However, evidence supporting a role for DGI in cell adhesion or in the plaque is lacking. In order to begin to understand DGI function we have identified human cDNA clones encoding the entire mature polypeptide of 1000 amino acids. Our data suggest that like the bovine DGI molecule human DGI is highly related to the calcium-dependent class of cell adhesion molecules known as cadherins. Four related extracellular domains located in the amino-terminal domain of the molecule contain putative calcium binding sites originally identified in the cadherins. The highest degree of similarity between human N-cadherin and human DGI, and likewise between bovine DGI and human DGI, is greatest in the most amino-terminal extracellular domain. This suggests a conserved functional role for the extracellular domains, perhaps in calcium-mediated cell adhesion. The cytoplasmic portion of the molecule contains a cadherin-like region and, like bovine DGI, a carboxy-terminal tail that is not present in the cadherins, comprising three additional domains. One of these contains a novel repeating motif of 29 +/- 1 residues, first identified in bovine DGI. Each of the highly homologous repeating units is likely to consist of two beta-strands and two turns with special characteristics. Five amino acids that are identical in bovine and human DGI lie in the second of the two predicted beta-strands, and intriguingly contain putative target sites for protein kinase C. On the basis of structural analysis, a model predicting the disposition of human DGI domains in the desmosome is proposed. Northern analysis suggests that unlike bovine epidermis, which expresses a single mRNA of reported size approximately 7.6 kb, human foreskin and cultured keratinocytes display a complex pattern with bands of approximately 7.2, 4.0 and 3.0 kb. 
Each of these cross-hybridizing mRNAs is coordinately expressed in normal human keratinocytes in response to long-term culture and increased calcium.1828Genedesmoglein I1828GeneDGI1828GeneDGI1828GeneDGI281131GeneDGI1828GeneDGI1000GeneN-cadherin1828GeneDGI281131GeneDGI1828GeneDGI281131GeneDGI281131GeneDGI1828GeneDGI9606Specieshuman9913Speciesbovine9606Specieshuman9606Specieshuman9606Specieshuman9913Speciesbovine9606Specieshuman9913Speciesbovine9913Speciesbovine9913Speciesbovine9606Specieshuman9606Specieshuman9913Speciesbovine9606Specieshuman9606Specieshuman1828GeneDGI \ No newline at end of file + "BioC.dtd">JCoRe GNormPlus BioC WriterFri Feb 18 13:55:36 CET 2022PubTator.key10722742title0Mdm2 is a RING finger-dependent ubiquitin protein ligase for itself and p53.4193GeneMdm27157Genep53abstract77Mdm2 has been shown to regulate p53 stability by targeting the p53 protein for proteasomal degradation. We now report that Mdm2 is a ubiquitin protein ligase (E3) for p53 and that its activity is dependent on its RING finger. Furthermore, we show that Mdm2 mediates its own ubiquitination in a RING finger-dependent manner, which requires no eukaryotic proteins other than ubiquitin-activating enzyme (E1) and an ubiquitin-conjugating enzyme (E2). It is apparent, therefore, that Mdm2 manifests an intrinsic capacity to mediate ubiquitination. Mutation of putative zinc coordination residues abrogated this activity, as did chelation of divalent cations. After cation chelation, the full activity could be restored by addition of zinc. We further demonstrate that the degradation of p53 and Mdm2 in cells requires additional potential zinc-coordinating residues beyond those required for the intrinsic activity of Mdm2 in vitro. Replacement of the Mdm2 RING with that of another protein (Praja1) reconstituted ubiquitination and proteasomal degradation of Mdm2. 
However, this RING was ineffective in ubiquitination and proteasomal targeting of p53, suggesting that there may be specificity at the level of the RING in the recognition of heterologous substrates.4193GeneMdm27157Genep537157Genep534193GeneMdm27157Genep534193GeneMdm27318Geneubiquitin-activating enzyme (E1)4193GeneMdm27157Genep534193GeneMdm24193GeneMdm24193GeneMdm264219GenePraja14193GeneMdm27157Genep531770008title0Structural analysis and expression of human desmoglein: a cadherin-like component of the desmosome.1828;281131Genedesmoglein1000Genecadherin9606Specieshumanabstract100Desmosomes are adhesive cell junctions found in great abundance in tissues that experience mechanical stress. The transmembrane desmosomal glycoproteins have been proposed to play a role in cell adhesion; desmoglein I (DGI) is a major member of this class of desmosomal molecules. However, evidence supporting a role for DGI in cell adhesion or in the plaque is lacking. In order to begin to understand DGI function we have identified human cDNA clones encoding the entire mature polypeptide of 1000 amino acids. Our data suggest that like the bovine DGI molecule human DGI is highly related to the calcium-dependent class of cell adhesion molecules known as cadherins. Four related extracellular domains located in the amino-terminal domain of the molecule contain putative calcium binding sites originally identified in the cadherins. The highest degree of similarity between human N-cadherin and human DGI, and likewise between bovine DGI and human DGI, is greatest in the most amino-terminal extracellular domain. This suggests a conserved functional role for the extracellular domains, perhaps in calcium-mediated cell adhesion. The cytoplasmic portion of the molecule contains a cadherin-like region and, like bovine DGI, a carboxy-terminal tail that is not present in the cadherins, comprising three additional domains. 
One of these contains a novel repeating motif of 29 +/- 1 residues, first identified in bovine DGI. Each of the highly homologous repeating units is likely to consist of two beta-strands and two turns with special characteristics. Five amino acids that are identical in bovine and human DGI lie in the second of the two predicted beta-strands, and intriguingly contain putative target sites for protein kinase C. On the basis of structural analysis, a model predicting the disposition of human DGI domains in the desmosome is proposed. Northern analysis suggests that unlike bovine epidermis, which expresses a single mRNA of reported size approximately 7.6 kb, human foreskin and cultured keratinocytes display a complex pattern with bands of approximately 7.2, 4.0 and 3.0 kb. Each of these cross-hybridizing mRNAs is coordinately expressed in normal human keratinocytes in response to long-term culture and increased calcium.1828Genedesmoglein I1828GeneDGI1828GeneDGI1828GeneDGI281131GeneDGI1828GeneDGI1000GeneN-cadherin1828GeneDGI281131GeneDGI1828GeneDGI281131GeneDGI281131GeneDGI1828GeneDGI9606Specieshuman9913Speciesbovine9606Specieshuman9606Specieshuman9606Specieshuman9913Speciesbovine9606Specieshuman9913Speciesbovine9913Speciesbovine9913Speciesbovine9606Specieshuman9606Specieshuman9913Speciesbovine9606Specieshuman9606Specieshuman1828GeneDGI \ No newline at end of file From 2afeed5a57d247e6022e97c9be30eb6ba6252508 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 18 Feb 2022 17:17:20 +0100 Subject: [PATCH 152/269] Adapt the BioC Writer test to the again omitted-from-text abstract section headings. 
--- .../julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java index 9f085bc0b..55601393a 100644 --- a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java +++ b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java @@ -28,7 +28,7 @@ public void populate() throws Exception { assertThat(resultXml).containsOnlyOnce("This is the title of document 1."); assertThat(resultXml).containsOnlyOnce("title"); // The abstract should be one single string - assertThat(resultXml).containsOnlyOnce("BACKGROUND: This abstract section belongs to document 1. RESULTS: There are certainly some results reported by document 1."); + assertThat(resultXml).containsOnlyOnce("This abstract section belongs to document 1. There are certainly some results reported by document 1."); assertThat(resultXml).containsOnlyOnce("INTRODUCTION"); assertThat(resultXml).containsOnlyOnce("section_title"); assertThat(resultXml).contains("paragraph"); From 84c42c5bc94646033688a9283829c5b3daf5db9c Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 2 Mar 2022 14:41:06 +0100 Subject: [PATCH 153/269] Avoid writing empty BioC documents. GNormPlus doesn't handle this but expects that a) there are documents in every collection and b) that each document has at least one passage. 
--- .../consumer/gnp/BioCCollectionWriter.java | 6 ++- .../consumer/gnp/BioCDocumentPopulator.java | 26 ++++++++-- .../consumer/gnp/GNormPlusFormatWriter.java | 5 +- .../gnp/GNormPlusFormatWriterTest.java | 49 +++++++++++++++++-- 4 files changed, 75 insertions(+), 11 deletions(-) diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCCollectionWriter.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCCollectionWriter.java index df5b12587..9d16ba23f 100644 --- a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCCollectionWriter.java +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCCollectionWriter.java @@ -19,6 +19,7 @@ public class BioCCollectionWriter { private Path baseDir; private Path currentDir; private int numWrittenIntoCurrentDir; + private int currentDirNum; public BioCCollectionWriter(int numFilesPerDir, Path baseDir) { this.numFilesPerDir = numFilesPerDir; @@ -32,12 +33,13 @@ public void writeBioCCollection(BioCCollection collection) throws XMLStreamExcep if (currentDir == null) { int i = 0; do { - currentDir = Path.of(baseDir.toString(), "bioc_collections_" + i++); + currentDirNum = i++; + currentDir = Path.of(baseDir.toString(), "bioc_collections_" + currentDirNum); } while (Files.exists(currentDir)); } int i = 0; do { - collectionFile = Path.of(currentDir.toString(), "bioc_collection_" + i++ + ".xml"); + collectionFile = Path.of(currentDir.toString(), "bioc_collection_" + currentDirNum + "_" + i++ + ".xml"); } while (Files.exists(collectionFile)); if (!Files.exists(collectionFile.getParent())) { log.debug("Creating base BioC collection directory {}", baseDir); diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java index 488f42613..1a2182bed 100644 --- 
a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java @@ -39,14 +39,20 @@ public BioCDocument populate(JCas jCas) { case "table": titleType = "table_title"; break; + case "abstractSection": + // abstract sections are part of the AbstractText which is handled below + titleType = "null"; + break; default: log.debug("Unhandled title type {}", t.getTitleType()); titleType = "other_title"; break; } - BioCPassage p = getPassageForAnnotation(t); - p.putInfon("type", titleType); - doc.addPassage(p); + if (titleType != null) { + BioCPassage p = getPassageForAnnotation(t); + p.putInfon("type", titleType); + doc.addPassage(p); + } } else if (z instanceof AbstractText) { AbstractText at = (AbstractText) z; BioCPassage p = getPassageForAnnotation(at); @@ -69,6 +75,18 @@ public BioCDocument populate(JCas jCas) { return doc; } +// private BioCPassage getPassageForAbstract(AbstractText at) { +// FSArray structuredAbstractParts = at.getStructuredAbstractParts(); +// boolean foundAbstractParts = false; +// if (structuredAbstractParts != null) { +// for (int i = 0; i < structuredAbstractParts.size(); ++i) { +// AbstractSection as = (AbstractSection) structuredAbstractParts.get(i); +// +// } +// } +// return null; +// } + /** * Creates a BioCPassage with offset and text corresponding to the passed annotation a. * @@ -78,6 +96,8 @@ public BioCDocument populate(JCas jCas) { private BioCPassage getPassageForAnnotation(Annotation a) { BioCPassage p = new BioCPassage(); p.setOffset(a.getBegin()); + // GNormPlus doesn't seem to handle newlines well. It resulted in missing annotations when testing if the + // output format is handled well by GNormPlus. 
p.setText(a.getCoveredText().replaceAll("\n", " ")); return p; } diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java index 24f016a69..002407a0e 100644 --- a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java @@ -59,7 +59,8 @@ public void initialize(final UimaContext aContext) { public void process(final JCas jCas) throws AnalysisEngineProcessException { try { BioCDocument doc = bioCDocumentPopulator.populate(jCas); - currentCollection.addDocument(doc); + if (doc.getPassageCount() > 0) + currentCollection.addDocument(doc); if (currentCollection.getDocmentCount() >= numDocsPerFile) { bioCCollectionWriter.writeBioCCollection(currentCollection); currentCollection.clearDocuments(); @@ -75,7 +76,7 @@ public void process(final JCas jCas) throws AnalysisEngineProcessException { public void collectionProcessComplete() throws AnalysisEngineProcessException { super.collectionProcessComplete(); try { - if (currentCollection.getDocmentCount() != 0) +// if (currentCollection.getDocmentCount() != 0) bioCCollectionWriter.writeBioCCollection(currentCollection); } catch (Exception e) { log.error("Could not write final batch of BioCDocuments.", e); diff --git a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriterTest.java b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriterTest.java index 16a3ec233..d2d9d0f40 100644 --- a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriterTest.java +++ b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriterTest.java @@ -3,6 +3,8 @@ import com.pengyifan.bioc.BioCCollection; import 
com.pengyifan.bioc.io.BioCCollectionReader; +import de.julielab.jcore.types.Title; +import de.julielab.jcore.types.pubmed.Header; import org.apache.commons.io.FileUtils; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.fit.factory.AnalysisEngineFactory; @@ -15,6 +17,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.List; +import java.util.stream.Collectors; import static org.assertj.core.api.Assertions.assertThat; @@ -47,7 +50,7 @@ public void process1() throws Exception { writer.process(jCas); writer.collectionProcessComplete(); - assertThat(Path.of(BASEDIR.toString(), "bioc_collections_0", "bioc_collection_0.xml")).exists().isNotEmptyFile(); + assertThat(Path.of(BASEDIR.toString(), "bioc_collections_0", "bioc_collection_0_0.xml")).exists().isNotEmptyFile(); } @Test @@ -66,17 +69,55 @@ public void process2() throws Exception { for (int i : List.of(0, 1, 2)) { List fileIndices = i < 2 ? List.of(0, 1, 2) : List.of(0,1); for (int j : fileIndices) { - assertThat(Path.of(BASEDIR.toString(), "bioc_collections_"+i, "bioc_collection_"+j+".xml")).exists().isNotEmptyFile(); + assertThat(Path.of(BASEDIR.toString(), "bioc_collections_"+i, "bioc_collection_"+i+"_"+j+".xml")).exists().isNotEmptyFile(); } } // there should only be two files in the last directory - assertThat(Path.of(BASEDIR.toString(), "bioc_collections_2", "bioc_collection_2.xml")).doesNotExist(); + assertThat(Path.of(BASEDIR.toString(), "bioc_collections_2", "bioc_collection_2_2.xml")).doesNotExist(); // the last file should only contain a single document - BioCCollectionReader reader = new BioCCollectionReader(Path.of(BASEDIR.toString(), "bioc_collections_2", "bioc_collection_1.xml")); + BioCCollectionReader reader = new BioCCollectionReader(Path.of(BASEDIR.toString(), "bioc_collections_2", "bioc_collection_2_1.xml")); BioCCollection lastCollection = reader.readCollection(); assertThat(lastCollection.getDocmentCount()).isEqualTo(1); } + @Test + public 
void omitEmptyDocuments() throws Exception { + // GNormPlus doesn't handle documents well which do not have any passage. Then, at some later document in the same collection, array out of bounds exceptions appear. + // Make sure we just don't write empty documents. They wouldn't have any annotations anyway. + JCas jCas = TestDocumentGenerator.createTestJCas(); + Header h = new Header(jCas); + h.setDocId("1"); + h.addToIndexes(); + AnalysisEngine writer = getWriterInstance(1, 1); + writer.process(jCas); + jCas.reset(); + jCas.setDocumentText("Hello."); + Header h2 = new Header(jCas); + h2.setDocId("2"); + h2.addToIndexes(); + Title title = new Title(jCas, 0, 6); + title.setTitleType("document"); + title.addToIndexes(); + writer.process(jCas); + writer.collectionProcessComplete(); + // assert that no empty documents were written into the collection + assertThat(Files.lines(Path.of(BASEDIR.toString(), "bioc_collections_0", "bioc_collection_0_0.xml")).map(String::trim).collect(Collectors.joining())).doesNotContain(""); + assertThat(Files.lines(Path.of(BASEDIR.toString(), "bioc_collections_0", "bioc_collection_0_0.xml")).map(String::trim).collect(Collectors.joining())).contains("2"); + } + + @Test + public void omitEmptyDocuments2() throws Exception { + // Additionally to not writing empty documents, we also don't want to write empty collections. This, too, causes out of bounds errors in GNormPlus. + JCas jCas = TestDocumentGenerator.createTestJCas(); + Header h = new Header(jCas); + h.setDocId("1"); + h.addToIndexes(); + AnalysisEngine writer = getWriterInstance(1, 1); + writer.process(jCas); + // assert that no empty documents were written into the collection + assertThat(Path.of(BASEDIR.toString(), "bioc_collections_0", "bioc_collection_0_0.xml")).doesNotExist(); + } + } From 15c5d5111b6e373389a13c72d9cb4f58a7b1842c Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 2 Mar 2022 14:42:14 +0100 Subject: [PATCH 154/269] Add logging message for BANNER observation. 
BANNER keeps on having irregular concurrency issues. Keep the debug output so we have it when we need it. --- .../src/main/java/banner/tagging/pipe/LemmaPOS.java | 4 ++-- .../java/de/julielab/jcore/ae/banner/BANNERAnnotator.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java b/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java index 41a0a8e5c..e5cb62761 100644 --- a/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java +++ b/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java @@ -43,13 +43,13 @@ public LemmaPOS(Lemmatiser lemmatiser, Tagger posTagger) { public void setLemmatiser(Lemmatiser lemmatiser) { initResourcesMap(); getResources().lemmatiser = lemmatiser; -// System.out.println("Setting lemmatiser to " + Thread.currentThread()); + System.out.println("Setting lemmatiser to " + Thread.currentThread() + " in object " + this); } public void setPosTagger(Tagger posTagger) { initResourcesMap(); getResources().posTagger = posTagger; -// System.out.println("Setting PoS Tagger to " + Thread.currentThread()); + System.out.println("Setting PoS Tagger to " + Thread.currentThread() + " in object " + this); } synchronized private void initResourcesMap() { diff --git a/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java b/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java index 9241d430f..a29132d5c 100644 --- a/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java +++ b/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java @@ -139,7 +139,7 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { // model is deserialized multiple times, the FeatureSet#pipe field seems to be always the // exact same instance, containing a single instance of LemmaPOS (again, despite reading the model // file and deserializing it multiple times). 
This is why the Thread -> resources map was added. -// System.out.println("Initializing BANNER: " + Thread.currentThread() + " with lemmatiser " + lemmatiser + " and POS tagger " + posTagger); + System.out.println("Initializing BANNER: " + Thread.currentThread() + " with lemmatiser " + lemmatiser + " and POS tagger " + posTagger); synchronized (BANNERAnnotator.class) { tagger = CRFTagger.load(modelIs, lemmatiser, posTagger, dictionary); } From ace255e8497450c366f7222562a4449669aeabbc Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 2 Mar 2022 14:43:14 +0100 Subject: [PATCH 155/269] Fix issues in PMC reader. Avoid an NPE when a section in XML is omitted. Add a default titleType to label elements. --- .../jcore/reader/pmc/parser/FrontParser.java | 331 +++++++++--------- .../reader/pmc/parser/NxmlDocumentParser.java | 19 +- .../reader/pmc/parser/SectionParser.java | 20 +- .../elementproperties-no-bib-refs.yml | 6 + .../pmc/resources/elementproperties.yml | 6 + 5 files changed, 206 insertions(+), 176 deletions(-) diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java index 124e47bef..560f9877d 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java @@ -1,11 +1,10 @@ -/** - * +/** * Copyright (c) 2017, JULIE Lab. - * All rights reserved. This program and the accompanying materials + * All rights reserved. This program and the accompanying materials * are made available under the terms of the BSD-2-Clause License - * - * Author: - * + *

+ * Author: + *

* Description: **/ package de.julielab.jcore.reader.pmc.parser; @@ -21,6 +20,7 @@ import de.julielab.jcore.types.pubmed.OtherID; import org.apache.uima.jcas.cas.FSArray; +import java.io.File; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; @@ -28,163 +28,166 @@ public class FrontParser extends NxmlElementParser { - public FrontParser(NxmlDocumentParser nxmlDocumentParser) { - super(nxmlDocumentParser); - elementName = "front"; - } - - @Override - protected void parseElement(ElementParsingResult frontResult) throws ElementParsingException { - try { - // Only handle the front matter of the actual article, not sub-articles - final String elementPath = getElementPath(); - if (!elementPath.endsWith("/article/front")) { - int firstIndexAfterElement = skipElement(); - frontResult.setLastTokenIndex(firstIndexAfterElement); - frontResult.setResultType(ParsingResult.ResultType.NONE); - return; - } - - // title and abstract - parseXPath("/article/front/article-meta/title-group/article-title").ifPresent(r -> { - ElementParsingResult er = (ElementParsingResult) r; - Title articleTitle = (Title) er.getAnnotation(); - articleTitle.setTitleType("document"); - frontResult.addSubResult(r); - }); - parseXPath("/article/front/article-meta/abstract").ifPresent(r -> { - ElementParsingResult er = (ElementParsingResult) r; - AbstractText abstractText = (AbstractText) er.getAnnotation(); - List abstractSections = er.getSubResultAnnotations(AbstractSection.class); - FSArray fsArray = new FSArray(nxmlDocumentParser.cas, abstractSections.size()); - IntStream.range(0, abstractSections.size()).forEach(i -> fsArray.set(i, abstractSections.get(i))); - abstractText.setStructuredAbstractParts(fsArray); - frontResult.addSubResult(r); - }); - - // article IDs - Optional pmid = getXPathValue("/article/front/article-meta/article-id[@pub-id-type='pmid']"); - Optional pmcid = getXPathValue("/article/front/article-meta/article-id[@pub-id-type='pmc']"); - Optional doi 
= getXPathValue("/article/front/article-meta/article-id[@pub-id-type='doi']"); - - // publication details - String pubType = ""; - String pubDateFmt = "/article/front/article-meta/pub-date[@pub-type='%s']"; - if (xPathExists(String.format(pubDateFmt, "epub"))) - pubType = "epub"; - else if (xPathExists(String.format(pubDateFmt, "ppub"))) - pubType = "ppub"; - else if (xPathExists(String.format(pubDateFmt, "pmc-release"))) - pubType = "pmc-release"; - Optional year = getXPathValue(String.format("/article/front/article-meta/pub-date[@pub-type='%s']/year", pubType)); - Optional month = getXPathValue(String.format("/article/front/article-meta/pub-date[@pub-type='%s']/month", pubType)); - Optional day = getXPathValue(String.format("/article/front/article-meta/pub-date[@pub-type='%s']/day", pubType)); - Optional journalTitle = nxmlDocumentParser.getTagset() == Tagset.NLM_2_3 || nxmlDocumentParser.getTagset() == Tagset.NLM_3_0 - ? getXPathValue("/article/front/journal-meta/journal-title") - : getXPathValue("/article/front/journal-meta/journal-title-group/journal-title"); - // there actually might be several abbreviated titles but here, we - // only use the first; our type system currently cannot represent - // more anyway. One could try to decide for a preferred one since the - // abbrev-type attribute disposes the source of the abbreviated - // title (e.g. publisher or nlm-ta). - Optional abbrevJournalTitle = nxmlDocumentParser.getTagset() == Tagset.NLM_2_3 || nxmlDocumentParser.getTagset() == Tagset.NLM_3_0 - ? 
getXPathValue("/article/front/journal-meta/abbrev-journal-title") - : getXPathValue("/article/front/journal-meta/journal-title-group/abbrev-journal-title"); - Optional volume = getXPathValue("/article/front/article-meta/volume"); - Optional issue = getXPathValue("/article/front/article-meta/issue"); - Optional firstPage = getXPathValue("/article/front/article-meta/fpage"); - Optional lastPage = getXPathValue("/article/front/article-meta/lpage"); - Optional elocation = getXPathValue("/article/front/article-meta/elocation-id"); - Optional issn = getXPathValue("/article/front/journal-meta/issn[@pub-type='ppub']"); - - // copyright statement - Optional copyrightStatement = getXPathValue( - "/article/front/article-meta/permissions/copyright-statement"); - - // keywords - Optional> keywords = getXPathValues("/article/front/article-meta/kwd-group/kwd"); - - assert volume.isPresent(); - - Header header = new Header(nxmlDocumentParser.cas); - header.setSource("PubMed Central"); - header.setComponentId(PMCReader.class.getName()); - - pmcid.ifPresent(id -> header.setDocId(id.startsWith("PMC") ? 
id : "PMC" + id)); - pmid.ifPresent(p -> { - OtherID otherID = new OtherID(nxmlDocumentParser.cas); - otherID.setComponentId(PMCReader.class.getName()); - otherID.setId(p); - otherID.setSource("PubMed"); - FSArray otherIDs = new FSArray(nxmlDocumentParser.cas, 1); - otherIDs.set(0, otherID); - header.setOtherIDs(otherIDs); - }); - doi.ifPresent(header::setDoi); - - copyrightStatement.ifPresent(header::setCopyright); - - Journal journal = new Journal(nxmlDocumentParser.cas); - journal.setComponentId(PMCReader.class.getName()); - journalTitle.ifPresent(journal::setTitle); - abbrevJournalTitle.ifPresent(journal::setShortTitle); - volume.ifPresent(journal::setVolume); - issue.ifPresent(journal::setIssue); - issn.ifPresent(journal::setISSN); - String pages = null; - if (firstPage.isPresent() && lastPage.isPresent()) - pages = firstPage.get() + "--" + lastPage.get(); - else if (firstPage.isPresent()) - pages = firstPage.get(); - else if (elocation.isPresent()) - pages = elocation.get(); - journal.setPages(pages); - FSArray pubTypes = new FSArray(nxmlDocumentParser.cas, 1); - pubTypes.set(0, journal); - Date pubDate = new Date(nxmlDocumentParser.cas); - pubDate.setComponentId(PMCReader.class.getName()); - day.map(Integer::parseInt).ifPresent(pubDate::setDay); - month.map(Integer::parseInt).ifPresent(pubDate::setMonth); - year.map(Integer::parseInt).ifPresent(pubDate::setYear); - journal.setPubDate(pubDate); - header.setPubTypeList(pubTypes); - - // authors (more general: contributors; but for the moment we - // restrict ourselves to authors) - parseXPath("/article/front/article-meta/contrib-group").map(ElementParsingResult.class::cast) - .ifPresent(r -> { - // currently only authors - List authors = r.getSubResults().stream().map(ElementParsingResult.class::cast) - .map(e -> e.getAnnotation()).filter(AuthorInfo.class::isInstance) - .map(AuthorInfo.class::cast).collect(Collectors.toList()); - FSArray aiArray = new FSArray(nxmlDocumentParser.cas, authors.size()); - 
IntStream.range(0, authors.size()).forEach(i -> { - aiArray.set(i, authors.get(i)); - }); - if (aiArray.size() > 0) - header.setAuthors(aiArray); - }); - - frontResult.setAnnotation(header); - - if (keywords.isPresent()) { - List keywordList = keywords.get(); - FSArray fsArray = new FSArray(nxmlDocumentParser.cas, keywordList.size()); - IntStream.range(0, keywordList.size()).forEach(i -> { - Keyword keyword = new Keyword(nxmlDocumentParser.cas); - keyword.setComponentId(PMCReader.class.getName()); - keyword.setName(keywordList.get(i)); - fsArray.set(i, keyword); - }); - ManualDescriptor manualDescriptor = new ManualDescriptor(nxmlDocumentParser.cas); - manualDescriptor.setComponentId(PMCReader.class.getName()); - manualDescriptor.setKeywordList(fsArray); - manualDescriptor.addToIndexes(); - } - - } catch (XPathParseException | XPathEvalException | NavException e) { - throw new ElementParsingException(e); - } - } + public FrontParser(NxmlDocumentParser nxmlDocumentParser) { + super(nxmlDocumentParser); + elementName = "front"; + } + + @Override + protected void parseElement(ElementParsingResult frontResult) throws ElementParsingException { + try { + // Only handle the front matter of the actual article, not sub-articles + final String elementPath = getElementPath(); + if (!elementPath.endsWith("/article/front")) { + int firstIndexAfterElement = skipElement(); + frontResult.setLastTokenIndex(firstIndexAfterElement); + frontResult.setResultType(ParsingResult.ResultType.NONE); + return; + } + + // title and abstract + parseXPath("/article/front/article-meta/title-group/article-title").ifPresent(r -> { + ElementParsingResult er = (ElementParsingResult) r; + Title articleTitle = (Title) er.getAnnotation(); + articleTitle.setTitleType("document"); + frontResult.addSubResult(r); + }); + parseXPath("/article/front/article-meta/abstract").ifPresent(r -> { + ElementParsingResult er = (ElementParsingResult) r; + AbstractText abstractText = (AbstractText) er.getAnnotation(); + 
List abstractSections = er.getSubResultAnnotations(AbstractSection.class); + FSArray fsArray = new FSArray(nxmlDocumentParser.cas, abstractSections.size()); + IntStream.range(0, abstractSections.size()).forEach(i -> fsArray.set(i, abstractSections.get(i))); + abstractText.setStructuredAbstractParts(fsArray); + frontResult.addSubResult(r); + }); + + // article IDs + Optional pmid = getXPathValue("/article/front/article-meta/article-id[@pub-id-type='pmid']"); + Optional pmcid = getXPathValue("/article/front/article-meta/article-id[@pub-id-type='pmc']"); + Optional doi = getXPathValue("/article/front/article-meta/article-id[@pub-id-type='doi']"); + + // publication details + String pubType = ""; + String pubDateFmt = "/article/front/article-meta/pub-date[@pub-type='%s']"; + if (xPathExists(String.format(pubDateFmt, "epub"))) + pubType = "epub"; + else if (xPathExists(String.format(pubDateFmt, "ppub"))) + pubType = "ppub"; + else if (xPathExists(String.format(pubDateFmt, "pmc-release"))) + pubType = "pmc-release"; + Optional year = getXPathValue(String.format("/article/front/article-meta/pub-date[@pub-type='%s']/year", pubType)); + Optional month = getXPathValue(String.format("/article/front/article-meta/pub-date[@pub-type='%s']/month", pubType)); + Optional day = getXPathValue(String.format("/article/front/article-meta/pub-date[@pub-type='%s']/day", pubType)); + Optional journalTitle = nxmlDocumentParser.getTagset() == Tagset.NLM_2_3 || nxmlDocumentParser.getTagset() == Tagset.NLM_3_0 + ? getXPathValue("/article/front/journal-meta/journal-title") + : getXPathValue("/article/front/journal-meta/journal-title-group/journal-title"); + // there actually might be several abbreviated titles but here, we + // only use the first; our type system currently cannot represent + // more anyway. One could try to decide for a preferred one since the + // abbrev-type attribute disposes the source of the abbreviated + // title (e.g. publisher or nlm-ta). 
+ Optional abbrevJournalTitle = nxmlDocumentParser.getTagset() == Tagset.NLM_2_3 || nxmlDocumentParser.getTagset() == Tagset.NLM_3_0 + ? getXPathValue("/article/front/journal-meta/abbrev-journal-title") + : getXPathValue("/article/front/journal-meta/journal-title-group/abbrev-journal-title"); + Optional volume = getXPathValue("/article/front/article-meta/volume"); + Optional issue = getXPathValue("/article/front/article-meta/issue"); + Optional firstPage = getXPathValue("/article/front/article-meta/fpage"); + Optional lastPage = getXPathValue("/article/front/article-meta/lpage"); + Optional elocation = getXPathValue("/article/front/article-meta/elocation-id"); + Optional issn = getXPathValue("/article/front/journal-meta/issn[@pub-type='ppub']"); + + // copyright statement + Optional copyrightStatement = getXPathValue( + "/article/front/article-meta/permissions/copyright-statement"); + + // keywords + Optional> keywords = getXPathValues("/article/front/article-meta/kwd-group/kwd"); + + assert volume.isPresent(); + + Header header = new Header(nxmlDocumentParser.cas); + header.setSource("PubMed Central"); + header.setComponentId(PMCReader.class.getName()); + + pmcid.ifPresentOrElse(id -> header.setDocId(id.startsWith("PMC") ? id : "PMC" + id), () -> { + String filenameId = nxmlDocumentParser.getCurrentSource().toString().substring(nxmlDocumentParser.getCurrentSource().toString().lastIndexOf(File.separatorChar)+1, nxmlDocumentParser.getCurrentSource().toString().lastIndexOf('.')); + header.setDocId(filenameId.startsWith("PMC") ? 
filenameId : "PMC" + filenameId); + }); + pmid.ifPresent(p -> { + OtherID otherID = new OtherID(nxmlDocumentParser.cas); + otherID.setComponentId(PMCReader.class.getName()); + otherID.setId(p); + otherID.setSource("PubMed"); + FSArray otherIDs = new FSArray(nxmlDocumentParser.cas, 1); + otherIDs.set(0, otherID); + header.setOtherIDs(otherIDs); + }); + doi.ifPresent(header::setDoi); + + copyrightStatement.ifPresent(header::setCopyright); + + Journal journal = new Journal(nxmlDocumentParser.cas); + journal.setComponentId(PMCReader.class.getName()); + journalTitle.ifPresent(journal::setTitle); + abbrevJournalTitle.ifPresent(journal::setShortTitle); + volume.ifPresent(journal::setVolume); + issue.ifPresent(journal::setIssue); + issn.ifPresent(journal::setISSN); + String pages = null; + if (firstPage.isPresent() && lastPage.isPresent()) + pages = firstPage.get() + "--" + lastPage.get(); + else if (firstPage.isPresent()) + pages = firstPage.get(); + else if (elocation.isPresent()) + pages = elocation.get(); + journal.setPages(pages); + FSArray pubTypes = new FSArray(nxmlDocumentParser.cas, 1); + pubTypes.set(0, journal); + Date pubDate = new Date(nxmlDocumentParser.cas); + pubDate.setComponentId(PMCReader.class.getName()); + day.map(Integer::parseInt).ifPresent(pubDate::setDay); + month.map(Integer::parseInt).ifPresent(pubDate::setMonth); + year.map(Integer::parseInt).ifPresent(pubDate::setYear); + journal.setPubDate(pubDate); + header.setPubTypeList(pubTypes); + + // authors (more general: contributors; but for the moment we + // restrict ourselves to authors) + parseXPath("/article/front/article-meta/contrib-group").map(ElementParsingResult.class::cast) + .ifPresent(r -> { + // currently only authors + List authors = r.getSubResults().stream().map(ElementParsingResult.class::cast) + .map(e -> e.getAnnotation()).filter(AuthorInfo.class::isInstance) + .map(AuthorInfo.class::cast).collect(Collectors.toList()); + FSArray aiArray = new FSArray(nxmlDocumentParser.cas, 
authors.size()); + IntStream.range(0, authors.size()).forEach(i -> { + aiArray.set(i, authors.get(i)); + }); + if (aiArray.size() > 0) + header.setAuthors(aiArray); + }); + + frontResult.setAnnotation(header); + + if (keywords.isPresent()) { + List keywordList = keywords.get(); + FSArray fsArray = new FSArray(nxmlDocumentParser.cas, keywordList.size()); + IntStream.range(0, keywordList.size()).forEach(i -> { + Keyword keyword = new Keyword(nxmlDocumentParser.cas); + keyword.setComponentId(PMCReader.class.getName()); + keyword.setName(keywordList.get(i)); + fsArray.set(i, keyword); + }); + ManualDescriptor manualDescriptor = new ManualDescriptor(nxmlDocumentParser.cas); + manualDescriptor.setComponentId(PMCReader.class.getName()); + manualDescriptor.setKeywordList(fsArray); + manualDescriptor.addToIndexes(); + } + + } catch (XPathParseException | XPathEvalException | NavException e) { + throw new ElementParsingException(e); + } + } } diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java index 5285ee138..7bafb1a39 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/NxmlDocumentParser.java @@ -40,9 +40,15 @@ public class NxmlDocumentParser extends NxmlParser { private DefaultElementParser defaultElementParser; private Map> tagProperties; private Tagset tagset; + private Object currentSource; public void reset(File nxmlFile, JCas cas) throws DocumentParsingException { reset(nxmlFile.toURI(), cas); + currentSource = nxmlFile; + } + + public Object getCurrentSource() { + return currentSource; } public void reset(URI uri, JCas cas) throws DocumentParsingException { @@ -53,6 +59,7 @@ public void reset(URI uri, JCas cas) throws DocumentParsingException { if (gzipped) is = new GZIPInputStream(is); reset(is, 
cas); + currentSource = uri; } catch (IOException e) { throw new DocumentParsingException(e); } @@ -73,6 +80,7 @@ public void reset(InputStream is, JCas cas) throws DocumentParsingException { vn = vg.getNav(); setTagset(); setupParserRegistry(); + currentSource = ""; } catch (IOException | VTDException e) { throw new DocumentParsingException(e); } @@ -152,9 +160,14 @@ public Map getParserRegistry() { } public ElementParsingResult parse() throws ElementParsingException, DocumentParsingException { - String startingElement = moveToNextStartingTag(); - assert startingElement.equals("article") : "Did not encounter an article element as first start element"; - return getParser(startingElement).parse(); + try { + String startingElement = moveToNextStartingTag(); + assert startingElement.equals("article") : "Did not encounter an article element as first start element"; + return getParser(startingElement).parse(); + } catch (Exception e) { + log.error("Exception while parsing document from source {}", currentSource); + throw e; + } } public NxmlElementParser getParser(String tagName) { diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/SectionParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/SectionParser.java index 6283db703..787e0e8aa 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/SectionParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/SectionParser.java @@ -66,15 +66,17 @@ protected void parseElement(ElementParsingResult parsingResult) throws ElementPa if (!secTitleAnnotations.isEmpty()) sectionHeading = secTitleAnnotations.get(0); Section section = (Section) parsingResult.getAnnotation(); - section.setComponentId(PMCReader.class.getName()); - section.setSectionHeading(sectionHeading); - section.setDepth(depth); - section.setSectionId(sectionId); - List label = parsingResult.getSubResults("label"); - if (!label.isEmpty()) { - // there is only 
one label element - ElementParsingResult labelParsingResult = (ElementParsingResult) label.get(0); - section.setLabel(labelParsingResult.getResultText()); + if (section != null) { + section.setComponentId(PMCReader.class.getName()); + section.setSectionHeading(sectionHeading); + section.setDepth(depth); + section.setSectionId(sectionId); + List label = parsingResult.getSubResults("label"); + if (!label.isEmpty()) { + // there is only one label element + ElementParsingResult labelParsingResult = (ElementParsingResult) label.get(0); + section.setLabel(labelParsingResult.getResultText()); + } } } } catch (NavException e) { diff --git a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties-no-bib-refs.yml b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties-no-bib-refs.yml index 16d5355bb..09bc0123c 100644 --- a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties-no-bib-refs.yml +++ b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties-no-bib-refs.yml @@ -21,6 +21,8 @@ abstract: label: block-element: true type: de.julielab.jcore.types.Title + default-feature-values: + titleType: other paths: - path: list-item/label omit-element: true @@ -30,6 +32,10 @@ sec: paths: - path: abstract/sec type: de.julielab.jcore.types.AbstractSection + attributes: + - name: sec-type + value: supplementary-material + omit-element: true p: block-element: true type: de.julielab.jcore.types.Paragraph diff --git a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties.yml b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties.yml index 321ddf287..8037e3cf4 100644 --- a/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties.yml +++ 
b/jcore-pmc-reader/src/main/resources/de/julielab/jcore/reader/pmc/resources/elementproperties.yml @@ -24,6 +24,8 @@ abstract: label: block-element: true type: de.julielab.jcore.types.Title + default-feature-values: + titleType: other paths: - path: list-item/label omit-element: true @@ -33,6 +35,10 @@ sec: paths: - path: abstract/sec type: de.julielab.jcore.types.AbstractSection + attributes: + - name: sec-type + value: supplementary-material + omit-element: true p: block-element: true type: de.julielab.jcore.types.Paragraph From df18dfdc214adeb3fcae07b6bf19173667511de8 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 2 Mar 2022 14:43:35 +0100 Subject: [PATCH 156/269] Correct the XMI DB Multiplier name in its descriptor. --- .../julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml index bd4929ad1..fa909d57c 100644 --- a/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml +++ b/jcore-xmi-db-reader/src/main/resources/de/julielab/jcore/reader/xmi/desc/jcore-xmi-db-multiplier.xml @@ -4,7 +4,7 @@ true de.julielab.jcore.reader.xmi.XmiDBMultiplier - JCoRe Abstract Database Multiplier + JCoRe XMI Database Multiplier A multiplier that receives document IDs to read from a database table from the DBMultiplierReader. The reader also delivers the path to the corpus storage system (CoStoSys) configuration and additional tables for joining with the main data table. This multiplier class is abstract and cannot be used directly.Extending classes must implement the next() method to actually read documents from the database and populate CASes with them. This component is a part of the Jena Document Information System, JeDIS. 
2.6.0-SNAPSHOT JULIE Lab Jena, Germany From 4314bebd6138e85f02056fb033236749529f026d Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 7 Mar 2022 14:19:40 +0100 Subject: [PATCH 157/269] Descriptor enhancements. --- jcore-gnp-bioc-reader/component.meta | 6 +++--- .../jcore/reader/desc/jcore-bnp-bioc-multiplier.xml | 2 +- .../jcore/consumer/xmi/desc/jcore-xmi-db-writer.xml | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/jcore-gnp-bioc-reader/component.meta b/jcore-gnp-bioc-reader/component.meta index c3b3f6e0a..4bd445551 100644 --- a/jcore-gnp-bioc-reader/component.meta +++ b/jcore-gnp-bioc-reader/component.meta @@ -1,12 +1,12 @@ { "categories": [ - "ae", - "reader" + "reader", + "multiplier" ], "description": "A reader for the BioC format used by GNormPlus. Reads the text and the annotations, both species and genes.", "descriptors": [ { - "category": "ae", + "category": "multiplier", "location": "de.julielab.jcore.reader.desc.jcore-bnp-bioc-multiplier" }, { diff --git a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml index 2b64be30b..018d3db3b 100644 --- a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml +++ b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml @@ -20,7 +20,7 @@ true true - false + true \ No newline at end of file diff --git a/jcore-xmi-db-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-db-writer.xml b/jcore-xmi-db-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-db-writer.xml index b3b5afac1..01f4ca1e3 100644 --- a/jcore-xmi-db-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-db-writer.xml +++ b/jcore-xmi-db-writer/src/main/resources/de/julielab/jcore/consumer/xmi/desc/jcore-xmi-db-writer.xml @@ -190,6 
+190,7 @@ + From 1caec4e8444e6e1373a937cb8db4fc964d36eb77 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 7 Mar 2022 16:13:04 +0100 Subject: [PATCH 158/269] Let the BioC GNormPlus reader add max XMI IDs from the database on request. Fixes #133. --- jcore-gnp-bioc-reader/pom.xml | 7 ++- .../jcore/reader/BioCCasPopulator.java | 56 +++++++++++++++++-- .../reader/GNormPlusFormatMultiplier.java | 21 ++++++- .../desc/jcore-bnp-bioc-multiplier-reader.xml | 54 +++++++----------- .../reader/desc/jcore-bnp-bioc-multiplier.xml | 1 + .../jcore/reader/BioCCasPopulatorTest.java | 2 +- .../jcore/consumer/xmi/XmiDBWriterTest.java | 1 + 7 files changed, 99 insertions(+), 43 deletions(-) diff --git a/jcore-gnp-bioc-reader/pom.xml b/jcore-gnp-bioc-reader/pom.xml index 4ca0c48b5..ccbfad282 100644 --- a/jcore-gnp-bioc-reader/pom.xml +++ b/jcore-gnp-bioc-reader/pom.xml @@ -3,7 +3,7 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> 4.0.0 - jcore-bnp-bioc-reader + jcore-gnp-bioc-reader jar de.julielab @@ -51,6 +51,11 @@ ${jcore-utilities-version} test + + de.julielab + costosys + [1.6.0, ) + JCoRe GNormPlus BioC Reader diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java index bfd4474e0..fa76ad27c 100644 --- a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java @@ -2,6 +2,8 @@ import com.pengyifan.bioc.*; import com.pengyifan.bioc.io.BioCCollectionReader; +import de.julielab.costosys.dbconnection.CoStoSysConnection; +import de.julielab.costosys.dbconnection.DataBaseConnector; import de.julielab.jcore.types.*; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; @@ -11,9 +13,10 @@ import javax.xml.stream.XMLStreamException; import java.io.IOException; import 
java.nio.file.Path; -import java.util.Collection; -import java.util.Iterator; -import java.util.Optional; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.*; import java.util.stream.Stream; /** @@ -23,19 +26,51 @@ public class BioCCasPopulator { private final static Logger log = LoggerFactory.getLogger(BioCCasPopulator.class); private final BioCCollection bioCCollection; + private Map maxXmiIdMap; private int pos; - public BioCCasPopulator(Path biocCollectionPath) throws XMLStreamException, IOException { + public BioCCasPopulator(Path biocCollectionPath, Path costosysConfiguration, String documentsTable) throws XMLStreamException, IOException, SQLException { try (BioCCollectionReader bioCCollectionReader = new BioCCollectionReader(biocCollectionPath)) { bioCCollection = bioCCollectionReader.readCollection(); } + if (costosysConfiguration != null) { + maxXmiIdMap = new HashMap<>(); + DataBaseConnector dbc = new DataBaseConnector(costosysConfiguration.toString()); + try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { + retrieveMaxXmiIds(documentsTable, dbc, conn); + } + } pos = 0; } - public void populateWithNextDocument(JCas jCas) throws XMLStreamException, IOException { + private void retrieveMaxXmiIds(String documentsTable, DataBaseConnector dbc, CoStoSysConnection conn) throws SQLException { + log.debug("Retrieving the max XMI IDs for the current BioC collection of size {} from the database.", bioCCollection.getDocmentCount()); + Statement stmt = conn.createStatement(); + StringBuilder maxIdQueryBuilder = new StringBuilder(); + if (dbc.getActiveTableFieldConfiguration().getPrimaryKey().length > 1) + throw new IllegalArgumentException("The primary key of the active field schema '" + dbc.getActiveTableFieldConfiguration().getName() + "' is a compound key. 
Compound primary keys are currently not supported in this component."); + String pkString = dbc.getActiveTableFieldConfiguration().getPrimaryKeyString(); + maxIdQueryBuilder.append("SELECT ").append(pkString).append(",max_xmi_id FROM ").append(documentsTable).append(" WHERE ").append(pkString).append(" in ").append("("); + for (BioCDocument document : bioCCollection.getDocuments()) { + String docId = document.getID(); + maxIdQueryBuilder.append("'").append(docId).append("'").append(","); + } + // remove trailing comma + maxIdQueryBuilder.deleteCharAt(maxIdQueryBuilder.length() - 1); + maxIdQueryBuilder.append(")"); + String maxIdQuery = maxIdQueryBuilder.toString(); + ResultSet rs = stmt.executeQuery(maxIdQuery); + while (rs.next()) { + maxXmiIdMap.put(rs.getString(1), rs.getInt(2)); + } + log.debug("Obtained {} max XMI IDs.", maxXmiIdMap.size()); + } + + public void populateWithNextDocument(JCas jCas) { BioCDocument document = bioCCollection.getDocument(pos++); setDocumentId(jCas, document); setDocumentText(jCas, document); + setMaxXmiId(jCas, document); Iterator allAnnotations = Stream.concat(document.getAnnotations().stream(), document.getPassages().stream().map(BioCPassage::getAnnotations).flatMap(Collection::stream)).iterator(); for (BioCAnnotation annotation : (Iterable)() ->allAnnotations) { Optional type = annotation.getInfon("type"); @@ -56,6 +91,17 @@ public void populateWithNextDocument(JCas jCas) throws XMLStreamException, IOExc } } + private void setMaxXmiId(JCas jCas, BioCDocument document) { + if (maxXmiIdMap != null) { + Integer maxXmiId = maxXmiIdMap.get(document.getID()); + if (maxXmiId == null) + throw new IllegalStateException("No max XMI ID was obtained for the document with ID " + document.getID() + ". This means that this document is not already part of the database documents table. 
When adding annotations to existing database documents, make sure that all documents exist in the database already."); + XmiMetaData xmiMetaData = new XmiMetaData(jCas); + xmiMetaData.setMaxXmiId(maxXmiId); + xmiMetaData.addToIndexes(); + } + } + private void setDocumentId(JCas jCas, BioCDocument document) { Header h = new Header(jCas); h.setDocId(document.getID()); diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java index 5e7d71580..654c595dc 100644 --- a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java @@ -1,12 +1,15 @@ package de.julielab.jcore.reader; import de.julielab.jcore.types.casmultiplier.JCoReURI; +import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasMultiplier_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.AbstractCas; +import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -18,9 +21,25 @@ @ResourceMetaData(name="GNormPlusFormatMultiplier", description = "Multiplier for GNormPlusFormatMultiplierReader. Takes URIs pointing to BioC collection files that contain annotations created by GNormPlus. 
For each such file, reads all documents and returns CASes for them until all documents in all collections have been read into a CAS.") public class GNormPlusFormatMultiplier extends JCasMultiplier_ImplBase { private final static Logger log = LoggerFactory.getLogger(GNormPlusFormatMultiplier.class); + public static final String PARAM_COSTOSYS_CONFIG = "CostosysConfigFile"; + public static final String PARAM_XMI_DOCUMENTS_TABLE = "DocumentsTable"; private Iterator currentUriBatch; private BioCCasPopulator casPopulator; +@ConfigurationParameter(name=PARAM_COSTOSYS_CONFIG, mandatory = false, description = "Path to the CoStoSys configuration file that is used by the XMI DB writer in the same pipeline, if any. The XMI DB writer requires information about the XMI documents that are already in the database and should be updated with new annotations. The current highest XMI ID must be known to avoid ID collisions. To obtain the ID, it must be received from the database beforehand. This allows to retrieve the information batch wise instead of one-by-one which would be much slower.") + private String costosysConfiguration; +@ConfigurationParameter(name=PARAM_XMI_DOCUMENTS_TABLE, mandatory = false, description = "Required to retrieve the max XMI ID for use by the XMI DB writer. 
The schema-qualified name of the XMI document table that the XMI DB writer will write annotations into.") + private String documentsTable; + + @Override + public void initialize(UimaContext aContext) throws ResourceInitializationException { + super.initialize(aContext); + costosysConfiguration = (String) aContext.getConfigParameterValue(PARAM_COSTOSYS_CONFIG); + documentsTable = (String) aContext.getConfigParameterValue(PARAM_XMI_DOCUMENTS_TABLE); + if (costosysConfiguration == null ^ documentsTable == null) + throw new ResourceInitializationException(new IllegalArgumentException("Either both or none parameters must be defined: " + PARAM_COSTOSYS_CONFIG + ", " + PARAM_XMI_DOCUMENTS_TABLE)); + } + @Override public void process(JCas jCas) throws AnalysisEngineProcessException { try { @@ -39,7 +58,7 @@ public boolean hasNext() throws AnalysisEngineProcessException { if ((casPopulator == null || casPopulator.documentsLeftInCollection() == 0) && currentUriBatch.hasNext()) { URI nextUri = currentUriBatch.next(); try { - casPopulator = new BioCCasPopulator(Path.of(nextUri)); + casPopulator = new BioCCasPopulator(Path.of(nextUri), costosysConfiguration != null ? 
Path.of(costosysConfiguration) : null, documentsTable); } catch (Exception e) { log.error("Could not read from {}", nextUri, e); throw new AnalysisEngineProcessException(e); diff --git a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier-reader.xml b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier-reader.xml index 7081ae596..6095bb281 100644 --- a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier-reader.xml +++ b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier-reader.xml @@ -1,58 +1,42 @@ - + org.apache.uima.java - de.julielab.jcore.reader.GNormPlusFormatMultiplierReader - - JCoRe GNormPlus Format Multiplier Reader - A reader for the BioC XML format used by GNormPlus. Requires the matching multiplier. + true + de.julielab.jcore.reader.GNormPlusFormatMultiplier + + GNormPlusFormatMultiplier + Multiplier for GNormPlusFormatMultiplierReader. Takes URIs pointing to BioC collection files that contain annotations created by GNormPlus. For each such file, reads all documents and returns CASes for them until all documents in all collections have been read into a CAS. - InputPath - Path to a directory or file to be read. In case of a directory, all files ending in .xml will be read. + CostosysConfigFile + Path to the CoStoSys configuration file that is used by the XMI DB writer in the same pipeline, if any. The XMI DB writer requires information about the XMI documents that are already in the database and should be updated with new annotations. The current highest XMI ID must be known to avoid ID collisions. To obtain the ID, it must be received from the database beforehand. This allows to retrieve the information batch wise instead of one-by-one which would be much slower. 
String false - true - - - Recursive - Whether to read also the subdirectories of the input directory, if the input path points to a directory. - Boolean - false false - BatchSize - The number of XML file URI references to send to the CAS multipliers in each work assignment. Defaults to 20. - Integer + DocumentsTable + Required to retrieve the max XMI ID for use by the XMI DB writer. The schema-qualified name of the XMI document table that the XMI DB writer will write annotations into. + String false false - - - Recursive - - true - - - - BatchSize - - 20 - - - + + + + true - false - true + true + false - - \ No newline at end of file + + \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml index 018d3db3b..4981c441b 100644 --- a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml +++ b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml @@ -13,6 +13,7 @@ + diff --git a/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java index acea59b54..3b7e0dba5 100644 --- a/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java +++ b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java @@ -20,7 +20,7 @@ private JCas getJCas() throws Exception { @Test public void populateWithNextDocument() throws Exception { - BioCCasPopulator bioCCasPopulator = new BioCCasPopulator(Path.of("src", "test", "resources", "test-input-path", "bioc_collection_3.xml")); + BioCCasPopulator bioCCasPopulator = new BioCCasPopulator(Path.of("src", "test", "resources", "test-input-path", "bioc_collection_3.xml"), null, null); 
assertThat(bioCCasPopulator.documentsLeftInCollection()).isEqualTo(2); JCas jCas = getJCas(); bioCCasPopulator.populateWithNextDocument(jCas); diff --git a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java index fc93a2138..306ab2820 100644 --- a/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java +++ b/jcore-xmi-db-writer/src/test/java/de/julielab/jcore/consumer/xmi/XmiDBWriterTest.java @@ -354,4 +354,5 @@ public void testMirrorSubsetNotReset() throws Exception { } } } + } From a3402fab1b9b9cab77e19f7a7d2310340e294792 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 7 Mar 2022 17:02:27 +0100 Subject: [PATCH 159/269] Correct meta typo. --- jcore-gnp-bioc-reader/component.meta | 6 +-- .../reader/GNormPlusFormatMultiplier.java | 2 +- .../desc/jcore-bnp-bioc-multiplier-reader.xml | 54 ++++++++++++------- .../reader/desc/jcore-bnp-bioc-multiplier.xml | 21 ++++++-- 4 files changed, 57 insertions(+), 26 deletions(-) diff --git a/jcore-gnp-bioc-reader/component.meta b/jcore-gnp-bioc-reader/component.meta index 4bd445551..630d71b06 100644 --- a/jcore-gnp-bioc-reader/component.meta +++ b/jcore-gnp-bioc-reader/component.meta @@ -1,12 +1,12 @@ { "categories": [ "reader", - "multiplier" + "ae" ], "description": "A reader for the BioC format used by GNormPlus. 
Reads the text and the annotations, both species and genes.", "descriptors": [ { - "category": "multiplier", + "category": "ae", "location": "de.julielab.jcore.reader.desc.jcore-bnp-bioc-multiplier" }, { @@ -17,7 +17,7 @@ "exposable": true, "group": "general", "maven-artifact": { - "artifactId": "jcore-bnp-bioc-reader", + "artifactId": "jcore-gnp-bioc-reader", "groupId": "de.julielab", "version": "2.6.0-SNAPSHOT" }, diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java index 654c595dc..8cf8616cf 100644 --- a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java @@ -18,7 +18,7 @@ import java.util.Collection; import java.util.Iterator; -@ResourceMetaData(name="GNormPlusFormatMultiplier", description = "Multiplier for GNormPlusFormatMultiplierReader. Takes URIs pointing to BioC collection files that contain annotations created by GNormPlus. For each such file, reads all documents and returns CASes for them until all documents in all collections have been read into a CAS.") +@ResourceMetaData(name="JCoRe GNormPlus BioC Format Multiplier", description = "Multiplier for GNormPlusFormatMultiplierReader. Takes URIs pointing to BioC collection files that contain annotations created by GNormPlus. 
For each such file, reads all documents and returns CASes for them until all documents in all collections have been read into a CAS.") public class GNormPlusFormatMultiplier extends JCasMultiplier_ImplBase { private final static Logger log = LoggerFactory.getLogger(GNormPlusFormatMultiplier.class); public static final String PARAM_COSTOSYS_CONFIG = "CostosysConfigFile"; diff --git a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier-reader.xml b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier-reader.xml index 6095bb281..7081ae596 100644 --- a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier-reader.xml +++ b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier-reader.xml @@ -1,42 +1,58 @@ - + org.apache.uima.java - true - de.julielab.jcore.reader.GNormPlusFormatMultiplier - - GNormPlusFormatMultiplier - Multiplier for GNormPlusFormatMultiplierReader. Takes URIs pointing to BioC collection files that contain annotations created by GNormPlus. For each such file, reads all documents and returns CASes for them until all documents in all collections have been read into a CAS. + de.julielab.jcore.reader.GNormPlusFormatMultiplierReader + + JCoRe GNormPlus Format Multiplier Reader + A reader for the BioC XML format used by GNormPlus. Requires the matching multiplier. - CostosysConfigFile - Path to the CoStoSys configuration file that is used by the XMI DB writer in the same pipeline, if any. The XMI DB writer requires information about the XMI documents that are already in the database and should be updated with new annotations. The current highest XMI ID must be known to avoid ID collisions. To obtain the ID, it must be received from the database beforehand. This allows to retrieve the information batch wise instead of one-by-one which would be much slower. 
+ InputPath + Path to a directory or file to be read. In case of a directory, all files ending in .xml will be read. String false + true + + + Recursive + Whether to read also the subdirectories of the input directory, if the input path points to a directory. + Boolean + false false - DocumentsTable - Required to retrieve the max XMI ID for use by the XMI DB writer. The schema-qualified name of the XMI document table that the XMI DB writer will write annotations into. - String + BatchSize + The number of XML file URI references to send to the CAS multipliers in each work assignment. Defaults to 20. + Integer false false - + + + Recursive + + true + + + + BatchSize + + 20 + + + - - - true - true - false + false + true - - \ No newline at end of file + + \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml index 4981c441b..6f016f017 100644 --- a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml +++ b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml @@ -4,9 +4,24 @@ true de.julielab.jcore.reader.GNormPlusFormatMultiplier - GNormPlusFormatMultiplier + JCoRe GNormPlus BioC Format Multiplier Multiplier for GNormPlusFormatMultiplierReader. Takes URIs pointing to BioC collection files that contain annotations created by GNormPlus. For each such file, reads all documents and returns CASes for them until all documents in all collections have been read into a CAS. - + + + CostosysConfigFile + Path to the CoStoSys configuration file that is used by the XMI DB writer in the same pipeline, if any. The XMI DB writer requires information about the XMI documents that are already in the database and should be updated with new annotations. 
The current highest XMI ID must be known to avoid ID collisions. To obtain the ID, it must be received from the database beforehand. This allows to retrieve the information batch wise instead of one-by-one which would be much slower. + String + false + false + + + DocumentsTable + Required to retrieve the max XMI ID for use by the XMI DB writer. The schema-qualified name of the XMI document table that the XMI DB writer will write annotations into. + String + false + false + + @@ -21,7 +36,7 @@ true true - true + false \ No newline at end of file From 32d92a879b70e029b6c7b16046adbb7ef3e7631a Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 7 Mar 2022 17:10:37 +0100 Subject: [PATCH 160/269] Set the BioC GNP multiplier again as outputting new CASes. --- jcore-gnp-bioc-reader/component.meta | 4 ++-- .../julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/jcore-gnp-bioc-reader/component.meta b/jcore-gnp-bioc-reader/component.meta index 630d71b06..91006a646 100644 --- a/jcore-gnp-bioc-reader/component.meta +++ b/jcore-gnp-bioc-reader/component.meta @@ -1,12 +1,12 @@ { "categories": [ "reader", - "ae" + "multiplier" ], "description": "A reader for the BioC format used by GNormPlus. 
Reads the text and the annotations, both species and genes.", "descriptors": [ { - "category": "ae", + "category": "multiplier", "location": "de.julielab.jcore.reader.desc.jcore-bnp-bioc-multiplier" }, { diff --git a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml index 6f016f017..4b329d06a 100644 --- a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml +++ b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml @@ -36,7 +36,7 @@ true true - false + true \ No newline at end of file From 6f4b8b9f18baf6cec065896916ed6184ed8c6e4c Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 7 Mar 2022 17:38:55 +0100 Subject: [PATCH 161/269] Add the sofa map besides the xmi IDs with the GNP reader. --- .../jcore/reader/BioCCasPopulator.java | 27 ++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java index fa76ad27c..81320a24c 100644 --- a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java @@ -7,6 +7,7 @@ import de.julielab.jcore.types.*; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; +import org.apache.uima.jcas.cas.StringArray; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,6 +18,7 @@ import java.sql.SQLException; import java.sql.Statement; import java.util.*; +import java.util.stream.Collectors; import java.util.stream.Stream; /** @@ -27,6 +29,7 @@ public class BioCCasPopulator { private final static Logger log = LoggerFactory.getLogger(BioCCasPopulator.class); private final 
BioCCollection bioCCollection; private Map maxXmiIdMap; + private Map sofaMaps; private int pos; public BioCCasPopulator(Path biocCollectionPath, Path costosysConfiguration, String documentsTable) throws XMLStreamException, IOException, SQLException { @@ -35,22 +38,23 @@ public BioCCasPopulator(Path biocCollectionPath, Path costosysConfiguration, Str } if (costosysConfiguration != null) { maxXmiIdMap = new HashMap<>(); + sofaMaps = new HashMap<>(); DataBaseConnector dbc = new DataBaseConnector(costosysConfiguration.toString()); try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { - retrieveMaxXmiIds(documentsTable, dbc, conn); + retrieveXmiMetaData(documentsTable, dbc, conn); } } pos = 0; } - private void retrieveMaxXmiIds(String documentsTable, DataBaseConnector dbc, CoStoSysConnection conn) throws SQLException { + private void retrieveXmiMetaData(String documentsTable, DataBaseConnector dbc, CoStoSysConnection conn) throws SQLException { log.debug("Retrieving the max XMI IDs for the current BioC collection of size {} from the database.", bioCCollection.getDocmentCount()); Statement stmt = conn.createStatement(); StringBuilder maxIdQueryBuilder = new StringBuilder(); if (dbc.getActiveTableFieldConfiguration().getPrimaryKey().length > 1) throw new IllegalArgumentException("The primary key of the active field schema '" + dbc.getActiveTableFieldConfiguration().getName() + "' is a compound key. 
Compound primary keys are currently not supported in this component."); String pkString = dbc.getActiveTableFieldConfiguration().getPrimaryKeyString(); - maxIdQueryBuilder.append("SELECT ").append(pkString).append(",max_xmi_id FROM ").append(documentsTable).append(" WHERE ").append(pkString).append(" in ").append("("); + maxIdQueryBuilder.append("SELECT ").append(pkString).append(",max_xmi_id,sofa_mapping FROM ").append(documentsTable).append(" WHERE ").append(pkString).append(" in ").append("("); for (BioCDocument document : bioCCollection.getDocuments()) { String docId = document.getID(); maxIdQueryBuilder.append("'").append(docId).append("'").append(","); @@ -62,6 +66,10 @@ private void retrieveMaxXmiIds(String documentsTable, DataBaseConnector dbc, CoS ResultSet rs = stmt.executeQuery(maxIdQuery); while (rs.next()) { maxXmiIdMap.put(rs.getString(1), rs.getInt(2)); + sofaMaps.put(rs.getString(1), rs.getString(3)); + } + if (log.isTraceEnabled()) { + log.trace("XMI ID map sample: {}", maxXmiIdMap.entrySet().stream().limit(10).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))); } log.debug("Obtained {} max XMI IDs.", maxXmiIdMap.size()); } @@ -94,10 +102,23 @@ public void populateWithNextDocument(JCas jCas) { private void setMaxXmiId(JCas jCas, BioCDocument document) { if (maxXmiIdMap != null) { Integer maxXmiId = maxXmiIdMap.get(document.getID()); + String mappingString = sofaMaps.get(document.getID()); if (maxXmiId == null) throw new IllegalStateException("No max XMI ID was obtained for the document with ID " + document.getID() + ". This means that this document is not already part of the database documents table. When adding annotations to existing database documents, make sure that all documents exist in the database already."); XmiMetaData xmiMetaData = new XmiMetaData(jCas); xmiMetaData.setMaxXmiId(maxXmiId); + String[] mappings = mappingString != null ? 
mappingString.split("\\|") : null; + StringArray mappingsArray = null; + if (mappings != null) { + mappingsArray = new StringArray(jCas, mappings.length); + for (int i = 0; i < mappings.length; i++) { + String mapping = mappings[i]; + mappingsArray.set(i, mapping); + log.trace("Retrieved sofa_id_mapping {} for document {}.", mappingsArray.get(i), document.getID()); + } + } + if (mappingsArray != null) + xmiMetaData.setSofaIdMappings(mappingsArray); xmiMetaData.addToIndexes(); } } From 8d300903d7bb51f08a336eb2faf11b7126b7282e Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 7 Mar 2022 17:45:07 +0100 Subject: [PATCH 162/269] Add biology types to GNP BioC multiplier. --- .../main/java/de/julielab/jcore/reader/BioCCasPopulator.java | 4 ++-- .../julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java index 81320a24c..49003430c 100644 --- a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java @@ -69,7 +69,8 @@ private void retrieveXmiMetaData(String documentsTable, DataBaseConnector dbc, C sofaMaps.put(rs.getString(1), rs.getString(3)); } if (log.isTraceEnabled()) { - log.trace("XMI ID map sample: {}", maxXmiIdMap.entrySet().stream().limit(10).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))); + log.trace("XMI ID sample: {}", maxXmiIdMap.entrySet().stream().limit(10).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))); + log.trace("Sofa map sample: {}", sofaMaps.entrySet().stream().limit(10).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))); } log.debug("Obtained {} max XMI IDs.", maxXmiIdMap.size()); } @@ -114,7 +115,6 @@ private void setMaxXmiId(JCas jCas, BioCDocument 
document) { for (int i = 0; i < mappings.length; i++) { String mapping = mappings[i]; mappingsArray.set(i, mapping); - log.trace("Retrieved sofa_id_mapping {} for document {}.", mappingsArray.get(i), document.getID()); } } if (mappingsArray != null) diff --git a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml index 4b329d06a..8ee9bab7a 100644 --- a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml +++ b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml @@ -29,6 +29,7 @@ + From aeb6ac63d21292f6ab99d60be2710b178acb5f9d Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 14 Mar 2022 17:43:22 +0100 Subject: [PATCH 163/269] Fix a bug where the processed doc IDs in the XmiDateInserter were not cleared. The clearing was missing when the pipeline did not start with the DB reader reading from a subset table. 
--- .../jcore/ae/flairner/FlairNerAnnotator.java | 4 +++- .../jcore/reader/BioCCasPopulator.java | 12 +++++++++++ .../reader/GNormPlusFormatMultiplier.java | 21 +++++++++++++++---- .../GNormPlusFormatMultiplierReader.java | 3 +++ .../reader/desc/jcore-bnp-bioc-multiplier.xml | 13 ++++++++++-- .../jcore/consumer/xmi/XmiDataInserter.java | 14 ++++++++++--- 6 files changed, 57 insertions(+), 10 deletions(-) diff --git a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java index 04d65d3cf..cf36e6c22 100644 --- a/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java +++ b/jcore-flair-ner-ae/src/main/java/de/julielab/jcore/ae/flairner/FlairNerAnnotator.java @@ -24,6 +24,7 @@ import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.DoubleArray; import org.apache.uima.jcas.tcas.Annotation; @@ -174,7 +175,8 @@ public void process(final JCas aJCas) throws AnalysisEngineProcessException { } JCoReOverlapAnnotationIndex intRefIndex = new JCoReOverlapAnnotationIndex<>(aJCas, InternalReference.type); final AnnotationAdderHelper helper = new AnnotationAdderHelper(); - log.trace("Sending document sentences to flair for entity tagging."); + if (log.isTraceEnabled()) + log.trace("Sending document sentences to flair for entity tagging: {}", JCasUtil.select(aJCas, Sentence.class).stream().map(Sentence::getCoveredText).collect(Collectors.toList())); final NerTaggingResponse taggingResponse = connector.tagSentences(StreamSupport.stream(sentIndex.spliterator(), false)); final List taggedEntities = taggingResponse.getTaggedEntities(); for (TaggedEntity entity : taggedEntities) { diff --git 
a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java index 49003430c..75f58fa02 100644 --- a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java @@ -162,6 +162,10 @@ private void setDocumentText(JCas jCas, BioCDocument document) { passageAnnotation = new Title(jCas, offset, passageEnd); ((Title) passageAnnotation).setTitleType("table"); break; + case "other_title": + passageAnnotation = new Title(jCas, offset, passageEnd); + ((Title) passageAnnotation).setTitleType("other"); + break; case "abstract": passageAnnotation = new AbstractText(jCas, offset, passageEnd); break; @@ -224,4 +228,12 @@ private void addGeneAnnotation(BioCAnnotation annotation, JCas jCas) throws Miss public int documentsLeftInCollection() { return bioCCollection.getDocmentCount() - pos; } + + public long getCollectionTextLength() { + return bioCCollection.getDocuments().stream().map(BioCDocument::getPassages).flatMap(Collection::stream).mapToInt(passage -> passage.getText().orElse("").length()).sum(); + } + + public int getNumDocumentsInCollection() { + return bioCCollection.getDocmentCount(); + } } diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java index 8cf8616cf..1739e461e 100644 --- a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplier.java @@ -7,6 +7,7 @@ import org.apache.uima.cas.AbstractCas; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; import 
org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; @@ -15,22 +16,27 @@ import java.net.URI; import java.nio.file.Path; +import java.text.DecimalFormat; import java.util.Collection; import java.util.Iterator; -@ResourceMetaData(name="JCoRe GNormPlus BioC Format Multiplier", description = "Multiplier for GNormPlusFormatMultiplierReader. Takes URIs pointing to BioC collection files that contain annotations created by GNormPlus. For each such file, reads all documents and returns CASes for them until all documents in all collections have been read into a CAS.") +@ResourceMetaData(name = "JCoRe GNormPlus BioC Format Multiplier", description = "Multiplier for GNormPlusFormatMultiplierReader. Takes URIs pointing to BioC collection files that contain annotations created by GNormPlus. For each such file, reads all documents and returns CASes for them until all documents in all collections have been read into a CAS.") +@TypeCapability(outputs = {"de.julielab.jcore.types.Gene", "de.julielab.jcore.types.Organism"}) public class GNormPlusFormatMultiplier extends JCasMultiplier_ImplBase { - private final static Logger log = LoggerFactory.getLogger(GNormPlusFormatMultiplier.class); public static final String PARAM_COSTOSYS_CONFIG = "CostosysConfigFile"; public static final String PARAM_XMI_DOCUMENTS_TABLE = "DocumentsTable"; + private final static Logger log = LoggerFactory.getLogger(GNormPlusFormatMultiplier.class); private Iterator currentUriBatch; private BioCCasPopulator casPopulator; + private DecimalFormat df = new DecimalFormat(); -@ConfigurationParameter(name=PARAM_COSTOSYS_CONFIG, mandatory = false, description = "Path to the CoStoSys configuration file that is used by the XMI DB writer in the same pipeline, if any. The XMI DB writer requires information about the XMI documents that are already in the database and should be updated with new annotations. 
The current highest XMI ID must be known to avoid ID collisions. To obtain the ID, it must be received from the database beforehand. This allows to retrieve the information batch wise instead of one-by-one which would be much slower.") + @ConfigurationParameter(name = PARAM_COSTOSYS_CONFIG, mandatory = false, description = "Path to the CoStoSys configuration file that is used by the XMI DB writer in the same pipeline, if any. The XMI DB writer requires information about the XMI documents that are already in the database and should be updated with new annotations. The current highest XMI ID must be known to avoid ID collisions. To obtain the ID, it must be received from the database beforehand. This allows to retrieve the information batch wise instead of one-by-one which would be much slower.") private String costosysConfiguration; -@ConfigurationParameter(name=PARAM_XMI_DOCUMENTS_TABLE, mandatory = false, description = "Required to retrieve the max XMI ID for use by the XMI DB writer. The schema-qualified name of the XMI document table that the XMI DB writer will write annotations into.") + @ConfigurationParameter(name = PARAM_XMI_DOCUMENTS_TABLE, mandatory = false, description = "Required to retrieve the max XMI ID for use by the XMI DB writer. 
The schema-qualified name of the XMI document table that the XMI DB writer will write annotations into.") private String documentsTable; + private long lastTimeStamp; + @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); @@ -38,6 +44,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept documentsTable = (String) aContext.getConfigParameterValue(PARAM_XMI_DOCUMENTS_TABLE); if (costosysConfiguration == null ^ documentsTable == null) throw new ResourceInitializationException(new IllegalArgumentException("Either both or none parameters must be defined: " + PARAM_COSTOSYS_CONFIG + ", " + PARAM_XMI_DOCUMENTS_TABLE)); + lastTimeStamp = 0; } @Override @@ -58,6 +65,12 @@ public boolean hasNext() throws AnalysisEngineProcessException { if ((casPopulator == null || casPopulator.documentsLeftInCollection() == 0) && currentUriBatch.hasNext()) { URI nextUri = currentUriBatch.next(); try { + if (log.isDebugEnabled() && lastTimeStamp != 0) { + long collectionTextLength = casPopulator.getCollectionTextLength(); + long passedMillis = System.currentTimeMillis() - lastTimeStamp; + log.debug("Last document batch of size {} processing time: {}s for text length of {} characters; that is {}ms per character.", casPopulator.getNumDocumentsInCollection(), passedMillis / 1000, collectionTextLength, df.format((double)passedMillis/collectionTextLength)); + } + lastTimeStamp = System.currentTimeMillis(); casPopulator = new BioCCasPopulator(Path.of(nextUri), costosysConfiguration != null ? 
Path.of(costosysConfiguration) : null, documentsTable); } catch (Exception e) { log.error("Could not read from {}", nextUri, e); diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReader.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReader.java index dc04596e4..019437c25 100644 --- a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReader.java +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/GNormPlusFormatMultiplierReader.java @@ -77,6 +77,9 @@ public void getNext(JCas jCas) throws CollectionException { throw new CollectionException(e); } completed++; + if (completed % 10 == 0) { + log.debug("{} input files read", completed); + } } } diff --git a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml index 8ee9bab7a..15f62b47b 100644 --- a/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml +++ b/jcore-gnp-bioc-reader/src/main/resources/de/julielab/jcore/reader/desc/jcore-bnp-bioc-multiplier.xml @@ -33,11 +33,20 @@ - + + + + + de.julielab.jcore.types.Gene + de.julielab.jcore.types.Organism + + + + true true - true + false \ No newline at end of file diff --git a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java index 390e27e67..1cad80ec4 100644 --- a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java +++ b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java @@ -16,6 +16,7 @@ import java.sql.BatchUpdateException; import java.sql.PreparedStatement; import java.sql.SQLException; +import java.text.DecimalFormat; import java.util.*; 
import java.util.function.Function; import java.util.function.Predicate; @@ -34,6 +35,7 @@ public class XmiDataInserter { private Map maxXmiIdMap; private String componentDbName; private String hashColumnName; + private DecimalFormat df = new DecimalFormat(); private List processedDocumentIds; @@ -97,7 +99,7 @@ public boolean hasNext() { @Override public Map next() { - Map row = new HashMap(); + Map row = new HashMap<>(); final DocumentId docId = docIdIterator.next(); // There might actually be no data when we only write the SHA hashes final List dataList = dataByDoc.getOrDefault(docId, Collections.emptyList()); @@ -166,6 +168,7 @@ public Map next() { row.put(hashColumnName, hash); log.trace("{}={}", hashColumnName, hash); } + System.out.println("XmiInserter: " + row); return row; } @@ -175,7 +178,9 @@ public void remove() { } } + long time = System.currentTimeMillis(); try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { + log.debug("Obtained connection after {}ms", System.currentTimeMillis()-time); conn.setAutoCommit(false); // This is the private in-line defined class from above. 
All values are already contained in the class @@ -199,6 +204,7 @@ public void remove() { throw new XmiDataInsertionException(e); } setLastComponent(conn, subsetTableName); + processedDocumentIds.clear(); log.debug("Committing XMI data to database."); conn.commit(); maxXmiIdMap.clear(); @@ -209,6 +215,10 @@ public void remove() { if (null != ne) ne.printStackTrace(); } + if (log.isDebugEnabled()) { + time = System.currentTimeMillis() - time; + log.debug("Database import of {} XMI documents took {}ms ({}ms per document)", documentIdsWithData.size(), time, df.format((double) time / documentIdsWithData.size())); + } } /** @@ -261,8 +271,6 @@ private void setLastComponent(CoStoSysConnection conn, String subsetTableName) t else nextException.printStackTrace(); throw new XmiDataInsertionException(nextException); - } finally { - processedDocumentIds.clear(); } } From 086f095f6e2514016f3cb815dd955cc58891f305 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 14 Mar 2022 17:47:40 +0100 Subject: [PATCH 164/269] Remove debug log message. --- .../java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java | 1 - 1 file changed, 1 deletion(-) diff --git a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java index 1cad80ec4..bcbe2f439 100644 --- a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java +++ b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java @@ -168,7 +168,6 @@ public Map next() { row.put(hashColumnName, hash); log.trace("{}={}", hashColumnName, hash); } - System.out.println("XmiInserter: " + row); return row; } From f973d8d7c8f7040e56bb123f31d1fea48526a2ce Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 16 Mar 2022 16:32:47 +0100 Subject: [PATCH 165/269] Omit annotation deletion for unchanged documents. 
--- .../jcore/ae/biosem/BioSemEventAnnotator.java | 13 ++++++++- .../jcore/consumer/xmi/XMIDBWriter.java | 16 ++++++++--- .../jcore/consumer/xmi/XmiDataInserter.java | 28 ++++++++++--------- 3 files changed, 39 insertions(+), 18 deletions(-) diff --git a/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java b/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java index 12720ec9d..e263b203f 100644 --- a/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java +++ b/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java @@ -17,7 +17,9 @@ import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException; import org.apache.uima.cas.FSIterator; +import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ExternalResource; +import org.apache.uima.fit.descriptor.ResourceMetaData; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; @@ -36,11 +38,13 @@ import java.util.*; import java.util.Map.Entry; +@ResourceMetaData(name="JCoRe BioSem Event Annotator", description = "Adds annotations for event triggers and events according to the BioNLP Shared Task event definition.") @TypeCapability(inputs = {"de.julielab.jcore.types.Gene"}, outputs = {"de.julielab.jcore.types.EventTrigger", "de.julielab.jcore.types.EventMention"}) public class BioSemEventAnnotator extends JCasAnnotator_ImplBase { private final static Logger log = LoggerFactory.getLogger(BioSemEventAnnotator.class); + public static final String PARAM_COMPONENT_ID = "ComponentId"; public final static String RESOURCE_TRAINED_DB = "TrainedDB"; private DataLoader loader; @@ -49,6 +53,8 @@ public class BioSemEventAnnotator extends JCasAnnotator_ImplBase { @ExternalResource(key = RESOURCE_TRAINED_DB) private 
DBUtilsProvider dbUtilsProvider; + @ConfigurationParameter(name=PARAM_COMPONENT_ID, mandatory = false, defaultValue = "BioSemEventAnnotator", description = "Optional. If set, the 'componentId' feature of the created annotations will be set to the value of this parameter.") + private String componentId; private EventExtraction xtr; @@ -66,6 +72,7 @@ public class BioSemEventAnnotator extends JCasAnnotator_ImplBase { public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); try { + componentId = (String) aContext.getConfigParameterValue(PARAM_COMPONENT_ID); dbUtilsProvider = (DBUtilsProvider) aContext.getResourceObject(RESOURCE_TRAINED_DB); trainedDb = dbUtilsProvider.getTrainedDatabase(); } catch (ResourceAccessException e) { @@ -200,6 +207,7 @@ private EventMention addEventToIndexes(PData event, Map proteinMap PData eventArg1 = event.getPdata1(); PData eventArg2 = event.getPdata2(); uimaEvent = new EventMention(aJCas, begin, end); + uimaEvent.setComponentId(componentId); uimaEvent.setId(event.PID); uimaEvent.setSpecificType(uimaTrigger.getSpecificType()); uimaEvent.setTrigger(uimaTrigger); @@ -281,6 +289,7 @@ private void addUimaEventArgument(EventMention uimaEvent, Object bioSemArg, int // if we don't want to use the writer). 
protein.setSpecificType("protein"); uimaArg = new ArgumentMention(aJCas, protein.getBegin(), protein.getEnd()); + uimaArg.setComponentId(componentId); uimaArg.setRef(protein); uimaArg.setRole(determineArgumentRole(uimaEvent, uimaArg, argPos)); } else if (bioSemArg instanceof PData) { @@ -295,9 +304,10 @@ private void addUimaEventArgument(EventMention uimaEvent, Object bioSemArg, int } if (null == uimaEventArg) { throw new IllegalStateException("Creating UIMA EventMention annotation for BioSem event \"" - + eventArg.toString() + "\" failed, the UIMA EventMention is null."); + + eventArg + "\" failed, the UIMA EventMention is null."); } uimaArg = new ArgumentMention(aJCas, uimaEventArg.getBegin(), uimaEventArg.getEnd()); + uimaArg.setComponentId(componentId); uimaArg.setRef(uimaEventArg); uimaArg.setRole(determineArgumentRole(uimaEvent, uimaArg, argPos)); } else { @@ -361,6 +371,7 @@ private EventTrigger addTriggerToIndexes(Word trg, JCas aJCas) { int end = trg.locs[1]; String type = trg.type; EventTrigger uimaTrigger = new EventTrigger(aJCas, begin, end); + uimaTrigger.setComponentId(componentId); uimaTrigger.setId(id); uimaTrigger.setSpecificType(type); return uimaTrigger; diff --git a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java index 8a085cf8b..ef6d7735c 100644 --- a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java +++ b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java @@ -252,6 +252,7 @@ public class XMIDBWriter extends JCasAnnotator_ImplBase { private String documentItemToHash; private Map shaMap; private Set mirrorResetIds; + private Set unchangedDocuments; private String mappingCacheKey; private DocumentReleaseCheckpoint docReleaseCheckpoint; private List currentDocumentIdBatch; @@ -426,6 +427,7 @@ public void initialize(UimaContext aContext) throws 
ResourceInitializationExcept this.binaryEncoder = new BinaryJeDISNodeEncoder(); } mirrorResetIds = new HashSet<>(); + unchangedDocuments = new HashSet<>(); log.info(XMIDBWriter.class.getName() + " initialized."); log.info("Effective document table name: {}", effectiveDocTableName); @@ -519,6 +521,8 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { Optional metaData = metaDatas.stream().findAny(); DocumentId docId = getDocumentId(aJCas, metaData); setMirrorResetStateForDocId(docId, metaData); + if (metaData.isPresent() && metaData.get().getIsDocumentHashUnchanged()) + unchangedDocuments.add(docId); if (docId == null) { log.warn("The current document does not have a document ID. It is omitted from database import."); return; @@ -571,8 +575,10 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { private void setMirrorResetStateForDocId(DocumentId docId, Optional metaData) { if (metaData.isPresent()) { // mirror subset reset is only necessary if we store the base document in any way; - // additionally, we check if the document text hash key is reported to by different to its already - // existing database entry. Only then the mirror subsets should be reset for this document. + // additionally, we check if the document text hash key is reported to be different to its already + // existing database entry. Only then the mirror subsets should be reset for this document because only + // then a re-processing of the document makes sense. + // The isDocumentHashUnchanged feature is set by the XMLDBMultiplier. 
if (storeBaseDocument && !metaData.get().getIsDocumentHashUnchanged()) mirrorResetIds.add(docId); } else { @@ -1022,7 +1028,7 @@ public void batchProcessComplete() throws AnalysisEngineProcessException { final boolean readyToSendData = processXmiBuffer(); if (readyToSendData) { if (!(featuresToMapDryRun && useBinaryFormat)) - annotationInserter.sendXmiDataToDatabase(effectiveDocTableName, annotationModules, subsetTable, mirrorResetIds, deleteObsolete, shaMap); + annotationInserter.sendXmiDataToDatabase(effectiveDocTableName, annotationModules, subsetTable, mirrorResetIds, unchangedDocuments, deleteObsolete, shaMap); else log.info("The dry run to see details about features to be mapped in the binary format is activated. No contents are written into the database."); log.trace("Clearing {} annotation modules", annotationModules.size()); @@ -1033,6 +1039,7 @@ public void batchProcessComplete() throws AnalysisEngineProcessException { docReleaseCheckpoint.release(jedisSyncKey, currentDocumentIdBatch.stream()); currentDocumentIdBatch.clear(); mirrorResetIds.clear(); + unchangedDocuments.clear(); } } catch (XmiDataInsertionException e) { throw new AnalysisEngineProcessException(e); @@ -1052,7 +1059,7 @@ public void collectionProcessComplete() throws AnalysisEngineProcessException { try { processXmiBuffer(); if (!(featuresToMapDryRun && useBinaryFormat)) - annotationInserter.sendXmiDataToDatabase(effectiveDocTableName, annotationModules, subsetTable, mirrorResetIds, deleteObsolete, shaMap); + annotationInserter.sendXmiDataToDatabase(effectiveDocTableName, annotationModules, subsetTable, mirrorResetIds, unchangedDocuments, deleteObsolete, shaMap); else log.info("The dry run to see details about features to be mapped in the binary format is activated. 
No contents are written into the database."); annotationModules.clear(); @@ -1062,6 +1069,7 @@ public void collectionProcessComplete() throws AnalysisEngineProcessException { docReleaseCheckpoint.release(jedisSyncKey, currentDocumentIdBatch.stream()); currentDocumentIdBatch.clear(); mirrorResetIds.clear(); + unchangedDocuments.clear(); } catch (XmiDataInsertionException e) { throw new AnalysisEngineProcessException(e); } diff --git a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java index bcbe2f439..d561432fe 100644 --- a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java +++ b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XmiDataInserter.java @@ -63,35 +63,35 @@ public XmiDataInserter(Set annotationModuleColumnNames, * * @param annotationModules * @param mirrorResetIds + * @param unchangedDocuments * @param deleteObsolete * @param shaMap * @throws XmiDataInsertionException * @throws AnalysisEngineProcessException */ - public void sendXmiDataToDatabase(String xmiTableName, List annotationModules, String subsetTableName, Set mirrorResetIds, Boolean deleteObsolete, Map shaMap) throws XmiDataInsertionException { + public void sendXmiDataToDatabase(String xmiTableName, List annotationModules, String subsetTableName, Set mirrorResetIds, Set unchangedDocuments, Boolean deleteObsolete, Map shaMap) throws XmiDataInsertionException { log.trace("Sending {} XMI data items", annotationModules.size()); final Map> dataByDoc = annotationModules.stream().collect(Collectors.groupingBy(XmiData::getDocId)); // Collect all document IDs we want to add something for into the database. This can be annotations or the hash. - final Set documentIdsWithData = shaMap != null ? Sets.union(dataByDoc.keySet(), shaMap.keySet()) : dataByDoc.keySet(); + final Set documentIdsWithData = shaMap != null ? 
Sets.union(dataByDoc.keySet(), shaMap.keySet()) : dataByDoc.keySet(); log.trace("There are {} documents with values to be updated in the database.", documentIdsWithData.size()); class RowIterator implements Iterator> { + // Add documents that have been processed but no data. We need to do this to override potentially existing + // annotation values with null to remove them. + private Iterator docIdIterator; + private FieldConfig fieldConfig = dbc.getFieldConfiguration(schemaDocument); + private List> fields = fieldConfig.getFields(); /** * An iterator that always returns only rows for a subset of document IDs. Either the ones that need mirror subsets to be reset or those for which mirror subsets should not be reset. * @param returnDocumentsWithMirrorReset */ public RowIterator(boolean returnDocumentsWithMirrorReset) { - Predicate mirrorResetFilterPredicate = docId -> mirrorResetIds.contains(docId); + Predicate mirrorResetFilterPredicate = docId -> !unchangedDocuments.contains(docId); if (!returnDocumentsWithMirrorReset) mirrorResetFilterPredicate = Predicate.not(mirrorResetFilterPredicate); docIdIterator = Stream.concat(documentIdsWithData.stream(), processedDocumentIds.stream()).filter(mirrorResetFilterPredicate).distinct().iterator(); } - // Add documents that have been processed but no data. We need to do this to override potentially existing - // annotation values with null to remove them. - private Iterator docIdIterator; - private FieldConfig fieldConfig = dbc.getFieldConfiguration(schemaDocument); - private List> fields = fieldConfig.getFields(); - @Override public boolean hasNext() { return docIdIterator.hasNext(); @@ -153,7 +153,9 @@ public Map next() { missingColumns.forEach(c -> row.put(c, null)); } // Set columns without a value to null to delete a potentially existing value. - if (updateMode) { + // But only if the document text had changed. Otherwise we would just delete all the annotations we + // actually want to keep. 
+ if (updateMode && !unchangedDocuments.contains(docId)) { Set annotationColumnsWithValues = dataList.stream().map(XmiData::getColumnName).collect(Collectors.toSet()); log.trace("Annotation columns with values: {}", annotationColumnsWithValues); final Sets.SetView columnsWithoutValues = Sets.difference(annotationModuleColumnNames, annotationColumnsWithValues); @@ -179,7 +181,7 @@ public void remove() { long time = System.currentTimeMillis(); try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { - log.debug("Obtained connection after {}ms", System.currentTimeMillis()-time); + log.debug("Obtained connection after {}ms", System.currentTimeMillis() - time); conn.setAutoCommit(false); // This is the private in-line defined class from above. All values are already contained in the class @@ -188,10 +190,10 @@ public void remove() { try { if (updateMode) { log.debug("Updating {} XMI CAS data in database table '{}' for documents with mirror subset resets.", - mirrorResetIds.size(), xmiTableName); + processedDocumentIds.size() - unchangedDocuments.size(), xmiTableName); dbc.updateFromRowIterator(iterator, xmiTableName, false, true, schemaDocument); log.debug("Updating {} XMI CAS data in database table '{}' for documents without mirror subset resets.", - annotationModules.size()-mirrorResetIds.size(), xmiTableName); + unchangedDocuments.size(), xmiTableName); dbc.updateFromRowIterator(new RowIterator(false), xmiTableName, false, false, schemaDocument); } else { log.debug("Inserting {} XMI CAS data into database table '{}'.", From cb93c15a6c6f9576fd2d21021f3606fe5fb7cb95 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 17 Mar 2022 13:10:36 +0100 Subject: [PATCH 166/269] Add the possibility to write gene annotations to BioC. Then, we can provide GNormPlus with our own gene annotations, e.g. from flair. 
--- .../consumer/gnp/BioCDocumentPopulator.java | 50 +++++++++----- .../consumer/gnp/GNormPlusFormatWriter.java | 8 ++- .../gnp/BioCDocumentPopulatorTest.java | 67 ++++++++++++++++++- .../consumer/gnp/TestDocumentGenerator.java | 2 +- jcore-xml-db-reader/pom.xml | 9 +++ .../jcore/reader/xml/XMLDBMultiplier.java | 43 ++++++++---- .../jcore/reader/xml/XMLDBMultiplierTest.java | 6 +- 7 files changed, 150 insertions(+), 35 deletions(-) diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java index 1a2182bed..bca360265 100644 --- a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java @@ -1,10 +1,13 @@ package de.julielab.jcore.consumer.gnp; +import com.pengyifan.bioc.BioCAnnotation; import com.pengyifan.bioc.BioCDocument; +import com.pengyifan.bioc.BioCLocation; import com.pengyifan.bioc.BioCPassage; import de.julielab.jcore.types.*; import de.julielab.jcore.utility.JCoReTools; import org.apache.uima.cas.text.AnnotationIndex; +import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -14,13 +17,20 @@ */ public class BioCDocumentPopulator { private final static Logger log = LoggerFactory.getLogger(BioCDocumentPopulator.class); + private boolean addGenes; + + public BioCDocumentPopulator(boolean addGenes) { + this.addGenes = addGenes; + } public BioCDocument populate(JCas jCas) { BioCDocument doc = new BioCDocument(JCoReTools.getDocId(jCas)); AnnotationIndex zoneIndex = jCas.getAnnotationIndex(Zone.type); + int annotationId = 0; for (Zone z : zoneIndex) { if (z.getEnd() - z.getBegin() <= 0) continue; + BioCPassage p = null; if (z instanceof Title) { Title t = (Title) z; String titleType; @@ -49,43 +59,53 @@ public 
BioCDocument populate(JCas jCas) { break; } if (titleType != null) { - BioCPassage p = getPassageForAnnotation(t); + p = getPassageForAnnotation(t); p.putInfon("type", titleType); doc.addPassage(p); } } else if (z instanceof AbstractText) { AbstractText at = (AbstractText) z; - BioCPassage p = getPassageForAnnotation(at); + p = getPassageForAnnotation(at); p.putInfon("type", "abstract"); doc.addPassage(p); } else if (z instanceof Paragraph) { Paragraph pa = (Paragraph) z; - BioCPassage p = getPassageForAnnotation(pa); + p = getPassageForAnnotation(pa); p.putInfon("type", "paragraph"); doc.addPassage(p); } else if (z instanceof Caption) { Caption c = (Caption) z; - BioCPassage p = getPassageForAnnotation(c); + p = getPassageForAnnotation(c); if (c.getCaptionType() == null) throw new IllegalArgumentException("The captionType feature is null for " + c); p.putInfon("type", c.getCaptionType()); doc.addPassage(p); } + if (addGenes) { + annotationId = addGenesToPassage(jCas, z, p, annotationId); + } } return doc; } -// private BioCPassage getPassageForAbstract(AbstractText at) { -// FSArray structuredAbstractParts = at.getStructuredAbstractParts(); -// boolean foundAbstractParts = false; -// if (structuredAbstractParts != null) { -// for (int i = 0; i < structuredAbstractParts.size(); ++i) { -// AbstractSection as = (AbstractSection) structuredAbstractParts.get(i); -// -// } -// } -// return null; -// } + private int addGenesToPassage(JCas jCas, Zone z, BioCPassage p, int annotationId) { + if (p != null) { + Iterable geneIt = JCasUtil.subiterate(jCas, Gene.class, z, false, true); + for (Gene g : geneIt) { + BioCAnnotation annotation = new BioCAnnotation(String.valueOf(annotationId++)); + annotation.setText(g.getCoveredText()); + String type = "Gene"; + String specificType = g.getSpecificType().toLowerCase(); + // 'familiy' is an entity name typo in the ProGene corpus + if (specificType != null && (specificType.contains("familiy") || specificType.contains("family") || 
specificType.contains("complex"))) + type = "FamilyName"; + annotation.putInfon("type", type); + annotation.addLocation(new BioCLocation(g.getBegin(), g.getEnd() - g.getBegin())); + p.addAnnotation(annotation); + } + } + return annotationId; + } /** * Creates a BioCPassage with offset and text corresponding to the passed annotation a. diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java index 002407a0e..08f10fab4 100644 --- a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/GNormPlusFormatWriter.java @@ -15,6 +15,7 @@ import java.nio.file.Path; import java.util.Date; +import java.util.Optional; @ResourceMetaData(name = "JCoRe GNormPlus BioC Writer", description = "Writes CAS documents into the BioC XML format used by the gene tagger and normalizer GNormPlus.", vendor = "JULIE Lab Jena, Germany") @TypeCapability(inputs = {}, outputs = {}) @@ -23,6 +24,7 @@ public class GNormPlusFormatWriter extends JCasAnnotator_ImplBase { public static final String PARAM_NUM_DOCS_PER_FILE = "NumDocsPerFile"; public static final String PARAM_NUM_FILES_PER_DIR = "NumFilesPerDir"; public static final String PARAM_BASE_DIR = "BaseDirectory"; + public static final String PARAM_ADD_GENES = "AddGenes"; private final static Logger log = LoggerFactory.getLogger(GNormPlusFormatWriter.class); @ConfigurationParameter(name = PARAM_NUM_DOCS_PER_FILE, description = "The number of documents (i.e. 
CASes) that should be written into a single BioC XML file.") private int numDocsPerFile; @@ -30,6 +32,8 @@ public class GNormPlusFormatWriter extends JCasAnnotator_ImplBase { private int numDocsPerDir; @ConfigurationParameter(name = PARAM_BASE_DIR, description = "The base directory into which to create new directories that contain the actual BioC collection files.") private String baseDirectory; + @ConfigurationParameter(name=PARAM_ADD_GENES, mandatory = false, description = "false", defaultValue = "If set to true, all Gene annotations in the CAS will be added to the BioC documents.") + private boolean addGenes; private BioCDocumentPopulator bioCDocumentPopulator; private BioCCollectionWriter bioCCollectionWriter; @@ -44,8 +48,9 @@ public void initialize(final UimaContext aContext) { numDocsPerFile = (int) aContext.getConfigParameterValue(PARAM_NUM_DOCS_PER_FILE); numDocsPerDir = (int) aContext.getConfigParameterValue(PARAM_NUM_FILES_PER_DIR); baseDirectory = (String) aContext.getConfigParameterValue(PARAM_BASE_DIR); + addGenes = (boolean) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_ADD_GENES)).orElse(false); - bioCDocumentPopulator = new BioCDocumentPopulator(); + bioCDocumentPopulator = new BioCDocumentPopulator(addGenes); bioCCollectionWriter = new BioCCollectionWriter(numDocsPerDir, Path.of(baseDirectory)); currentCollection = new BioCCollection("UTF-8", "1.0", new Date().toString(), true, "JCoRe GNormPlus BioC Writer", "PubTator.key"); @@ -76,7 +81,6 @@ public void process(final JCas jCas) throws AnalysisEngineProcessException { public void collectionProcessComplete() throws AnalysisEngineProcessException { super.collectionProcessComplete(); try { -// if (currentCollection.getDocmentCount() != 0) bioCCollectionWriter.writeBioCCollection(currentCollection); } catch (Exception e) { log.error("Could not write final batch of BioCDocuments.", e); diff --git 
a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java index 55601393a..25dc4e0ff 100644 --- a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java +++ b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulatorTest.java @@ -3,6 +3,7 @@ import com.pengyifan.bioc.BioCCollection; import com.pengyifan.bioc.BioCDocument; import com.pengyifan.bioc.io.BioCCollectionWriter; +import de.julielab.jcore.types.Gene; import org.apache.uima.jcas.JCas; import org.junit.jupiter.api.Test; @@ -14,7 +15,7 @@ class BioCDocumentPopulatorTest { @Test public void populate() throws Exception { - BioCDocumentPopulator populator = new BioCDocumentPopulator(); + BioCDocumentPopulator populator = new BioCDocumentPopulator(false); JCas jCas = TestDocumentGenerator.prepareCas(1); BioCDocument biocDoc = populator.populate(jCas); ByteArrayOutputStream baos = new ByteArrayOutputStream(); @@ -38,4 +39,68 @@ public void populate() throws Exception { assertThat(resultXml).containsOnlyOnce("Tab1."); assertThat(resultXml).containsOnlyOnce("This is the table1 caption."); } + + @Test + public void populateWithGenes() throws Exception { + BioCDocumentPopulator populator = new BioCDocumentPopulator(true); + JCas jCas = TestDocumentGenerator.prepareCas(1); + new Gene(jCas, 0, 4).addToIndexes(); + new Gene(jCas, 87, 96).addToIndexes(); + BioCDocument biocDoc = populator.populate(jCas); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + BioCCollection collection = new BioCCollection("UTF-8", "1.0", (new Date()).toString(), true, "jUnit Test", "PubTator.key"); + collection.addDocument(biocDoc); + BioCCollectionWriter collectionWriter = new BioCCollectionWriter(baos); + collectionWriter.writeCollection(collection); + String resultXml = baos.toString(StandardCharsets.UTF_8); + 
assertThat(resultXml).containsOnlyOnce(""); + assertThat(resultXml).contains("Gene"); + assertThat(resultXml).containsOnlyOnce(""); + assertThat(resultXml).containsOnlyOnce("This"); + + assertThat(resultXml).contains(""); + assertThat(resultXml).contains("Gene"); + assertThat(resultXml).containsOnlyOnce(""); + assertThat(resultXml).containsOnlyOnce("certainly"); + } + + @Test + public void populateWithGeneFamilies() throws Exception { + BioCDocumentPopulator populator = new BioCDocumentPopulator(true); + JCas jCas = TestDocumentGenerator.prepareCas(1); + Gene gene = new Gene(jCas, 0, 4); + gene.setSpecificType("protein_familiy_or_group"); + gene.addToIndexes(); + BioCDocument biocDoc = populator.populate(jCas); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + BioCCollection collection = new BioCCollection("UTF-8", "1.0", (new Date()).toString(), true, "jUnit Test", "PubTator.key"); + collection.addDocument(biocDoc); + BioCCollectionWriter collectionWriter = new BioCCollectionWriter(baos); + collectionWriter.writeCollection(collection); + String resultXml = baos.toString(StandardCharsets.UTF_8); + assertThat(resultXml).containsOnlyOnce(""); + assertThat(resultXml).contains("FamilyName"); + assertThat(resultXml).containsOnlyOnce(""); + assertThat(resultXml).containsOnlyOnce("This"); + } + + @Test + public void populateWithGeneFamilies2() throws Exception { + BioCDocumentPopulator populator = new BioCDocumentPopulator(true); + JCas jCas = TestDocumentGenerator.prepareCas(1); + Gene gene = new Gene(jCas, 0, 4); + gene.setSpecificType("FamilyName"); + gene.addToIndexes(); + BioCDocument biocDoc = populator.populate(jCas); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + BioCCollection collection = new BioCCollection("UTF-8", "1.0", (new Date()).toString(), true, "jUnit Test", "PubTator.key"); + collection.addDocument(biocDoc); + BioCCollectionWriter collectionWriter = new BioCCollectionWriter(baos); + 
collectionWriter.writeCollection(collection); + String resultXml = baos.toString(StandardCharsets.UTF_8); + assertThat(resultXml).containsOnlyOnce(""); + assertThat(resultXml).contains("FamilyName"); + assertThat(resultXml).containsOnlyOnce(""); + assertThat(resultXml).containsOnlyOnce("This"); + } } \ No newline at end of file diff --git a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/TestDocumentGenerator.java b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/TestDocumentGenerator.java index 55ca81a02..17e13f984 100644 --- a/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/TestDocumentGenerator.java +++ b/jcore-gnp-bioc-writer/src/test/java/de/julielab/jcore/consumer/gnp/TestDocumentGenerator.java @@ -9,7 +9,7 @@ public class TestDocumentGenerator { public static JCas createTestJCas() throws UIMAException { - return JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types"); + return JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-semantics-biology-types"); } public static JCas prepareCas(int docId) throws UIMAException { diff --git a/jcore-xml-db-reader/pom.xml b/jcore-xml-db-reader/pom.xml index 3342d08b7..24dd2febd 100644 --- a/jcore-xml-db-reader/pom.xml +++ b/jcore-xml-db-reader/pom.xml @@ -75,6 +75,15 @@ org.assertj assertj-core + + de.julielab + jcore-descriptor-creator + + + ch.qos.logback + logback-classic + provided + https://github.com/JULIELab/jcore-base/jcore-xml-db-reader diff --git a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java index f3c3d7790..6f0eda6aa 100644 --- a/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java +++ 
b/jcore-xml-db-reader/src/main/java/de/julielab/jcore/reader/xml/XMLDBMultiplier.java @@ -46,6 +46,8 @@ public class XMLDBMultiplier extends DBMultiplier { public static final String PARAM_TABLE_DOCUMENT = "DocumentTable"; public static final String PARAM_TABLE_DOCUMENT_SCHEMA = "DocumentTableSchema"; public static final String PARAM_TO_VISIT_KEYS = "ToVisitKeys"; + public static final String PARAM_ADD_TO_VISIT_KEYS = "AddToVisitKeys"; + public static final String PARAM_ADD_UNCHANGED_DOCUMENT_TEXT_FLAG = "AddUnchangedDocumentTextFlag"; private final static Logger log = LoggerFactory.getLogger(XMLDBMultiplier.class); /** @@ -64,8 +66,12 @@ public class XMLDBMultiplier extends DBMultiplier { private String xmiStorageDataTable; @ConfigurationParameter(name = PARAM_TABLE_DOCUMENT_SCHEMA, mandatory = false, description = "For use with AnnotationDefinedFlowController. The name of the schema that the document table - given with the " + PARAM_TABLE_DOCUMENT + " parameter - adheres to. Only the primary key part is required for hash value retrieval.") private String xmiStorageDataTableSchema; - @ConfigurationParameter(name = PARAM_TO_VISIT_KEYS, mandatory = false, description = "For use with AnnotationDefinedFlowController. The delegate AE keys of the AEs this CAS should still applied on although the hash has not changed. Can be null or empty indicating that no component should be applied to the CAS. This is, however, the task of the AnnotationDefinedFlowController.") + @ConfigurationParameter(name = PARAM_TO_VISIT_KEYS, mandatory = false, description = "For use with AnnotationDefinedFlowController. Specifies the delegate AE keys of the AEs this CAS should still applied on although the hash has not changed. Can be null or empty indicating that no component should be applied to the CAS. 
The task of the AnnotationDefinedFlowController is then to read those annotations and route the CAS accordingly.") private String[] toVisitKeys; + @ConfigurationParameter(name = PARAM_ADD_TO_VISIT_KEYS, mandatory = false, description = "Toggles the creation of annotations for the AnnotationDefinedFlowController. Only needed when such a flow controller is used in the pipeline. For details, see the description of " + PARAM_TO_VISIT_KEYS + ".") + private boolean addToVisitKeys; + @ConfigurationParameter(name = PARAM_ADD_UNCHANGED_DOCUMENT_TEXT_FLAG, mandatory = false, description = "Toggles the addition of the 'document text is unchanged' flag. The value of this flag is determined via a SHA256 hash of the CAS document text. When " + PARAM_TABLE_DOCUMENT + " and " + PARAM_TABLE_DOCUMENT_SCHEMA + " are specified, the hash value of the document in storage is retrieved and compared to the current value. The flag is then set with respect to the comparison result.") + private boolean addUnchangedDocumentTextFlag; private Row2CasMapper row2CasMapper; @@ -83,16 +89,20 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept xmiStorageDataTableSchema = (String) aContext.getConfigParameterValue(PARAM_TABLE_DOCUMENT_SCHEMA); documentItemToHash = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_ADD_SHA_HASH)).orElse("document_text"); toVisitKeys = (String[]) aContext.getConfigParameterValue(PARAM_TO_VISIT_KEYS); + addToVisitKeys = (boolean) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_ADD_TO_VISIT_KEYS)).orElse(false); + addUnchangedDocumentTextFlag = (boolean) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_ADD_UNCHANGED_DOCUMENT_TEXT_FLAG)).orElse(false); // We don't know yet which tables to read. Thus, we leave the row mapping out. // We will now once the DBMultiplier#process(JCas) will have been run. 
Initializer initializer = new Initializer(mappingFileStr, null, null); xmlMapper = initializer.getXmlMapper(); initialized = false; - if (!(xmiStorageDataTable == null && xmiStorageDataTableSchema == null) && !(xmiStorageDataTable != null && xmiStorageDataTableSchema != null && documentItemToHash != null)) { - String errorMsg = String.format("From the parameters '%s' and '%s' some are specified and some aren't. To activate hash value comparison in order to add aggregate component keys for CAS visit, specify all those parameters. Otherwise, specify none.", PARAM_TABLE_DOCUMENT, PARAM_TABLE_DOCUMENT_SCHEMA); - log.error(errorMsg); - throw new ResourceInitializationException(new IllegalArgumentException(errorMsg)); + if ((addToVisitKeys || addUnchangedDocumentTextFlag)) { + if (!(xmiStorageDataTable == null && xmiStorageDataTableSchema == null) && !(xmiStorageDataTable != null && xmiStorageDataTableSchema != null && documentItemToHash != null)) { + String errorMsg = String.format("From the parameters '%s' and '%s' some are specified and some aren't. To activate hash value comparison in order to add aggregate component keys for CAS visit, specify all those parameters. Otherwise, specify none.", PARAM_TABLE_DOCUMENT, PARAM_TABLE_DOCUMENT_SCHEMA); + log.error(errorMsg); + throw new ResourceInitializationException(new IllegalArgumentException(errorMsg)); + } } } @@ -115,6 +125,8 @@ public AbstractCas next() throws AnalysisEngineProcessException { } // The DBC is initialized in the super class in the process() method. Thus, at this point // the DBC should be set. 
+ if (xmiStorageDataTable != null && !dbc.withConnectionQueryBoolean(d -> d.tableExists(xmiStorageDataTable))) + throw new AnalysisEngineProcessException(new IllegalArgumentException("The data table" + xmiStorageDataTable + " to retrieve hash values from for document text change detection does not exist in the database: " + dbc.getDbURL())); casPopulator = new CasPopulator(dbc, xmlMapper, row2CasMapper, rowMappingArray); initialized = true; } @@ -138,7 +150,7 @@ public AbstractCas next() throws AnalysisEngineProcessException { * @param jCas The newly read JCas. */ private void setToVisitAnnotation(JCas jCas) { - if (xmiStorageDataTable != null && dbc.tableExists(xmiStorageDataTable)) { + if (addToVisitKeys || addUnchangedDocumentTextFlag) { DBProcessingMetaData dbProcessingMetaData = JCasUtil.selectSingle(jCas, DBProcessingMetaData.class); StringArray pkArray = dbProcessingMetaData.getPrimaryKey(); String pkString = String.join(",", pkArray.toArray()); @@ -148,14 +160,17 @@ private void setToVisitAnnotation(JCas jCas) { if (existingHash.equals(newHash)) { if (log.isTraceEnabled()) log.trace("Document {} has a document text hash that equals the one present in the database. 
Creating a ToVisit annotation routing it only to the components with delegate keys {}.", pkString, toVisitKeys); - dbProcessingMetaData.setIsDocumentHashUnchanged(true); - ToVisit toVisit = new ToVisit(jCas); - if (toVisitKeys != null && toVisitKeys.length != 0) { - StringArray keysArray = new StringArray(jCas, toVisitKeys.length); - keysArray.copyFromArray(toVisitKeys, 0, 0, toVisitKeys.length); - toVisit.setDelegateKeys(keysArray); + if (addUnchangedDocumentTextFlag) + dbProcessingMetaData.setIsDocumentHashUnchanged(true); + if (addToVisitKeys) { + ToVisit toVisit = new ToVisit(jCas); + if (toVisitKeys != null && toVisitKeys.length != 0) { + StringArray keysArray = new StringArray(jCas, toVisitKeys.length); + keysArray.copyFromArray(toVisitKeys, 0, 0, toVisitKeys.length); + toVisit.setDelegateKeys(keysArray); + } + toVisit.addToIndexes(); } - toVisit.addToIndexes(); } } else { log.trace("No existing hash was found for document {}", pkString); @@ -191,7 +206,7 @@ protected List> getAllRetrievedColumns() { * @throws AnalysisEngineProcessException If the SQL request fails. */ private Map fetchCurrentHashesFromDatabase(RowBatch rowBatch) throws AnalysisEngineProcessException { - if (xmiStorageDataTable != null && dbc.tableExists(xmiStorageDataTable) && rowBatch.getIdentifiers() != null && rowBatch.getIdentifiers().size() > 0) { + if ((addToVisitKeys || addUnchangedDocumentTextFlag) && rowBatch.getIdentifiers() != null && rowBatch.getIdentifiers().size() > 0) { String hashColumn = documentItemToHash + "_sha256"; // Extract the document IDs in this RowBatch. The IDs could be composite keys. 
List documentIds = new ArrayList<>(rowBatch.getIdentifiers().size()); diff --git a/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java b/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java index 86009735d..ae154a30f 100644 --- a/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java +++ b/jcore-xml-db-reader/src/test/java/de/julielab/jcore/reader/xml/XMLDBMultiplierTest.java @@ -180,7 +180,8 @@ public void testHashComparison() throws Exception { XMLDBMultiplier.PARAM_ADD_SHA_HASH, "documentText", XMLDBMultiplier.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE, XMLDBMultiplier.PARAM_TABLE_DOCUMENT_SCHEMA, "xmi_text", - XMLDBMultiplier.PARAM_TO_VISIT_KEYS, "ThisIsTheVisitKey" + XMLDBMultiplier.PARAM_TO_VISIT_KEYS, "ThisIsTheVisitKey", + XMLDBMultiplier.PARAM_ADD_TO_VISIT_KEYS, true ); JCasIterator jCasIterator = engine.processAndOutputNewCASes(jCas); List toVisitKeys = new ArrayList<>(); @@ -204,7 +205,8 @@ public void testHashComparison2() throws Exception { XMLDBMultiplier.PARAM_MAPPING_FILE, Path.of("src", "test", "resources", "test-mappingfile.xml").toString(), XMLDBMultiplier.PARAM_ADD_SHA_HASH, "documentText", XMLDBMultiplier.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE, - XMLDBMultiplier.PARAM_TABLE_DOCUMENT_SCHEMA, "xmi_text" + XMLDBMultiplier.PARAM_TABLE_DOCUMENT_SCHEMA, "xmi_text", + XMLDBMultiplier.PARAM_ADD_TO_VISIT_KEYS, true ); JCasIterator jCasIterator = engine.processAndOutputNewCASes(jCas); List emptyToVisitAnnotation = new ArrayList<>(); From be429c4fe02a0fd456e669b8b58c72dbf3036c19 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 17 Mar 2022 13:12:46 +0100 Subject: [PATCH 167/269] Add gene addition parameter to descriptor for GNP BioC writer. 
--- .../consumer/gnp/desc/jcore-gnp-bioc-writer.xml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/jcore-gnp-bioc-writer.xml b/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/jcore-gnp-bioc-writer.xml index 524f590ea..82e53378c 100644 --- a/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/jcore-gnp-bioc-writer.xml +++ b/jcore-gnp-bioc-writer/src/main/resources/de/julielab/jcore/consumer/gnp/desc/jcore-gnp-bioc-writer.xml @@ -29,12 +29,27 @@ false true + + AddGenes + false + Boolean + false + false + - + + + AddGenes + + false + + + + From 8524fbacb770663cf19a7262749157bf430899ed Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 17 Mar 2022 13:16:31 +0100 Subject: [PATCH 168/269] Resolves #134. --- .../de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java index bca360265..78ec078cc 100644 --- a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java @@ -95,10 +95,12 @@ private int addGenesToPassage(JCas jCas, Zone z, BioCPassage p, int annotationId BioCAnnotation annotation = new BioCAnnotation(String.valueOf(annotationId++)); annotation.setText(g.getCoveredText()); String type = "Gene"; - String specificType = g.getSpecificType().toLowerCase(); + String specificType = g.getSpecificType() != null ? 
g.getSpecificType().toLowerCase() : null; // 'familiy' is an entity name typo in the ProGene corpus if (specificType != null && (specificType.contains("familiy") || specificType.contains("family") || specificType.contains("complex"))) type = "FamilyName"; + else if (specificType != null && specificType.contains("domain")) + type = "DomainMotif"; annotation.putInfon("type", type); annotation.addLocation(new BioCLocation(g.getBegin(), g.getEnd() - g.getBegin())); p.addAnnotation(annotation); From 9b5f84cec38280e8a2b7ab2e7d0905f11ee0b052 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 23 Mar 2022 18:18:55 +0100 Subject: [PATCH 169/269] Reduce database connection usage. --- .../java/de/julielab/jcore/reader/db/DBMultiplier.java | 2 +- .../de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java | 8 +++++--- .../java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java index c83fcaebb..b52c111c5 100644 --- a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java +++ b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java @@ -58,7 +58,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept initialized = false; } - private DataBaseConnector getDataBaseConnector(String costosysConfig) throws AnalysisEngineProcessException { + protected DataBaseConnector getDataBaseConnector(String costosysConfig) throws AnalysisEngineProcessException { DataBaseConnector dbc; try { dbc = new DataBaseConnector(costosysConfig); diff --git a/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java b/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java index 447e95929..eb0975888 100644 --- 
a/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java +++ b/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java @@ -84,8 +84,8 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { - super.process(aJCas); docId2HashMap = fetchCurrentHashesFromDatabase(JCasUtil.selectSingle(aJCas, RowBatch.class)); + super.process(aJCas); } @Override @@ -143,6 +143,8 @@ private void populateCas(JCas jCas, byte[][] documentData, String pkString) thro * @throws AnalysisEngineProcessException If the SQL request fails. */ private Map fetchCurrentHashesFromDatabase(RowBatch rowBatch) throws AnalysisEngineProcessException { + if (dbc == null) + dbc = getDataBaseConnector(rowBatch.getCostosysConfiguration()); if (xmiStorageDataTable != null && dbc.tableExists(xmiStorageDataTable) && rowBatch.getIdentifiers() != null && rowBatch.getIdentifiers().size() > 0) { String hashColumn = documentItemToHash + "_sha256"; // Extract the document IDs in this RowBatch. The IDs could be composite keys. @@ -188,11 +190,11 @@ private Map fetchCurrentHashesFromDatabase(RowBatch rowBatch) th * database, if present. If there was a hash in the database and the hash values are equal, creates the ToVisit * annotation and adds the toVisitKeys passed in the configuration of this component.

* - * @param jCas The newly read JCas. + * @param jCas The newly read JCas. * @param pkString */ private void setToVisitAnnotation(JCas jCas, String pkString) { - if (xmiStorageDataTable != null && dbc.tableExists(xmiStorageDataTable)) { + if (xmiStorageDataTable != null && xmiStorageDataTable != null) { String existingHash = docId2HashMap.get(pkString); if (existingHash != null) { String newHash = getHash(jCas); diff --git a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java index ef6d7735c..1d13802dd 100644 --- a/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java +++ b/jcore-xmi-db-writer/src/main/java/de/julielab/jcore/consumer/xmi/XMIDBWriter.java @@ -541,7 +541,7 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException { try (CoStoSysConnection costoConn = dbc.obtainOrReserveConnection()) { Map mirrorSubsetNames = dbc.getMirrorSubsetNames(costoConn, effectiveDocTableName); if (mirrorSubsetNames.keySet().contains(subsetTable.replace("^[^.]\\.", ""))) - throw new AnalysisEngineProcessException(new IllegalArgumentException("The read subset table " + subsetTable + " is a mirror subset its document table " + effectiveDocTableName + " and the base document should be stored. This base document storage would cause all its subset to reset the updated documents. Thus, the subset " + subsetTable + " would be partially reset while processing, reading the same documents over and over again. This is therefore illegal.")); + throw new AnalysisEngineProcessException(new IllegalArgumentException("The read subset table " + subsetTable + " is a mirror subset of the target document table " + effectiveDocTableName + " and the base document should be stored. This base document storage would cause all its subset to reset the updated documents. 
Thus, the subset " + subsetTable + " would be partially reset while processing, reading the same documents over and over again. This is therefore illegal.")); } } } From 2a8b27773b114077465eeb08efa4042cc2790acb Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 23 Mar 2022 18:19:41 +0100 Subject: [PATCH 170/269] Lower "regex subsentence has invalid offsets" message to debug level. --- .../de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java b/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java index d89ca98b7..0cd7354c5 100644 --- a/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java +++ b/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java @@ -366,7 +366,7 @@ private void splitAtWhitespaces(JCoReCondensedDocumentText documentText, Sentenc lastEnd = s.getEnd(); currentSentenceLength = 0; } else { - LOGGER.warn("Not creating whitespace-segmented sub-sentence because its offsets would be invalid: {}-{}", subBegin, subEnd); + LOGGER.debug("Not creating whitespace-segmented sub-sentence because its offsets would be invalid: {}-{}", subBegin, subEnd); } } currentSentenceLength += wsMatcher.end(); @@ -379,7 +379,7 @@ private void splitAtWhitespaces(JCoReCondensedDocumentText documentText, Sentenc s.setComponentId(this.getClass().getName()); subSentences.add(s); } else { - LOGGER.warn("Not creating whitespace-segmented sub-sentence because its offsets would be invalid: {}-{}", subBegin, subEnd); + LOGGER.debug("Not creating whitespace-segmented sub-sentence because its offsets would be invalid: {}-{}", subBegin, subEnd); } } From ad533d9f11aa6ba3e680bb075aa531107f4a75e2 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 25 Mar 2022 11:55:59 +0100 Subject: [PATCH 171/269] Fix a bug where a file name was expected but the XML source was 
a stream. Some PMC documents do not contain their own ID. One measure to solve this was to use the file name or URI of the source file. However, when reading from the database, there is no such file. Instead, the file name is given by the primary key. --- .../jcore/multiplier/pmc/PMCDBMultiplier.java | 3 +- .../jcore/multiplier/pmc/ErrorTest.java | 39 +++++++++++++++++++ .../src/test/resources/costosys-errortest.xml | 24 ++++++++++++ .../jcore/reader/pmc/parser/FrontParser.java | 10 ++++- 4 files changed, 73 insertions(+), 3 deletions(-) create mode 100644 jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/ErrorTest.java create mode 100644 jcore-pmc-db-reader/src/test/resources/costosys-errortest.xml diff --git a/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java b/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java index eb0975888..c669e4f92 100644 --- a/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java +++ b/jcore-pmc-db-reader/src/main/java/de/julielab/jcore/multiplier/pmc/PMCDBMultiplier.java @@ -13,6 +13,7 @@ import de.julielab.jcore.types.pubmed.Header; import org.apache.commons.codec.binary.Base64; import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.AbstractCas; @@ -129,7 +130,7 @@ private void populateCas(JCas jCas, byte[][] documentData, String pkString) thro // It actually happens that some PMC XML documents do not contain their own ID. We can use the ID obtained // via the database primary key, which in turn might be derived from the original file name or some meta file. Header header = JCasUtil.selectSingle(jCas, Header.class); - if (header.getDocId().isBlank()) { + if (StringUtils.isBlank(header.getDocId())) { log.debug("Document has no docId set. 
Derived the ID {} from the primary key and setting it as the Header#docId feature.", pkString); header.setDocId(pkString); } diff --git a/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/ErrorTest.java b/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/ErrorTest.java new file mode 100644 index 000000000..674d61685 --- /dev/null +++ b/jcore-pmc-db-reader/src/test/java/de/julielab/jcore/multiplier/pmc/ErrorTest.java @@ -0,0 +1,39 @@ +package de.julielab.jcore.multiplier.pmc; + +import de.julielab.jcore.reader.db.DBMultiplierReader; +import de.julielab.jcore.utility.JCoReTools; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.JCasIterator; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.factory.AnalysisEngineFactory; +import org.apache.uima.fit.factory.CollectionReaderFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import java.nio.file.Path; + +/** + * This is not as much a test as it is a facility to check error cases in isolation. 
The existing code + * reads from an XML database table and parses the PMC document from there + */ +@Disabled +public class ErrorTest { + + @Test + public void errorTest() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types"); + CollectionReader reader = CollectionReaderFactory.createReader(DBMultiplierReader.class, DBMultiplierReader.PARAM_COSTOSYS_CONFIG_NAME, Path.of("src", "test", "resources", "costosys-errortest.xml").toString(), DBMultiplierReader.PARAM_TABLE, "_data.errordoc", DBMultiplierReader.PARAM_RESET_TABLE, true); + AnalysisEngine engine = AnalysisEngineFactory.createEngine(PMCDBMultiplier.class, PMCDBMultiplier.PARAM_OMIT_BIB_REFERENCES, true); + while (reader.hasNext()) { + reader.getNext(jCas.getCas()); + JCasIterator jCasIterator = engine.processAndOutputNewCASes(jCas); + while (jCasIterator.hasNext()) { + JCas next = jCasIterator.next(); + System.out.println(JCoReTools.getDocId(next)); + next.release(); + } + } + } +} diff --git a/jcore-pmc-db-reader/src/test/resources/costosys-errortest.xml b/jcore-pmc-db-reader/src/test/resources/costosys-errortest.xml new file mode 100644 index 000000000..e9788a2fa --- /dev/null +++ b/jcore-pmc-db-reader/src/test/resources/costosys-errortest.xml @@ -0,0 +1,24 @@ + + + + public + pmc_bulk_gzip + + + + + + + + + + + + + pmc_xml + 5 + + + + + \ No newline at end of file diff --git a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java index 560f9877d..19a848902 100644 --- a/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java +++ b/jcore-pmc-reader/src/main/java/de/julielab/jcore/reader/pmc/parser/FrontParser.java @@ -111,8 +111,14 @@ else if 
(xPathExists(String.format(pubDateFmt, "pmc-release"))) header.setComponentId(PMCReader.class.getName()); pmcid.ifPresentOrElse(id -> header.setDocId(id.startsWith("PMC") ? id : "PMC" + id), () -> { - String filenameId = nxmlDocumentParser.getCurrentSource().toString().substring(nxmlDocumentParser.getCurrentSource().toString().lastIndexOf(File.separatorChar)+1, nxmlDocumentParser.getCurrentSource().toString().lastIndexOf('.')); - header.setDocId(filenameId.startsWith("PMC") ? filenameId : "PMC" + filenameId); + // try to extract the PMCID from the file name + // For now, let the dot indicate that this is, indeed, a file name; the source also be an InputStream, + // then we don't have access to the file name + int dotIndex = nxmlDocumentParser.getCurrentSource().toString().lastIndexOf('.'); + if (dotIndex > 0) { + String filenameId = nxmlDocumentParser.getCurrentSource().toString().substring(nxmlDocumentParser.getCurrentSource().toString().lastIndexOf(File.separatorChar) + 1, dotIndex); + header.setDocId(filenameId.startsWith("PMC") ? filenameId : "PMC" + filenameId); + } }); pmid.ifPresent(p -> { OtherID otherID = new OtherID(nxmlDocumentParser.cas); From 8eaee4b749b23876de5ccedaaf9c09020eb373a8 Mon Sep 17 00:00:00 2001 From: khituras Date: Fri, 25 Mar 2022 13:10:38 +0100 Subject: [PATCH 172/269] Fix an error due to api change in XML tools. 
--- .../java/de/julielab/jcore/multiplier/xml/XMLMultiplier.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-xml-reader/src/main/java/de/julielab/jcore/multiplier/xml/XMLMultiplier.java b/jcore-xml-reader/src/main/java/de/julielab/jcore/multiplier/xml/XMLMultiplier.java index aafcb1e8a..4b6e4f8d1 100644 --- a/jcore-xml-reader/src/main/java/de/julielab/jcore/multiplier/xml/XMLMultiplier.java +++ b/jcore-xml-reader/src/main/java/de/julielab/jcore/multiplier/xml/XMLMultiplier.java @@ -189,7 +189,7 @@ public void process(JCas cas) throws AnalysisEngineProcessException { try { rowIterator = JulieXMLTools.constructRowIterator( JulieXMLTools.readStream(UriUtilities.getInputStreamFromUri(new java.net.URI(currentUri)), 1024), - 1024, forEach, fields, currentUri); + 1024, forEach, fields, currentUri, true); } catch (IOException | URISyntaxException e) { throw new AnalysisEngineProcessException(e); } From dcbbce60b9be72225eb500aa228f7b84f616db3f Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 30 Mar 2022 09:41:30 +0200 Subject: [PATCH 173/269] Add FamilyName gene annotations to the CAS. 
--- .../jcore/reader/BioCCasPopulator.java | 22 ++ .../jcore/reader/BioCCasPopulatorTest.java | 16 ++ .../test/resources/bioc_collection_0_0.xml | 261 ++++++++++++++++++ 3 files changed, 299 insertions(+) create mode 100644 jcore-gnp-bioc-reader/src/test/resources/bioc_collection_0_0.xml diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java index 75f58fa02..4618ef255 100644 --- a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java @@ -90,6 +90,9 @@ public void populateWithNextDocument(JCas jCas) { case "Gene": addGeneAnnotation(annotation, jCas); break; + case "FamilyName": + addFamilyAnnotation(annotation, jCas); + break; case "Species": addSpeciesAnnotation(annotation, jCas); break; @@ -100,6 +103,7 @@ public void populateWithNextDocument(JCas jCas) { } } + private void setMaxXmiId(JCas jCas, BioCDocument document) { if (maxXmiIdMap != null) { Integer maxXmiId = maxXmiIdMap.get(document.getID()); @@ -215,6 +219,7 @@ private void addGeneAnnotation(BioCAnnotation annotation, JCas jCas) throws Miss // for GNormPlus, there are no discontinuing annotations anyway BioCLocation location = annotation.getTotalLocation(); Gene gene = new Gene(jCas, location.getOffset(), location.getOffset() + location.getLength()); + gene.setSpecificType("Gene"); ResourceEntry resourceEntry = new ResourceEntry(jCas, gene.getBegin(), gene.getEnd()); resourceEntry.setSource("NCBI Gene"); resourceEntry.setComponentId(GNormPlusFormatMultiplierReader.class.getCanonicalName()); @@ -225,6 +230,23 @@ private void addGeneAnnotation(BioCAnnotation annotation, JCas jCas) throws Miss gene.addToIndexes(); } + private void addFamilyAnnotation(BioCAnnotation annotation, JCas jCas) { + // the "total location" is the span from the minimum location value to the 
maximum location value; + // for GNormPlus, there are no discontinuing annotations anyway + BioCLocation location = annotation.getTotalLocation(); + Gene gene = new Gene(jCas, location.getOffset(), location.getOffset() + location.getLength()); + gene.setSpecificType("FamilyName"); + // e.g. NCBITaxonomyID:9606 + Optional focusSpecies = annotation.getInfon("FocusSpecies"); + if (!focusSpecies.isPresent()) + throw new IllegalStateException("A FamilyName annotation does not specify its species: " + annotation); + String taxId = focusSpecies.get().substring(15); + StringArray speciesArray = new StringArray(jCas, 1); + speciesArray.set(0, taxId); + gene.setSpecies(speciesArray); + gene.addToIndexes(); + } + public int documentsLeftInCollection() { return bioCCollection.getDocmentCount() - pos; } diff --git a/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java index 3b7e0dba5..b93ad6c46 100644 --- a/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java +++ b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java @@ -59,4 +59,20 @@ public void populateWithNextDocument() throws Exception { } assertThat(organisms).extracting(Organism::getCoveredText).contains("human", "patients", "rat", "retrovirus", "ZR-75-1"); } + + @Test + public void addFamilyNames() throws Exception { + BioCCasPopulator bioCCasPopulator = new BioCCasPopulator(Path.of("src", "test", "resources","bioc_collection_0_0.xml"), null, null); + JCas jCas = getJCas(); + bioCCasPopulator.populateWithNextDocument(jCas); + + Collection genes = JCasUtil.select(jCas, Gene.class); + assertThat(genes).hasSize(23); + assertThat(genes).filteredOn(Gene::getSpecificType, "FamilyName").hasSize(5); + for (Gene o : genes) { + if (o.getSpecificType().equals("FamilyName")) { + assertThat(o.getSpecies(0)).isEqualTo("9606"); + } + } + } } \ No 
newline at end of file diff --git a/jcore-gnp-bioc-reader/src/test/resources/bioc_collection_0_0.xml b/jcore-gnp-bioc-reader/src/test/resources/bioc_collection_0_0.xml new file mode 100644 index 000000000..46dc0e704 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/resources/bioc_collection_0_0.xml @@ -0,0 +1,261 @@ + + + + JCoRe GNormPlus BioC Writer + Wed Mar 02 14:58:28 CET 2022 + PubTator.key + + 10885490 + + title + 0 + Decreased plasma cholesterol esterification and cholesteryl ester transfer in hypopituitary patients on glucocorticoid replacement therapy. + + 9606 + Species + + patients + + + + abstract + 140 + Cardiovascular risk is increased in hypopituitary patients. No data are available with respect to the effect of glucocorticoid replacement therapy on high density lipoproteins (HDL) metabolism in such patients. Plasma lecithin:cholesterol acyl transferase (LCAT), cholesteryl ester transfer protein (CETP) and phospholipid transfer protein (PLTP) are important determinants of HDL remodelling. The possible influence of conventional glucocorticoid replacement on plasma lipids, plasma LCAT, CETP and PLTP activity levels, as well as on plasma cholesterol esterification (EST) and cholesteryl ester transfer (CET) was evaluated in 24 consecutive hypopituitary patients (12 men and 12 women) with untreated growth hormone deficiency of whom 17 had adrenal insufficiency and were treated with cortisone acetate, 25 to 37.5 mg daily. Twenty-three patients were on stable levothyroxin therapy and 22 patients used sex steroids. Urinary excretion of cortisol and cortisone metabolites was higher (p<0.001) in glucocorticoid-treated patients. Body mass index (p<0.08) and fat mass (p<0.12) were not significantly different in patients receiving and not receiving glucocorticoids. Fasting blood glucose, plasma insulin and insulin resistance were similar in the groups. 
Plasma total (p<0.05) and very low+low density lipoprotein cholesterol (p<0.01) were lower in patients receiving glucocorticoids, whereas HDL cholesterol and plasma triglycerides were not different between patients treated and not treated with glucocorticoids. Plasma LCAT activity was 45% lower (p<0.02) and CETP activity was 34% lower (p<0.05) in patients on glucocorticoid treatment. Multiple regression analysis showed that these effects were independent of gender and fat mass. In glucocorticoid-receiving patients, plasma EST and CET were decreased by 80% (p<0.01) and by 58% (p<0.05), respectively. These changes were at least partly attributable to lower LCAT and CETP activity levels. In contrast, plasma PLTP activity was not different between patients with and without glucocorticoid treatment, suggesting that exogenous glucocorticoids exert a different regulatory effect on plasma CETP compared to PLTP. In conclusion, this preliminary study suggests that conventional glucocorticoid replacement in hypopituitary patients is associated with a decrease in plasma cholesterol esterification and cholesteryl ester transfer, indicating that these steps in HDL metabolism are impaired. Such abnormalities in HDL metabolism could be involved in increased cardiovascular risk in glucocorticoid-treated hypopituitary patients, despite a lack of deterioration in plasma lipids. 
+ + 3931 + Gene + + lecithin:cholesterol acyl transferase + + + 3931 + Gene + + LCAT + + + 1071 + Gene + + cholesteryl ester transfer protein + + + 1071 + Gene + + CETP + + + 5360 + Gene + + phospholipid transfer protein + + + 5360 + Gene + + PLTP + + + 3931 + Gene + + LCAT + + + 1071 + Gene + + CETP + + + 5360 + Gene + + PLTP + + + 3630 + Gene + + insulin + + + 3630 + Gene + + insulin + + + NCBITaxonomyID:9606 + FamilyName + + HDL + + + 3931 + Gene + + LCAT + + + 1071 + Gene + + CETP + + + 3931 + Gene + + LCAT + + + 1071 + Gene + + CETP + + + 5360 + Gene + + PLTP + + + 1071 + Gene + + CETP + + + 5360 + Gene + + PLTP + + + NCBITaxonomyID:9606 + FamilyName + + HDL + + + NCBITaxonomyID:9606 + FamilyName + + HDL + + + NCBITaxonomyID:9606 + FamilyName + + HDL + + + NCBITaxonomyID:9606 + FamilyName + + HDL + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + men + + + 9606 + Species + + women + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + 9606 + Species + + patients + + + + \ No newline at end of file From 91bd495440b27f7f5d08d325af4a7a1764620d56 Mon Sep 17 00:00:00 2001 From: khituras Date: Mon, 4 Apr 2022 12:04:50 +0200 Subject: [PATCH 174/269] Set null title types to title type 'other'. That removes an exception thrown for some title types we do not really care about (e.g. titles of publications in a literature list added to the text body of a PMC document, outside of references). 
--- .../de/julielab/jcore/reader/BioCCasPopulator.java | 12 ++++++++++-- .../jcore/consumer/gnp/BioCDocumentPopulator.java | 9 +++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java index 4618ef255..4c8b3f908 100644 --- a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java @@ -97,7 +97,7 @@ public void populateWithNextDocument(JCas jCas) { addSpeciesAnnotation(annotation, jCas); break; } - } catch (MissingInfonException e) { + } catch (MissingInfonException | IllegalArgumentException e) { throw new IllegalArgumentException("BioCDocument " + document.getID() + " has an annotation issue; see cause exception.", e); } } @@ -233,7 +233,15 @@ private void addGeneAnnotation(BioCAnnotation annotation, JCas jCas) throws Miss private void addFamilyAnnotation(BioCAnnotation annotation, JCas jCas) { // the "total location" is the span from the minimum location value to the maximum location value; // for GNormPlus, there are no discontinuing annotations anyway - BioCLocation location = annotation.getTotalLocation(); + BioCLocation location; + try { + location = annotation.getTotalLocation(); + } catch (Exception e) { + // This handles a legacy issue: We modified GNormPlus to output FamilyName annotations. For some reason, + // FamilyNames can have zero length. This has been fixed but there is still old output that would + // cause an error at this point. Thus, when the offsets are invalid, skip the annotation. + return; + } Gene gene = new Gene(jCas, location.getOffset(), location.getOffset() + location.getLength()); gene.setSpecificType("FamilyName"); // e.g. 
NCBITaxonomyID:9606 diff --git a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java index 78ec078cc..c642193cf 100644 --- a/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java +++ b/jcore-gnp-bioc-writer/src/main/java/de/julielab/jcore/consumer/gnp/BioCDocumentPopulator.java @@ -34,9 +34,10 @@ public BioCDocument populate(JCas jCas) { if (z instanceof Title) { Title t = (Title) z; String titleType; - if (t.getTitleType() == null) - throw new IllegalArgumentException("The titleType feature was not set for " + t); - switch (t.getTitleType()) { + String titleTypeString = t.getTitleType(); + if (titleTypeString == null) + titleTypeString = "other"; + switch (titleTypeString) { case "document": titleType = "title"; break; @@ -54,7 +55,7 @@ public BioCDocument populate(JCas jCas) { titleType = "null"; break; default: - log.debug("Unhandled title type {}", t.getTitleType()); + log.debug("Unhandled title type {}", titleTypeString); titleType = "other_title"; break; } From a87ba45c8de0dea8693c12c97da42a6e124e63f9 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 6 Apr 2022 08:09:59 +0200 Subject: [PATCH 175/269] Add '9999' confidence values to gene annotations in the GNP reader. 9999 means "exact" match. GNP checks a dictionary to find IDs and transfers those to other matches under some circumstances. 
--- .../main/java/de/julielab/jcore/reader/BioCCasPopulator.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java index 4c8b3f908..972d0e7dd 100644 --- a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java @@ -220,6 +220,9 @@ private void addGeneAnnotation(BioCAnnotation annotation, JCas jCas) throws Miss BioCLocation location = annotation.getTotalLocation(); Gene gene = new Gene(jCas, location.getOffset(), location.getOffset() + location.getLength()); gene.setSpecificType("Gene"); + // 9999 ist the GeNo score for exact matches; GNP only recognized exact dictionary matches and transfers + // their IDs to other forms under certain circumstances (abbreviations, for example) + gene.setConfidence("9999"); ResourceEntry resourceEntry = new ResourceEntry(jCas, gene.getBegin(), gene.getEnd()); resourceEntry.setSource("NCBI Gene"); resourceEntry.setComponentId(GNormPlusFormatMultiplierReader.class.getCanonicalName()); @@ -244,6 +247,7 @@ private void addFamilyAnnotation(BioCAnnotation annotation, JCas jCas) { } Gene gene = new Gene(jCas, location.getOffset(), location.getOffset() + location.getLength()); gene.setSpecificType("FamilyName"); + gene.setConfidence("9999"); // e.g. NCBITaxonomyID:9606 Optional focusSpecies = annotation.getInfon("FocusSpecies"); if (!focusSpecies.isPresent()) From 2917a9220c69eb1e4577b50180a7b04c99bab4b6 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 6 Apr 2022 08:10:31 +0200 Subject: [PATCH 176/269] Add error messages when the passed data resources is null. 
--- .../main/java/de/julielab/jcore/utility/JCoReTools.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReTools.java b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReTools.java index 038321c70..606502b76 100644 --- a/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReTools.java +++ b/jcore-utilities/src/main/java/de/julielab/jcore/utility/JCoReTools.java @@ -259,7 +259,8 @@ public static FSArray copyFSArray(FSArray array) { /** * Creates a new StringArray from the given string elements. - * @param jCas The jCas to associate the new StringArray with. + * + * @param jCas The jCas to associate the new StringArray with. * @param elements The strings to put into the StringArray. * @return The new, filled StringArray. */ @@ -456,7 +457,11 @@ else if (comparison < 0) { * @throws IOException If reading the resource file fails. */ public static InputStream resolveExternalResourceGzipInputStream(DataResource resource) throws IOException { + if (resource == null) + throw new IllegalArgumentException("The passed DataResource is null."); InputStream is = resource.getInputStream(); + if (is == null) + throw new IllegalArgumentException("The resource at " + resource.getUrl() + " could not be read. It does not exist or is not on the ClassPath."); String lcUriString = resource.getUri().toString().toLowerCase(); if (lcUriString.endsWith(".gz") || lcUriString.endsWith(".gzip")) is = new GZIPInputStream(is); From aea26c82f5c1803f9ead2abb45dff3be38810fc8 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 12 Apr 2022 15:37:42 +0200 Subject: [PATCH 177/269] Handle multiple gene IDs assigned by GNormPlus. For ranges or enumerations like B7-1/2, for example. Now, for each such IDs a new ResourceEntry is created. Thus, one Gene can now have multiple ResourceEntries. 
--- .../jcore/reader/BioCCasPopulator.java | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java index 972d0e7dd..553224c3a 100644 --- a/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java +++ b/jcore-gnp-bioc-reader/src/main/java/de/julielab/jcore/reader/BioCCasPopulator.java @@ -220,15 +220,19 @@ private void addGeneAnnotation(BioCAnnotation annotation, JCas jCas) throws Miss BioCLocation location = annotation.getTotalLocation(); Gene gene = new Gene(jCas, location.getOffset(), location.getOffset() + location.getLength()); gene.setSpecificType("Gene"); - // 9999 ist the GeNo score for exact matches; GNP only recognized exact dictionary matches and transfers - // their IDs to other forms under certain circumstances (abbreviations, for example) - gene.setConfidence("9999"); - ResourceEntry resourceEntry = new ResourceEntry(jCas, gene.getBegin(), gene.getEnd()); - resourceEntry.setSource("NCBI Gene"); - resourceEntry.setComponentId(GNormPlusFormatMultiplierReader.class.getCanonicalName()); - resourceEntry.setEntryId(geneId.get()); - FSArray resourceEntryList = new FSArray(jCas, 1); - resourceEntryList.set(0, resourceEntry); + // one gene mention might have multiple IDs when there are ranges or enumerations, e.g. 
"IL2-5", "B7-1 and B7-2" or "B7-1/2" + String[] geneIds = geneId.get().split(";"); + FSArray resourceEntryList = new FSArray(jCas, geneIds.length); + for (int i = 0; i < geneIds.length; i++) { + ResourceEntry resourceEntry = new ResourceEntry(jCas, gene.getBegin(), gene.getEnd()); + // 9999 ist the GeNo score for exact matches; GNP only recognized exact dictionary matches and transfers + // their IDs to other forms under certain circumstances (abbreviations, for example) + resourceEntry.setConfidence("9999"); + resourceEntry.setSource("NCBI Gene"); + resourceEntry.setComponentId(GNormPlusFormatMultiplierReader.class.getCanonicalName()); + resourceEntry.setEntryId(geneIds[i]); + resourceEntryList.set(i, resourceEntry); + } gene.setResourceEntryList(resourceEntryList); gene.addToIndexes(); } @@ -247,7 +251,6 @@ private void addFamilyAnnotation(BioCAnnotation annotation, JCas jCas) { } Gene gene = new Gene(jCas, location.getOffset(), location.getOffset() + location.getLength()); gene.setSpecificType("FamilyName"); - gene.setConfidence("9999"); // e.g. NCBITaxonomyID:9606 Optional focusSpecies = annotation.getInfon("FocusSpecies"); if (!focusSpecies.isPresent()) From bfe6bc911374b97e001201695787e5544210339b Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 12 Apr 2022 15:37:52 +0200 Subject: [PATCH 178/269] Minor changes. --- .../jcore/ae/biosem/BioSemEventAnnotator.java | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java b/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java index e263b203f..8a42dd9dc 100644 --- a/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java +++ b/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java @@ -1,11 +1,11 @@ -/** - * +/** + * * Copyright (c) 2017, JULIE Lab. - * All rights reserved. 
This program and the accompanying materials + * All rights reserved. This program and the accompanying materials * are made available under the terms of the BSD-2-Clause License * - * Author: - * + * Author: + * * Description: **/ package de.julielab.jcore.ae.biosem; @@ -237,7 +237,7 @@ private EventMention addEventToIndexes(PData event, Map proteinMap } /** - * + * * @param uimaEvent * The UIMA event annotation to add a new argument to * @param bioSemArg @@ -342,7 +342,7 @@ private void addUimaEventArgument(EventMention uimaEvent, Object bioSemArg, int } /** - * + * * @param uimaEvent * @param uimaArg * @param argPos @@ -383,7 +383,7 @@ private EventTrigger addTriggerToIndexes(Word trg, JCas aJCas) { * ID<tab>Entity-Type[Protein]<tab>start<tab>end<tab>Mention name *
* Example: T3 Protein 166 174 TGF-beta - * + * * @return */ private List getProteinLines(Map proteins, String docId) throws AnnotatorProcessException { @@ -405,7 +405,7 @@ private List getProteinLines(Map proteins, String docId) t /** * Assigns an ID of the form Ti to each gene in the CAS, i * being an enumeration number beginning at 0. - * + * * @param aJCas * @return */ @@ -421,9 +421,7 @@ private Map enumerateProteins(JCas aJCas) { Gene gene = (Gene) geneIt.next(); if (gene.getBegin() < lastEnd) continue; - String id = gene.getId(); - // if (StringUtils.isBlank(id)) - id = "T" + i++; + String id = "T" + i++; gene.setId(id); proteins.put(id, gene); lastEnd = gene.getEnd(); From e514d6103bee2d85dfc54a0d48b131ddb8be2f12 Mon Sep 17 00:00:00 2001 From: khituras Date: Tue, 12 Apr 2022 15:38:34 +0200 Subject: [PATCH 179/269] Resolve a JAR hell issue with commons-io. The solution: Don't use commons-io. --- .../Neo4jRelationsConsumer.java | 62 +++++++++++-------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java index 0a1aaafff..190cf30cd 100644 --- a/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java +++ b/jcore-neo4j-relations-consumer/src/main/java/de/julielab/jcore/consumer/neo4jrelations/Neo4jRelationsConsumer.java @@ -6,6 +6,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.HashMultiset; import com.google.common.collect.Multiset; +import de.julielab.java.utilities.IOStreamUtilities; import de.julielab.jcore.ae.checkpoint.DocumentId; import de.julielab.jcore.ae.checkpoint.DocumentReleaseCheckpoint; import de.julielab.jcore.types.ArgumentMention; @@ -20,7 +21,6 @@ import 
de.julielab.neo4j.plugins.datarepresentation.ImportIETypedRelations; import de.julielab.neo4j.plugins.datarepresentation.constants.ImportIERelations; import org.apache.commons.codec.binary.Base64; -import org.apache.commons.io.IOUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; @@ -44,8 +44,6 @@ import java.util.*; import java.util.stream.StreamSupport; -import static java.nio.charset.StandardCharsets.UTF_8; - @ResourceMetaData(name = "JCoRe Neo4j Relations Consumer", description = "This component assumes that a Neo4j server with an installed julieliab-neo4j-plugins-concepts plugin installed. It then sends FlattenedRelation instances with more then one arguments to Neo4j. Note that this requires the event arguments to have a ResourceEntry list to obtain database concept IDs from.", vendor = "JULIE Lab, Germany", copyright = "JULIE Lab", version = "2.6.0-SNAPSHOT") @TypeCapability(inputs = {"de.julielab.jcore.types.EventMention"}) public class Neo4jRelationsConsumer extends JCasAnnotator_ImplBase { @@ -84,19 +82,24 @@ public class Neo4jRelationsConsumer extends JCasAnnotator_ImplBase { */ @Override public void initialize(final UimaContext aContext) throws ResourceInitializationException { - url = (String) aContext.getConfigParameterValue(PARAM_URL); - idProperty = (String) aContext.getConfigParameterValue(PARAM_ID_PROPERTY); - globalSource = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_SOURCE)).orElse(null); - neo4jUser = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_NEO4J_USER)).orElse(null); - neo4jPassword = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_NEO4J_PASSWORD)).orElse(null); - writeBatchSize = Optional.ofNullable((Integer) aContext.getConfigParameterValue(PARAM_WRITE_BATCH_SIZE)).orElse(50); - om = new ObjectMapper(); - 
om.setSerializationInclusion(JsonInclude.Include.NON_NULL); - om.setSerializationInclusion(JsonInclude.Include.NON_EMPTY); - initImportRelations(); - DocumentReleaseCheckpoint.get().register(Neo4jRelationsConsumer.class.getCanonicalName()); - documentIds = new HashSet<>(); - docNum = 0; + try { + url = (String) aContext.getConfigParameterValue(PARAM_URL); + idProperty = (String) aContext.getConfigParameterValue(PARAM_ID_PROPERTY); + globalSource = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_SOURCE)).orElse(null); + neo4jUser = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_NEO4J_USER)).orElse(null); + neo4jPassword = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_NEO4J_PASSWORD)).orElse(null); + writeBatchSize = Optional.ofNullable((Integer) aContext.getConfigParameterValue(PARAM_WRITE_BATCH_SIZE)).orElse(50); + om = new ObjectMapper(); + om.setSerializationInclusion(JsonInclude.Include.NON_NULL); + om.setSerializationInclusion(JsonInclude.Include.NON_EMPTY); + initImportRelations(); + DocumentReleaseCheckpoint.get().register(Neo4jRelationsConsumer.class.getCanonicalName()); + documentIds = new HashSet<>(); + docNum = 0; + } catch (Throwable e) { + log.error("Could not initialize", e); + throw new ResourceInitializationException(e); + } } private void initImportRelations() { @@ -109,16 +112,23 @@ private void initImportRelations() { */ @Override public void process(final JCas aJCas) throws AnalysisEngineProcessException { - ImportIERelationDocument document = convertRelations(aJCas); - if (!document.getRelations().isEmpty()) - importIERelations.addRelationDocument(document); + try { + ImportIERelationDocument document = convertRelations(aJCas); + if (!document.getRelations().isEmpty()) + importIERelations.addRelationDocument(document); - Optional metaOpt = JCasUtil.select(aJCas, DBProcessingMetaData.class).stream().findAny(); - documentIds.add(metaOpt.isPresent() ? 
new DocumentId(metaOpt.get()) : new DocumentId(JCoReTools.getDocId(aJCas))); + Optional metaOpt = JCasUtil.select(aJCas, DBProcessingMetaData.class).stream().findAny(); + documentIds.add(metaOpt.isPresent() ? new DocumentId(metaOpt.get()) : new DocumentId(JCoReTools.getDocId(aJCas))); - if (documentIds.size() % writeBatchSize == 0) { - log.trace("Document nr {} processed, sending batch nr {} of size {} to database.", docNum, docNum / writeBatchSize, writeBatchSize); - batchProcessComplete(); + if (documentIds.size() % writeBatchSize == 0) { + log.trace("Document nr {} processed, sending batch nr {} of size {} to database.", docNum, docNum / writeBatchSize, writeBatchSize); + batchProcessComplete(); + } + } catch (Throwable e) { + log.error("Exception occurred in document {}", JCoReTools.getDocId(aJCas), e); + if (!(e instanceof AnalysisEngineProcessException)) + throw new AnalysisEngineProcessException(e); + throw e; } } @@ -187,12 +197,12 @@ private void sendRelationsToNeo4j() throws AnalysisEngineProcessException { g.close(); } try (InputStream inputStream = urlConnection.getInputStream()) { - log.debug("Response from Neo4j: {}", IOUtils.toString(inputStream, UTF_8)); + log.debug("Response from Neo4j: {}", IOStreamUtilities.getStringFromInputStream(inputStream)); } catch (IOException e) { log.error("Exception occurred while sending relation data to Neo4j server."); try (InputStream inputStream = urlConnection.getErrorStream()) { if (inputStream != null) - log.error("Error from Neo4j: {}", IOUtils.toString(inputStream, UTF_8)); + log.error("Error from Neo4j: {}", IOStreamUtilities.getStringFromInputStream(inputStream)); } throw e; } From ba3fea8169e14f2db55a77a8b29affdaa39e45d3 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 20 Apr 2022 12:06:14 +0200 Subject: [PATCH 180/269] Add an MMAX2 reader. Fixes #136. 
--- jcore-mmax2-reader/LICENSE | 26 ++ jcore-mmax2-reader/README.md | 34 ++ jcore-mmax2-reader/pom.xml | 73 ++++ .../julielab/jcore/cr/mmax2/MMAX2Reader.java | 323 ++++++++++++++++++ .../cr/mmax2/desc/jcore-mmax2-reader.xml | 54 +++ .../jcore/cr/mmax2/MMAX2ReaderTest.java | 64 ++++ .../resources/input/mmax_26000/Basedata.uri | 1 + .../input/mmax_26000/Basedata/Basedata.xml | 240 +++++++++++++ .../input/mmax_26000/Basedata/words.dtd | 3 + .../mmax_26000/Customizations/proteins.xml | 72 ++++ .../mmax_26000/Customizations/sentence.xml | 3 + .../input/mmax_26000/Markables/markables.dtd | 2 + .../input/mmax_26000/Markables/proteins.xml | 20 ++ .../input/mmax_26000/Markables/sentence.xml | 14 + .../input/mmax_26000/Schemes/proteins.xml | 16 + .../input/mmax_26000/Schemes/sentence.xml | 3 + .../input/mmax_26000/Styles/default_style.xsl | 58 ++++ .../input/mmax_26000/common_paths.xml | 17 + .../resources/input/mmax_26000/project.mmax | 7 + .../src/test/resources/originalText/10048764 | 2 + 20 files changed, 1032 insertions(+) create mode 100644 jcore-mmax2-reader/LICENSE create mode 100644 jcore-mmax2-reader/README.md create mode 100644 jcore-mmax2-reader/pom.xml create mode 100644 jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java create mode 100644 jcore-mmax2-reader/src/main/resources/de/julielab/jcore/cr/mmax2/desc/jcore-mmax2-reader.xml create mode 100644 jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java create mode 100644 jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata.uri create mode 100644 jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata/Basedata.xml create mode 100644 jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata/words.dtd create mode 100644 jcore-mmax2-reader/src/test/resources/input/mmax_26000/Customizations/proteins.xml create mode 100644 jcore-mmax2-reader/src/test/resources/input/mmax_26000/Customizations/sentence.xml create mode 100644 
jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/markables.dtd create mode 100644 jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/proteins.xml create mode 100644 jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/sentence.xml create mode 100644 jcore-mmax2-reader/src/test/resources/input/mmax_26000/Schemes/proteins.xml create mode 100644 jcore-mmax2-reader/src/test/resources/input/mmax_26000/Schemes/sentence.xml create mode 100644 jcore-mmax2-reader/src/test/resources/input/mmax_26000/Styles/default_style.xsl create mode 100644 jcore-mmax2-reader/src/test/resources/input/mmax_26000/common_paths.xml create mode 100644 jcore-mmax2-reader/src/test/resources/input/mmax_26000/project.mmax create mode 100644 jcore-mmax2-reader/src/test/resources/originalText/10048764 diff --git a/jcore-mmax2-reader/LICENSE b/jcore-mmax2-reader/LICENSE new file mode 100644 index 000000000..fbbd41e05 --- /dev/null +++ b/jcore-mmax2-reader/LICENSE @@ -0,0 +1,26 @@ +BSD 2-Clause License + +Copyright (c) 2017, JULIE Lab +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/jcore-mmax2-reader/README.md b/jcore-mmax2-reader/README.md new file mode 100644 index 000000000..2cacbc00a --- /dev/null +++ b/jcore-mmax2-reader/README.md @@ -0,0 +1,34 @@ +# JCoRe Component Skeleton +`Text that describes the component in brevity...` + +**Descriptor Path**: +``` +de.julielab.jcore.{reader, ae, consumer}.NAME.desc.ARTIFACT-NAME +``` + +`More thorough description` +`Are there any requirements or dependencies for this component?` + +**1. Parameters** + +| Parameter Name | Parameter Type | Mandatory | Multivalued | Description | +|----------------|----------------|-----------|-------------|-------------| +| param1 | UIMA-Type | Boolean | Boolean | Description | +| param2 | UIMA-Type | Boolean | Boolean | Description | + +**2. Predefined Settings** + +| Parameter Name | Parameter Syntax | Example | +|----------------|------------------|---------| +| param1 | Syntax-Description | `Example` | +| param2 | Syntax-Description | `Example` | + +**3. Capabilities** + +| Type | Input | Output | +|------|:-----:|:------:| +| de.julielab.jcore.types.TYPE | | `+` | +| de.julielab.jcore.types.ace.TYPE | `+` | | + + +[1] Some Literature? 
diff --git a/jcore-mmax2-reader/pom.xml b/jcore-mmax2-reader/pom.xml new file mode 100644 index 000000000..39f6d714e --- /dev/null +++ b/jcore-mmax2-reader/pom.xml @@ -0,0 +1,73 @@ + + + + 4.0.0 + jcore-mmax2-reader + jar + de.julielab + + + de.julielab + jcore-base + 2.6.0-SNAPSHOT + + + 2.6.0-SNAPSHOT + + + + ch.qos.logback + logback-classic + test + + + org.slf4j + slf4j-api + + + de.julielab + jcore-descriptor-creator + + + de.julielab + jcore-types + ${jcore-types-version} + + + de.julielab + julielab-mmax-to-iob-iexml-converter + 1.0.2-SNAPSHOT + + + org.apache.commons + commons-lang3 + + + de.julielab + jcore-utilities + ${jcore-utilities-version} + + + org.assertj + assertj-core + + + org.junit.jupiter + junit-jupiter-engine + + + JCoRe MMAX2 reader. + + JULIE Lab Jena, Germany + http://www.julielab.de + + https://github.com/JULIELab/jcore-base/tree/master/jcore-mmax2-reader + Collection reader for MMAX2 annotation projects. + + + BSD 2-Clause + https://opensource.org/licenses/BSD-2-Clause + + + diff --git a/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java b/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java new file mode 100644 index 000000000..fa09f4c69 --- /dev/null +++ b/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java @@ -0,0 +1,323 @@ +package de.julielab.jcore.cr.mmax2; + +import de.julielab.jcore.types.ConceptMention; +import de.julielab.jcore.types.Gene; +import de.julielab.jcore.types.Token; +import de.julielab.jcore.utility.JCoReAnnotationTools; +import de.julielab.jules.mmax.MarkableContainer; +import de.julielab.jules.mmax.Statistics; +import de.julielab.jules.mmax.WordInformation; +import org.apache.uima.UimaContext; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.ResourceMetaData; 
+import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Progress; +import org.apache.uima.util.ProgressImpl; +import org.eml.MMAX2.annotation.markables.Markable; +import org.eml.MMAX2.discourse.MMAX2Discourse; +import org.eml.MMAX2.discourse.MMAX2DiscourseElement; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.util.*; +import java.util.function.Predicate; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +@ResourceMetaData(name = "JCoRe MMAX2 reader.", description = "Collection reader for MMAX2 annotation projects.", vendor = "JULIE Lab Jena, Germany") +public class MMAX2Reader extends JCasCollectionReader_ImplBase { + + public static final String PARAM_INPUT_DIR = "InputDir"; + public static final String PARAM_ANNOTATION_LEVELS = "AnnotationLevels"; + public static final String PARAM_ORIGINAL_TEXT_FILES = "OriginalTextFiles"; + public static final String PARAM_UIMA_ANNOTATION_TYPES = "UimaAnnotationTypes"; + private final static Logger log = LoggerFactory.getLogger(MMAX2Reader.class); + @ConfigurationParameter(name = PARAM_INPUT_DIR, description = "Should point to the directory of which the MMAX2 projects are sub directories of.") + private String inputDir; + @ConfigurationParameter(name = PARAM_ANNOTATION_LEVELS, description = "The names of the MMAX2 annotation levels to create annotations for.") + private String[] annotationLevels; + @ConfigurationParameter(name = PARAM_UIMA_ANNOTATION_TYPES, description = "The fully qualified names of the UIMA annotation types to be used for the representation of the input annotation level. Must match the indices of " + PARAM_ANNOTATION_LEVELS + ", i.e. 
the ith level will be added to the CAS as the ith type.") + private String[] uimaTypeNames; + @ConfigurationParameter(name = PARAM_ORIGINAL_TEXT_FILES, mandatory = false, description = "The MMAX2 base data consists of tokenized text and does not keep track of the original text. This parameter should point to a directory containing the original text files. The file names should match the MMAX2 project IDs.") + private String originalTextFilesDir; + + private LinkedList folderList; + private String actualPath; + private HashMap levels2uimaNames; + private List> uimaAnnotationClasses; + private int numDocuments; + + /** + * This method is called a single time by the framework at component + * creation. Here, descriptor parameters are read and initial setup is done. + */ + @Override + public void initialize(UimaContext context) throws ResourceInitializationException { + super.initialize(context); + inputDir = (String) context.getConfigParameterValue(PARAM_INPUT_DIR); + annotationLevels = (String[]) context.getConfigParameterValue(PARAM_ANNOTATION_LEVELS); + uimaTypeNames = (String[]) getUimaContext().getConfigParameterValue(PARAM_UIMA_ANNOTATION_TYPES); + originalTextFilesDir = (String) context.getConfigParameterValue(PARAM_ORIGINAL_TEXT_FILES); + actualPath = null; + if (annotationLevels.length != uimaTypeNames.length) + throw new IllegalArgumentException("The number of annotation levels and the number of UIMA type names must match. 
But the given annotation levels are '" + Arrays.toString(annotationLevels) + "' and the UIMA types names are '" + Arrays.toString(uimaTypeNames) + "'."); + try { + uimaAnnotationClasses = Arrays.stream(uimaTypeNames).map(name -> { + try { + return Class.forName(name); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + }).collect(Collectors.toList()); + } catch (Exception e) { + log.error("Could not initialize UIMA annotation classes from parameter values {}", Arrays.toString(uimaTypeNames)); + throw new ResourceInitializationException(e); + } + levels2uimaNames = IntStream.range(0, annotationLevels.length).collect(HashMap::new, (m, i) -> m.put(annotationLevels[i], uimaTypeNames[i]), (m1, m2) -> m1.putAll(m2)); + setUpFolderList(); + } + + private void setUpFolderList() throws ResourceInitializationException { + folderList = new LinkedList<>(); + if (!inputDir.endsWith(File.separator)) + this.inputDir += File.separator; + + File rootX = new File(inputDir); + + if (!rootX.exists()) { + File dir1 = new File("."); + try { + rootX = new File(dir1.getCanonicalPath() + inputDir); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + if (!rootX.exists()) { + log.error("{} does not exist", inputDir); + throw new ResourceInitializationException(new IllegalArgumentException(inputDir + " does not exist")); + } + } + + for (String rootFolder : rootX.list()) { + if (!rootFolder.endsWith(File.separator)) + rootFolder += File.separator; + File root = new File(inputDir + rootFolder); + if (root.isDirectory()) { + this.folderList.add(root); + } + } + numDocuments = folderList.size(); + } + + private String getPMID() throws CollectionException { + try { + FileInputStream fstream = new FileInputStream(this.actualPath + "Basedata.uri"); + // Get the object of DataInputStream + DataInputStream in = new DataInputStream(fstream); + BufferedReader br = new BufferedReader(new InputStreamReader(in)); + String strLine; + // Read File Line By 
Line + int count = 0; + String pmid = ""; + while ((strLine = br.readLine()) != null) { + count++; + pmid = strLine; + } + if (count > 1) { + log.error("unknown data in {}Basedata.uri", actualPath); + System.exit(1); + return null; + } + return pmid; + } catch (IOException e) { + log.error("Error while parsing {}Basedata.uri", actualPath); + throw new CollectionException(e); + } + } + + /** + * This method is called for each document going through the component. This + * is where the actual work happens. + */ + @Override + public void getNext(JCas jCas) throws CollectionException { + Statistics.projects++; + actualPath = this.folderList.poll().getAbsolutePath() + "/"; + // rename style file from default_style.xsl to generic_nongui_style.xsl + // (necessary for api use) + File style = new File(actualPath + "Styles/default_style.xsl"); + style.renameTo(new File(actualPath + "Styles/generic_nongui_style.xsl")); + + File mmaxfile = new File(actualPath + "project.mmax"); + MMAX2Discourse discourse = MMAX2Discourse.buildDiscourse(mmaxfile.getAbsolutePath()); + + // text from basedata with spaces between all words + String documentText = discourse.getNextDocumentChunk(); + + WordInformation[] words = new WordInformation[discourse.getDiscourseElementCount()]; + + int textPosition = 0; + // Words from basedata + for (MMAX2DiscourseElement elem : discourse.getDiscourseElements()) { + WordInformation word = new WordInformation(); + word.setId(elem.getID()); + int discoursePosition = elem.getDiscoursePosition(); + word.setPosition(discoursePosition); + StringBuilder textBuilder = new StringBuilder(); + int end = discourse.getDisplayEndPositionFromDiscoursePosition(discoursePosition); + for (textPosition = discourse.getDisplayStartPositionFromDiscoursePosition(discoursePosition); textPosition <= end; textPosition++) { + textBuilder.append(documentText.charAt(textPosition)); + } + word.setText(textBuilder.toString()); + words[discoursePosition] = word; + } + + 
this.produceOutput(discourse, words, jCas); + + // set stylefile back to normal + style = new File(actualPath + "Styles/generic_nongui_style.xsl"); + style.renameTo(new File(actualPath + "Styles/default_style.xsl")); + + Statistics.projects++; + } + + private void produceOutput(MMAX2Discourse discourse, WordInformation[] words, JCas jCas) throws CollectionException { + StringBuilder out = new StringBuilder(); + StringBuilder outPlain = new StringBuilder(); + String pmid = this.getPMID(); + if (originalTextFilesDir != null && this.originalTextFilesDir.length() > 0) + this.handleOriginalTextInformation(pmid, words); + + Map pos2offsets = new HashMap<>(); + + for (int i = 0; i < words.length; i++) { + WordInformation word = words[i]; + + Token token = new Token(jCas, outPlain.length(), outPlain.length() + word.getText().length()); + token.setComponentId(getClass().getCanonicalName()); + token.addToIndexes(); + pos2offsets.put(word.getPosition(), token); + + outPlain.append(word.getText()); + if (word.isFollowedBySpace()) { + out.append(" "); + outPlain.append(" "); + } + } + for (int i = 0; i < annotationLevels.length; ++i) { + Iterator iterator = discourse.getMarkableLevelByName(annotationLevels[i], false).getMarkables().stream().map(Markable.class::cast).filter(Predicate.not(Markable::isDiscontinuous)).iterator(); + while (iterator.hasNext()) { + Markable markable = iterator.next(); + int beginPosition = markable.getLeftmostDiscoursePosition(); + int endPosition = markable.getRightmostDiscoursePosition(); + int beginOffset = pos2offsets.get(beginPosition).getBegin(); + int endOffset = pos2offsets.get(endPosition).getEnd(); + Annotation a; + try { + a = JCoReAnnotationTools.getAnnotationByClassName(jCas, uimaTypeNames[i]); + } catch (Exception e) { + throw new CollectionException(e); + } + a.setBegin(beginOffset); + a.setEnd(endOffset); + if (a instanceof ConceptMention) + ((ConceptMention) 
a).setSpecificType(markable.getAttributeValue(markable.getMarkableLevelName())); + a.addToIndexes(); + } + } + for (WordInformation word : words) { + for (MarkableContainer mc : word.getMarkables()) { + int beginPosition = mc.getBegin(); + if (beginPosition == word.getPosition()) { + int endPosition = mc.getEnd(); + int beginOffset = pos2offsets.get(beginPosition).getBegin(); + int endOffset = pos2offsets.get(endPosition).getEnd(); + Gene gene = new Gene(jCas, beginOffset, endOffset); + gene.addToIndexes(); + } + } + } + String textPlain = outPlain.toString(); + jCas.setDocumentText(textPlain); + } + + private void handleOriginalTextInformation(String pmid, WordInformation[] words) throws CollectionException { + if (originalTextFilesDir.length() > 0 && !originalTextFilesDir.endsWith("/")) + originalTextFilesDir += File.separator; + + File file = new File(originalTextFilesDir + pmid); + if (!file.exists()) { + log.warn("no original File found for {} using only mmax text.", pmid); + return; + } + try { + FileInputStream fis = new FileInputStream(file); + InputStreamReader isr = new InputStreamReader(fis); + int wordCounter = 0; + int i; + try { + WordInformation actualWord = words[wordCounter]; + String actualText = actualWord.getText(); + actualWord.setFollowedBySpace(false); + int wordCharCounter = 0; + while ((i = isr.read()) >= 0) { + if (wordCharCounter >= actualText.length()) { + wordCounter++; + if (wordCounter < words.length) { + actualWord = words[wordCounter]; + actualText = actualWord.getText(); + actualWord.setFollowedBySpace(false); + wordCharCounter = 0; + } else { + if (!Character.isWhitespace(i)) { + log.warn("original Text contains more words than mmax information"); + } + return; + } + } + + if (actualText.charAt(wordCharCounter) == i || Character.toLowerCase(actualText.charAt(wordCharCounter)) == Character.toLowerCase(i)) { + wordCharCounter++; + } else { + if (!Character.isWhitespace(i)) { + log.warn("there is a non whitespace character different 
in original text at document {} critical character is '{}' near word '{}' (MMAX2 word ID {})", pmid, i, actualText, actualWord.getId()); + } else { + words[wordCounter - 1].setFollowedBySpace(true); + } + } + } + isr.close(); + } catch (IOException e) { + log.error("Error attempting to read original text file ", e); + throw new CollectionException(e); + } + } catch (Exception e) { + log.error("Error attempting to read original text file", e); + if (e instanceof CollectionException) + throw (CollectionException) e; + throw new CollectionException(e); + } + } + + @Override + public void close() { + // nothing to do + } + + @Override + public Progress[] getProgress() { + return new Progress[]{new ProgressImpl(numDocuments - folderList.size(), numDocuments, "document")}; + } + + @Override + public boolean hasNext() { + return !this.folderList.isEmpty(); + } + + +} diff --git a/jcore-mmax2-reader/src/main/resources/de/julielab/jcore/cr/mmax2/desc/jcore-mmax2-reader.xml b/jcore-mmax2-reader/src/main/resources/de/julielab/jcore/cr/mmax2/desc/jcore-mmax2-reader.xml new file mode 100644 index 000000000..8f3289029 --- /dev/null +++ b/jcore-mmax2-reader/src/main/resources/de/julielab/jcore/cr/mmax2/desc/jcore-mmax2-reader.xml @@ -0,0 +1,54 @@ + + + org.apache.uima.java + de.julielab.jcore.cr.mmax2.MMAX2Reader + + JCoRe MMAX2 reader. + Collection reader for MMAX2 annotation projects. + JULIE Lab Jena, Germany + + + InputDir + Should point to the directory of which the MMAX2 projects are sub directories of. + String + false + true + + + AnnotationLevels + The names of the MMAX2 annotation levels to create annotations for. + String + true + true + + + UimaAnnotationTypes + The fully qualified names of the UIMA annotation types to be used for the representation of the input annotation level. Must match the indices of AnnotationLevels, i.e. the ith level will be added to the CAS as the ith type. 
+ String + true + true + + + OriginalTextFiles + The MMAX2 base data consists of tokenized text and does not keep track of the original text. This parameter should point to a directory containing the original text files. The file names should match the MMAX2 project IDs. + String + false + false + + + + + + + + + + + + + true + false + true + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java b/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java new file mode 100644 index 000000000..410b42ed1 --- /dev/null +++ b/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java @@ -0,0 +1,64 @@ +package de.julielab.jcore.cr.mmax2; + +import de.julielab.jcore.types.Protein; +import de.julielab.jcore.types.Sentence; +import de.julielab.jcore.types.Token; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.fit.factory.CollectionReaderFactory; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.junit.jupiter.api.Test; + +import java.nio.file.Path; +import java.util.Collection; + +import static org.assertj.core.api.Assertions.assertThat; +/** + * Unit tests for jcore-mmax2-reader. 
+ * + * @author + */ +public class MMAX2ReaderTest { + + @Test + public void testReader() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types"); + CollectionReader reader = CollectionReaderFactory.createReader("de.julielab.jcore.cr.mmax2.desc.jcore-mmax2-reader", + MMAX2Reader.PARAM_INPUT_DIR, Path.of("src", "test", "resources", "input").toString(), + MMAX2Reader.PARAM_ANNOTATION_LEVELS, new String[]{"proteins", "sentence"}, + MMAX2Reader.PARAM_UIMA_ANNOTATION_TYPES, new String[]{"de.julielab.jcore.types.Protein", "de.julielab.jcore.types.Sentence"}); + assertThat(reader.hasNext()).isTrue(); + reader.getNext(jCas.getCas()); + // the text should be tokenized because we did not provide the original text + assertThat(jCas.getDocumentText()).startsWith("Characterization of antihuman IFNAR-1 monoclonal antibodies : epitope localization and functional analysis ."); + Collection proteins = JCasUtil.select(jCas, Protein.class); + assertThat(proteins).hasSize(16); + assertThat(proteins).map(Protein::getCoveredText).contains("IFNAR-1", "type I interferon receptor", "HuIFNAR-1", "Stat"); + Collection sentences = JCasUtil.select(jCas, Sentence.class); + assertThat(sentences).hasSize(10); + Collection tokens = JCasUtil.select(jCas, Token.class); + // check a small sample of tokens that should have been created + assertThat(tokens).map(Token::getCoveredText).contains("Characterization", "IFNAR-1", ":", "(", "subunits", "recognition", ".", "HuIFNAR-1"); + } + + @Test + public void testReaderOriginalText() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types"); + CollectionReader reader = CollectionReaderFactory.createReader("de.julielab.jcore.cr.mmax2.desc.jcore-mmax2-reader", + MMAX2Reader.PARAM_INPUT_DIR, Path.of("src", "test", "resources", 
"input").toString(), + MMAX2Reader.PARAM_ORIGINAL_TEXT_FILES, Path.of("src", "test", "resources", "originalText").toString(), + MMAX2Reader.PARAM_ANNOTATION_LEVELS, new String[]{"proteins"}, + MMAX2Reader.PARAM_UIMA_ANNOTATION_TYPES, new String[]{"de.julielab.jcore.types.Protein"}); + assertThat(reader.hasNext()).isTrue(); + reader.getNext(jCas.getCas()); + // in this test, the text should not appear tokenized but arranged according to the original text + assertThat(jCas.getDocumentText()).startsWith("Characterization of antihuman IFNAR-1 monoclonal antibodies: epitope localization and functional analysis."); + Collection proteins = JCasUtil.select(jCas, Protein.class); + assertThat(proteins).hasSize(16); + assertThat(proteins).map(Protein::getCoveredText).contains("IFNAR-1", "type I interferon receptor", "HuIFNAR-1", "Stat"); + Collection tokens = JCasUtil.select(jCas, Token.class); + // check a small sample of tokens that should have been created + assertThat(tokens).map(Token::getCoveredText).contains("Characterization", "IFNAR-1", ":", "(", "subunits", "recognition", ".", "HuIFNAR-1"); + } +} diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata.uri b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata.uri new file mode 100644 index 000000000..134fd8e79 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata.uri @@ -0,0 +1 @@ +10048764 diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata/Basedata.xml b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata/Basedata.xml new file mode 100644 index 000000000..cd5e3c8a3 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata/Basedata.xml @@ -0,0 +1,240 @@ + + + +Characterization +of +antihuman +IFNAR-1 +monoclonal +antibodies +: +epitope +localization +and +functional +analysis +. 
+The +type +I +interferon +receptor +( +IFNAR +) +is +composed +of +two +subunits +, +IFNAR-1 +and +IFNAR-2 +, +encoding +transmembrane +polypeptides +. +IFNAR-2 +has +a +dominant +role +in +ligand +binding +, +but +IFNAR-1 +contributes +to +binding +affinity +and +to +differential +ligand +recognition +. +A +panel +of +five +monoclonal +antibodies +( +mAb +) +to +human +IFNAR-1 +( +HuIFNAR-1 +) +was +produced +and +characterized +. +The +reactivity +of +each +mAb +toward +HuIFNAR-1 +on +native +and +transfected +cells +and +in +Western +blot +and +ELISA +formats +was +determined +. +In +functional +assays +, +one +mAb +, +EA12 +, +blocked +IFN-a2 +binding +to +human +cells +and +interfered +with +Stat +activation +and +antiviral +activity +. +Epitopes +for +the +mAb +were +localized +to +subdomains +of +the +HuIFNAR-1 +extracellular +domain +by +differential +reactivity +of +the +mAb +to +a +series +of +human +/ +bovine +IFNAR-1 +chimeras +. +The +antibody +EA12 +seems +to +require +native +HuIFNAR-1 +for +reactivity +and +does +not +map +to +a +single +subdomain +, +perhaps +recognizing +an +epitope +containing +noncontiguous +sequences +in +at +least +two +subdomains +. +In +contrast +, +the +epitopes +of +the +non +- +neutralizing +mAb +FB2 +, +AA3 +, +and +GB8 +mapped +, +respectively +, +to +the +first +, +second +, +and +third +subdomains +of +HuIFNAR-1 +. +The +mAb +DB2 +primarily +maps +to +the +fourth +subdomain +, +although +its +reactivity +may +be +affected +by +other +determinants +. 
+ diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata/words.dtd b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata/words.dtd new file mode 100644 index 000000000..a02b470f1 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Basedata/words.dtd @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Customizations/proteins.xml b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Customizations/proteins.xml new file mode 100644 index 000000000..0f4bd71f8 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Customizations/proteins.xml @@ -0,0 +1,72 @@ + + + + + + + + + + + + + + + + + diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Customizations/sentence.xml b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Customizations/sentence.xml new file mode 100644 index 000000000..6fbf9d136 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Customizations/sentence.xml @@ -0,0 +1,3 @@ + + + diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/markables.dtd b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/markables.dtd new file mode 100644 index 000000000..220e8b3c8 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/markables.dtd @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/proteins.xml b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/proteins.xml new file mode 100644 index 000000000..46c822f8d --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/proteins.xml @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/sentence.xml 
b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/sentence.xml new file mode 100644 index 000000000..9a91c925b --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Markables/sentence.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Schemes/proteins.xml b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Schemes/proteins.xml new file mode 100644 index 000000000..1045dc27e --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Schemes/proteins.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Schemes/sentence.xml b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Schemes/sentence.xml new file mode 100644 index 000000000..f37fbc936 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Schemes/sentence.xml @@ -0,0 +1,3 @@ + + + diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Styles/default_style.xsl b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Styles/default_style.xsl new file mode 100644 index 000000000..ab671aa34 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/Styles/default_style.xsl @@ -0,0 +1,58 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/common_paths.xml b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/common_paths.xml new file mode 100644 index 000000000..8f55971b4 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/common_paths.xml @@ -0,0 +1,17 @@ + + + +Basedata/ +Markables/ +Schemes/ +Styles/ +Customizations/ +default_style.xsl + + +proteins.xml +sentence.xml + + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input/mmax_26000/project.mmax 
b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/project.mmax new file mode 100644 index 000000000..52fc0b1c1 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input/mmax_26000/project.mmax @@ -0,0 +1,7 @@ + + + +Basedata.xml + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/originalText/10048764 b/jcore-mmax2-reader/src/test/resources/originalText/10048764 new file mode 100644 index 000000000..2db1f6185 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/originalText/10048764 @@ -0,0 +1,2 @@ +Characterization of antihuman IFNAR-1 monoclonal antibodies: epitope localization and functional analysis. +The type I interferon receptor (IFNAR) is composed of two subunits, IFNAR-1 and IFNAR-2, encoding transmembrane polypeptides. IFNAR-2 has a dominant role in ligand binding, but IFNAR-1 contributes to binding affinity and to differential ligand recognition. A panel of five monoclonal antibodies (mAb) to human IFNAR-1 (HuIFNAR-1) was produced and characterized. The reactivity of each mAb toward HuIFNAR-1 on native and transfected cells and in Western blot and ELISA formats was determined. In functional assays, one mAb, EA12, blocked IFN-a2 binding to human cells and interfered with Stat activation and antiviral activity. Epitopes for the mAb were localized to subdomains of the HuIFNAR-1 extracellular domain by differential reactivity of the mAb to a series of human/bovine IFNAR-1 chimeras. The antibody EA12 seems to require native HuIFNAR-1 for reactivity and does not map to a single subdomain, perhaps recognizing an epitope containing noncontiguous sequences in at least two subdomains. In contrast, the epitopes of the non-neutralizing mAb FB2, AA3, and GB8 mapped, respectively, to the first, second, and third subdomains of HuIFNAR-1. The mAb DB2 primarily maps to the fourth subdomain, although its reactivity may be affected by other determinants. 
From 8b1f50431b611c14d30b3d3ad8a19e072623cee6 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 20 Apr 2022 12:07:18 +0200 Subject: [PATCH 181/269] Add a convenience method to the ES consumer for parallel feature paths and filters. The old methods would always apply all filters to all feature paths instead of assuming a 1:1 relationship. --- .../consumer/es/AbstractFieldGenerator.java | 46 +- pom.xml | 662 ++++++------------ 2 files changed, 275 insertions(+), 433 deletions(-) diff --git a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/AbstractFieldGenerator.java b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/AbstractFieldGenerator.java index 287cd68ea..44dd6d012 100644 --- a/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/AbstractFieldGenerator.java +++ b/jcore-elasticsearch-consumer/src/main/java/de/julielab/jcore/consumer/es/AbstractFieldGenerator.java @@ -730,10 +730,12 @@ public ArrayFieldValue createRawFieldValueForAnnotations(FeatureStructure[] a, S /** * Applies the - * {@link #createRawFieldValueForAnnotation(FeatureStructure, String[], Filter[]) + * {@link #createRawFieldValueForAnnotation(FeatureStructure, String[], Filter[])} * method to all feature structures in fss. Thus, the feature paths and * filters are expected to be parallel: Each feature path has its own - * filter. If the filters array is shorter than the feature paths array, the + * filter. But: The feature paths and filters are applied to all feature structures. + * See {@link #createRawFieldValueForParallelAnnotations(FeatureStructure[], String[], Filter[], Filter)} to apply the ith feature path to the ith feature structure. + * If the filters array is shorter than the feature paths array, the * missing filters will be treated as if they were null. Finally, after all * values have been created in this way, if the overallFilter is not * null, it will be applied to all resulting values. 
It will be reset once @@ -772,6 +774,46 @@ public ArrayFieldValue createRawFieldValueForAnnotations(FeatureStructure[] fss, return arrayFieldValue; } + /** + * Calls {@link #createRawFieldValueForAnnotation(FeatureStructure, String, Filter)} for all tuples + *
+	 * (fss[i], featurePaths[i], filters[i]), i in {0,..,fss.length-1}
+	 * 
, thus handling feature structures, feature paths and filters separately for each index. fss and + * featurePaths must be non-null and of equal length. filters may be null or shorter. The + * overallFilter will be applied to all values resulting from the previous process. + * @param fss + * @param featurePaths + * @param filters + * @param overallFilter + * @return + * @throws CASException + */ + public ArrayFieldValue createRawFieldValueForParallelAnnotations(FeatureStructure[] fss, String[] featurePaths, + Filter[] filters, Filter overallFilter) throws CASException { + ArrayFieldValue arrayFieldValue = new ArrayFieldValue(); + for (int i = 0; i < fss.length; i++) { + FeatureStructure annotation = fss[i]; + IFieldValue fieldValueForAnnotation = createRawFieldValueForAnnotation(annotation, featurePaths[i], + filters != null && i < filters.length ? filters[i] : null); + arrayFieldValue.addFlattened(fieldValueForAnnotation); + } + if (null != overallFilter) { + overallFilter.reset(); + ArrayFieldValue filteredArrayFieldValue = new ArrayFieldValue(); + for (IFieldValue fieldValue : arrayFieldValue) { + RawToken token = (RawToken) fieldValue; + String tokenString = String.valueOf(token.token); + List filteredTokens = overallFilter.filter(tokenString); + if (!filteredTokens.isEmpty()) { + for (String filteredToken : filteredTokens) + filteredArrayFieldValue.add(new RawToken(filteredToken)); + } + } + arrayFieldValue = filteredArrayFieldValue; + } + return arrayFieldValue; + } + /** * Creates a single array of all field values derived by the given feature paths * and filters. The filters array is taken to be parallel to diff --git a/pom.xml b/pom.xml index 84fad31a2..4b6553f98 100644 --- a/pom.xml +++ b/pom.xml @@ -1,550 +1,350 @@ - - - - + + 4.0.0 - - - - + + - - - - + + de.julielab - - - - + + jcore-parent - - - - + + 2.5.2-SNAPSHOT - - - - + + - - - - + + jcore-base - - - - + + pom - - - - + + JCoRe Base - - - - + + The POM for the JCoRe Base projects. 
- - - - + + 2.6.0-SNAPSHOT - - - - + + - - - - + + JULIE Lab, Germany - - - - + + http://www.julielab.de - - - - + + - - - - + + - - - - + + - - - - + + BSD-2-Clause - - - - + + https://opensource.org/licenses/BSD-2-Clause - - - - + + - - - - + + - - - - + + https://github.com/JULIELab/jcore-base - - - - + + - - - - + + - - - - + + org.apache.uima - - - - + + uimaj-core - - - - + + ${uima-version} - - - - + + - - - - + + - - - - + + org.apache.uima - - - - + + uimafit-core - - - - + + ${uimafit-version} - - - - + + - - - - + + - - - - + + - - - + + jcore-annotation-adder-ae - - - + + jcore-ace-reader - - - - + + jcore-acronym-ae - - - + + jcore-acronym-writer - - - - + + jcore-banner-ae - - - + + jcore-bc2gm-reader - - - + + jcore-bc2gmformat-writer - - - + + jcore-biolemmatizer-ae - - - - + + jcore-bionlpformat-consumer - - - - + + jcore-bionlpformat-reader - - - - + + jcore-biosem-ae - - - - + + jcore-conll-consumer - - - - + + jcore-coordination-baseline-ae - - - + + jcore-cord19-reader - - - + + jcore-coreference-writer - - - + + jcore-ct-reader - - - + + jcore-db-checkpoint-ae - - - + + jcore-descriptor-creator - - - + + jcore-dta-reader - - - - + + jcore-ec-code-ae - - - - + + jcore-elasticsearch-consumer - - - - + + jcore-embedding-writer - - - - + + jcore-event-flattener-ae - - - - + + jcore-feature-value-replacement-ae - - - - + + jcore-file-reader - - - - + + jcore-flair-ner-ae - - - + + jcore-flair-token-embedding-ae - - - + + jcore-flow-controllers - + + jcore-gnp-bioc-reader - + + jcore-gnp-bioc-writer - - + + jcore-iexml-consumer - - - - + + jcore-iexml-reader - - - - + + jcore-ign-reader - - - - + + jcore-iob-consumer - - - - + + jcore-jnet-ae - - - - + + jcore-jpos-ae - - - - + + jcore-jsbd-ae - - - - + + jcore-jtbd-ae - - - - + + jcore-julielab-entity-evaluator-consumer - - - - + + jcore-likelihood-assignment-ae - - - - + + jcore-likelihood-detection-ae - - - + + jcore-line-multiplier - - - + + jcore-lingpipegazetteer-ae - - - - + + 
jcore-lingpipe-porterstemmer-ae - - - - + + jcore-lingscope-ae - - - - + + jcore-linnaeus-species-ae - - - - + + jcore-mantra-xml-types - - - - + + jcore-medxn-ae - - - - + + jcore-msdoc-reader - - - - + + jcore-mstparser-ae - - - - + + jcore-muc7-reader - - - - + + jcore-mutationfinder-ae - - - + + jcore-neo4j-relations-consumer - - - - + + jcore-opennlp-chunk-ae - - - - + + jcore-opennlp-parser-ae - - - - + + jcore-opennlp-postag-ae - - - - + + jcore-opennlp-sentence-ae - - - - + + jcore-opennlp-token-ae - - - + + jcore-ppd-writer - - - + + jcore-pmc-reader - - - - + + jcore-pubtator-reader - - - - + + jcore-stanford-lemmatizer-ae - - - - + + jcore-topic-indexing-ae - - - - + + jcore-topics-writer - - - - + + jcore-txt-consumer - - - - + + jcore-types - - - - + + jcore-utilities - - - - + + jcore-xml-mapper - - - - + + jcore-xml-reader - - - - + + jcore-xmi-reader - - - - + + jcore-xmi-writer - - - - + + jedis-parent - - - + + jcore-jedis-integration-tests - - - - + + + jcore-mmax2-reader + - - - - + + - - - - + + scm:git:https://github.com/JULIELab/jcore-base - - - - + + scm:git:https://github.com/JULIELab/jcore-base - - - - + + scm:git:https://github.com/JULIELab/jcore-base - - - - + + - - - + From 2d9b2fd84aaad1df7b1614194600920d7052b7c4 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 20 Apr 2022 12:07:54 +0200 Subject: [PATCH 182/269] Add a test for the correct handling of multiple gene IDs per gene mention. 
--- .../jcore/reader/BioCCasPopulatorTest.java | 24 +++- .../resources/multipleGeneIdsDocument.xml | 136 ++++++++++++++++++ 2 files changed, 159 insertions(+), 1 deletion(-) create mode 100644 jcore-gnp-bioc-reader/src/test/resources/multipleGeneIdsDocument.xml diff --git a/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java index b93ad6c46..e1ffdc7e4 100644 --- a/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java +++ b/jcore-gnp-bioc-reader/src/test/java/de/julielab/jcore/reader/BioCCasPopulatorTest.java @@ -4,6 +4,7 @@ import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; import org.assertj.core.api.Condition; import org.junit.jupiter.api.Test; @@ -62,7 +63,7 @@ public void populateWithNextDocument() throws Exception { @Test public void addFamilyNames() throws Exception { - BioCCasPopulator bioCCasPopulator = new BioCCasPopulator(Path.of("src", "test", "resources","bioc_collection_0_0.xml"), null, null); + BioCCasPopulator bioCCasPopulator = new BioCCasPopulator(Path.of("src", "test", "resources", "bioc_collection_0_0.xml"), null, null); JCas jCas = getJCas(); bioCCasPopulator.populateWithNextDocument(jCas); @@ -75,4 +76,25 @@ public void addFamilyNames() throws Exception { } } } + + @Test + public void multipleGeneIds() throws Exception { + // Check that gene mentions with multiple IDs (enumerations, alternatives, ranges...) 
result in multiple ResourceEntries in a Gene annotation + BioCCasPopulator bioCCasPopulator = new BioCCasPopulator(Path.of("src", "test", "resources", "multipleGeneIdsDocument.xml"), null, null); + JCas jCas = getJCas(); + bioCCasPopulator.populateWithNextDocument(jCas); + + Collection genes = JCasUtil.select(jCas, Gene.class); + boolean multipleIdGeneFound = false; + for (Gene o : genes) { + if (o.getBegin() == 805) { + multipleIdGeneFound = true; + FSArray resourceEntryList = o.getResourceEntryList(); + assertThat(resourceEntryList).hasSize(2); + assertThat(o.getResourceEntryList(0).getEntryId()).isEqualTo("12519"); + assertThat(o.getResourceEntryList(1).getEntryId()).isEqualTo("12524"); + } + } + assertThat(multipleIdGeneFound).isTrue(); + } } \ No newline at end of file diff --git a/jcore-gnp-bioc-reader/src/test/resources/multipleGeneIdsDocument.xml b/jcore-gnp-bioc-reader/src/test/resources/multipleGeneIdsDocument.xml new file mode 100644 index 000000000..1a26ceb19 --- /dev/null +++ b/jcore-gnp-bioc-reader/src/test/resources/multipleGeneIdsDocument.xml @@ -0,0 +1,136 @@ + + + + JCoRe GNormPlus BioC Writer + Wed Mar 02 14:58:28 CET 2022 + PubTator.key + + 16177354 + + title + 0 + Cellular mechanisms of the adjuvant activity of the flagellin component FljB of Salmonella enterica + Serovar Typhimurium to potentiate mucosal and systemic responses. + + + NCBITaxonomyID:90371 + FamilyName + + flagellin + + + 90371 + Species + + Salmonella enterica Serovar Typhimurium + + + + abstract + 166 + An expanding area of interest is the utilization of microbe-based components to augment mucosal and + systemic immune responses to target antigens. Thus, the aim of the present study was to assess if the + flagellin component FljB from Salmonella enterica serovar Typhimurium could act as a mucosal adjuvant + and then to determine the cellular mechanism(s) by which FljB mediates its adjuvant properties. 
To + determine if FljB could act as a mucosal adjuvant, mice were immunized by the intranasal (i.n.) route + with antigen alone or in conjunction with FljB. Additionally, we assessed how FljB affected the levels + of the costimulatory molecules B7-1 and B7-2 on dendritic cells by flow cytometry and determined the + functional role these costimulatory molecules played in the adjuvant properties of FljB in vivo. Mice + immunized by the i.n. route with antigen and FljB exhibited significantly elevated levels of mucosal and + systemic antibody and CD4(+)-T-cell responses compared to mice given antigen only. Stimulation of + dendritic cells in vitro with FljB resulted in a pronounced increase in the surface expression of B7-1 + and B7-2. The percentage of dendritic cells expressing B7-2 but not B7-1 increased significantly when + stimulated with FljB over a concentration range of 10 to 10,000 ng/ml. Immunization of wild-type and + B7-1, B7-2, and B7-1/2 knockout mice by the i.n. route revealed that the ability of FljB to increase + B7-2 expression is largely responsible for its adjuvant effect in vivo. These findings demonstrate that + FljB can act as an effective mucosal adjuvant and that its ability to enhance the level of B7-2 + expression is predominantly responsible for its adjuvant properties. 
+ + + 12519;12524 + Gene + + B7-1 and B7-2 + + + 12519;12524 + Gene + + B7-1 and B7-2 + + + 12519;12524 + Gene + + B7-1/2 + + + 12524 + Gene + + B7-2 + + + 12524 + Gene + + B7-2 + + + 12524 + Gene + + B7-2 + + + 12519 + Gene + + B7-1 + + + 12519 + Gene + + B7-1 + + + 12524 + Gene + + B7-2 + + + NCBITaxonomyID:90371 + FamilyName + + flagellin + + + 90371 + Species + + Salmonella enterica serovar Typhimurium + + + 10090 + Species + + mice + + + 10090 + Species + + Mice + + + 10090 + Species + + mice + + + + \ No newline at end of file From d716980a843cce6412d868373f339977b42cdf83 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 20 Apr 2022 12:13:21 +0200 Subject: [PATCH 183/269] Add component.meta file for the jcore-mmax2-reader. --- jcore-mmax2-reader/component.meta | 20 ++ .../mapper/StructuredAbstractParser.java | 176 +++++++++--------- 2 files changed, 109 insertions(+), 87 deletions(-) create mode 100644 jcore-mmax2-reader/component.meta diff --git a/jcore-mmax2-reader/component.meta b/jcore-mmax2-reader/component.meta new file mode 100644 index 000000000..386acc60b --- /dev/null +++ b/jcore-mmax2-reader/component.meta @@ -0,0 +1,20 @@ +{ + "categories": [ + "reader" + ], + "description": "Collection reader for MMAX2 annotation projects.", + "descriptors": [ + { + "category": "reader", + "location": "de.julielab.jcore.cr.mmax2.desc.jcore-mmax2-reader" + } + ], + "exposable": true, + "group": "general", + "maven-artifact": { + "artifactId": "jcore-mmax2-reader", + "groupId": "de.julielab", + "version": "2.6.0-SNAPSHOT" + }, + "name": "JCoRe MMAX2 reader." 
+} diff --git a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/StructuredAbstractParser.java b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/StructuredAbstractParser.java index 5881ab36a..08f79a85d 100644 --- a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/StructuredAbstractParser.java +++ b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/StructuredAbstractParser.java @@ -23,106 +23,108 @@ * component, if required.
* NOTE: Using this parser, the AbstractText annotation is already * created and should not be set in the mapping file. - * + * * @author faessler - * */ public class StructuredAbstractParser implements DocumentTextPartParser { - private static final boolean newlineBetweenSections = true; + private static final boolean newlineBetweenSections = true; - public List parseDocumentPart(VTDNav vn, PartOfDocument docTextPart, int offset, JCas jCas, - byte[] identifier) { - String baseXPath = docTextPart.getXPath(); + public List parseDocumentPart(VTDNav vn, PartOfDocument docTextPart, int offset, JCas jCas, + byte[] identifier) { + String baseXPath = docTextPart.getXPath(); - List> fields = new ArrayList<>(); - Map field = new HashMap<>(); - field.put(JulieXMLConstants.NAME, "Label"); - field.put(JulieXMLConstants.XPATH, "@Label"); - fields.add(field); + List> fields = new ArrayList<>(); + Map field = new HashMap<>(); + field.put(JulieXMLConstants.NAME, "Label"); + field.put(JulieXMLConstants.XPATH, "@Label"); + fields.add(field); - field = new HashMap<>(); - field.put(JulieXMLConstants.NAME, "NlmCategory"); - field.put(JulieXMLConstants.XPATH, "@NlmCategory"); - fields.add(field); + field = new HashMap<>(); + field.put(JulieXMLConstants.NAME, "NlmCategory"); + field.put(JulieXMLConstants.XPATH, "@NlmCategory"); + fields.add(field); - field = new HashMap<>(); - field.put(JulieXMLConstants.NAME, "AbstractText"); - field.put(JulieXMLConstants.XPATH, "."); - fields.add(field); - Iterator> rowIterator = JulieXMLTools.constructRowIterator(vn, baseXPath + "/AbstractText", - fields, new String(identifier)); - List abstractParts = new ArrayList<>(); - // for the text contents - StringBuilder sb = new StringBuilder(); + field = new HashMap<>(); + field.put(JulieXMLConstants.NAME, "AbstractText"); + field.put(JulieXMLConstants.XPATH, "."); + fields.add(field); + Iterator> rowIterator = JulieXMLTools.constructRowIterator(vn, baseXPath + "/AbstractText", + fields, new 
String(identifier)); + List abstractParts = new ArrayList<>(); + // for the text contents + StringBuilder sb = new StringBuilder(); - int sectionOffset = offset; - while (rowIterator.hasNext()) { - Map abstractSectionData = rowIterator.next(); - String label = (String) abstractSectionData.get("Label"); - String nlmCategory = (String) abstractSectionData.get("NlmCategory"); - String abstractSectionText = (String) abstractSectionData.get("AbstractText"); - if (newlineBetweenSections) { - // in case the last section was empty, we delete the trailing - // newline - if (sb.length() > 0 && StringUtils.isBlank(abstractSectionText)) { - sb.deleteCharAt(sb.length() - 1); - --sectionOffset; - } - } - sb.append(abstractSectionText); + int sectionOffset = offset; + while (rowIterator.hasNext()) { + Map abstractSectionData = rowIterator.next(); + String label = (String) abstractSectionData.get("Label"); + String nlmCategory = (String) abstractSectionData.get("NlmCategory"); + String abstractSectionText = (String) abstractSectionData.get("AbstractText"); + if (newlineBetweenSections) { + // in case the last section was empty, we delete the trailing + // newline + if (sb.length() > 0 && StringUtils.isBlank(abstractSectionText)) { + sb.deleteCharAt(sb.length() - 1); + --sectionOffset; + } + } + // comment in to add the structured abstract section labels to the text, e.g. "AIMS: ...", "BACKGROUND: ..." 
+ if (null != label && !"unlabelled".equalsIgnoreCase(label)) + sb.append(label).append(": "); + sb.append(abstractSectionText); - // if label and nlmCategory are null, there is no section heading; - // most probably this just isn't a structured abstract - if (null != label || null != nlmCategory) { - AbstractSectionHeading abstractPartHeading = new AbstractSectionHeading(jCas); - abstractPartHeading.setLabel(label); - abstractPartHeading.setNlmCategory(nlmCategory); - abstractPartHeading.setTitleType("abstractSection"); - abstractPartHeading.addToIndexes(); + // if label and nlmCategory are null, there is no section heading; + // most probably this just isn't a structured abstract + if (null != label || null != nlmCategory) { + AbstractSectionHeading abstractPartHeading = new AbstractSectionHeading(jCas); + abstractPartHeading.setLabel(label); + abstractPartHeading.setNlmCategory(nlmCategory); + abstractPartHeading.setTitleType("abstractSection"); + abstractPartHeading.addToIndexes(); - AbstractSection abstractPart = new AbstractSection(jCas); - abstractPart.setBegin(sectionOffset); - sectionOffset += abstractSectionText.length(); - abstractPart.setEnd(sectionOffset); - abstractPart.setAbstractSectionHeading(abstractPartHeading); - abstractPart.addToIndexes(); + AbstractSection abstractPart = new AbstractSection(jCas); + abstractPart.setBegin(sectionOffset); + sectionOffset += abstractSectionText.length(); + abstractPart.setEnd(sectionOffset); + abstractPart.setAbstractSectionHeading(abstractPartHeading); + abstractPart.addToIndexes(); - abstractParts.add(abstractPart); - } else { - sectionOffset += abstractSectionText.length(); - } + abstractParts.add(abstractPart); + } else { + sectionOffset += abstractSectionText.length(); + } - // let's insert a line break after each section text - if (newlineBetweenSections && sb.length() > 0 && rowIterator.hasNext()) { - sb.append("\n"); - ++sectionOffset; - } - } + // let's insert a line break after each section text + if 
(newlineBetweenSections && sb.length() > 0 && rowIterator.hasNext()) { + sb.append("\n"); + ++sectionOffset; + } + } - // only create an abstract annotation if there actually is an abstract - if (!abstractParts.isEmpty() || sectionOffset > offset) { - if (sectionOffset == offset) { - // there was no abstract but just empty abstract sections; decrement the offsets so we stay with existing document text - --offset; - --sectionOffset; - for (AbstractSection section : abstractParts) { - section.setBegin(offset); - section.setEnd(offset); - } - } - AbstractText abstractText = new AbstractText(jCas, offset, sectionOffset); - abstractText.setAbstractType("main"); - if (abstractParts.size() > 0) { - FSArray sectionsArray = new FSArray(jCas, abstractParts.size()); - for (int i = 0; i < abstractParts.size(); ++i) - sectionsArray.set(i, abstractParts.get(i)); - abstractText.setStructuredAbstractParts(sectionsArray); - } - abstractText.addToIndexes(); - return Collections.singletonList(sb.toString()); - } - return Collections.emptyList(); - } + // only create an abstract annotation if there actually is an abstract + if (!abstractParts.isEmpty() || sectionOffset > offset) { + if (sectionOffset == offset) { + // there was no abstract but just empty abstract sections; decrement the offsets so we stay with existing document text + --offset; + --sectionOffset; + for (AbstractSection section : abstractParts) { + section.setBegin(offset); + section.setEnd(offset); + } + } + AbstractText abstractText = new AbstractText(jCas, offset, sectionOffset); + abstractText.setAbstractType("main"); + if (abstractParts.size() > 0) { + FSArray sectionsArray = new FSArray(jCas, abstractParts.size()); + for (int i = 0; i < abstractParts.size(); ++i) + sectionsArray.set(i, abstractParts.get(i)); + abstractText.setStructuredAbstractParts(sectionsArray); + } + abstractText.addToIndexes(); + return Collections.singletonList(sb.toString()); + } + return Collections.emptyList(); + } } From 
90ac8b499941ac28daf7e83dd8933eaf9ea490f4 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 20 Apr 2022 13:16:36 +0200 Subject: [PATCH 184/269] Bump neo4j plugins version. --- jcore-neo4j-relations-consumer/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-neo4j-relations-consumer/pom.xml b/jcore-neo4j-relations-consumer/pom.xml index dca3293f1..3c5d394e3 100644 --- a/jcore-neo4j-relations-consumer/pom.xml +++ b/jcore-neo4j-relations-consumer/pom.xml @@ -53,7 +53,7 @@ de.julielab julielab-neo4j-plugins-concepts - 3.0.1-SNAPSHOT + 3.1.0-SNAPSHOT test From bc1024a4f1d6864426af3b1dacfb63d720267581 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 20 Apr 2022 13:44:23 +0200 Subject: [PATCH 185/269] Add IDs to sentences. --- jcore-mmax2-reader/component.meta | 2 +- .../main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java | 7 ++++++- .../java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java | 7 +++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/jcore-mmax2-reader/component.meta b/jcore-mmax2-reader/component.meta index 386acc60b..d0e5293fb 100644 --- a/jcore-mmax2-reader/component.meta +++ b/jcore-mmax2-reader/component.meta @@ -16,5 +16,5 @@ "groupId": "de.julielab", "version": "2.6.0-SNAPSHOT" }, - "name": "JCoRe MMAX2 reader." 
+ "name": "JCoRe MMAX2 reader" } diff --git a/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java b/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java index fa09f4c69..ac4e78c59 100644 --- a/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java +++ b/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java @@ -2,6 +2,7 @@ import de.julielab.jcore.types.ConceptMention; import de.julielab.jcore.types.Gene; +import de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; import de.julielab.jcore.utility.JCoReAnnotationTools; import de.julielab.jules.mmax.MarkableContainer; @@ -29,7 +30,7 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; -@ResourceMetaData(name = "JCoRe MMAX2 reader.", description = "Collection reader for MMAX2 annotation projects.", vendor = "JULIE Lab Jena, Germany") +@ResourceMetaData(name = "JCoRe MMAX2 reader", description = "Collection reader for MMAX2 annotation projects.", vendor = "JULIE Lab Jena, Germany") public class MMAX2Reader extends JCasCollectionReader_ImplBase { public static final String PARAM_INPUT_DIR = "InputDir"; @@ -211,6 +212,7 @@ private void produceOutput(MMAX2Discourse discourse, WordInformation[] words, JC } for (int i = 0; i < annotationLevels.length; ++i) { Iterator iterator = discourse.getMarkableLevelByName(annotationLevels[i], false).getMarkables().stream().map(Markable.class::cast).filter(Predicate.not(Markable::isDiscontinuous)).iterator(); + int id = 0; while (iterator.hasNext()) { Markable markable = iterator.next(); int beginPosition = markable.getLeftmostDiscoursePosition(); @@ -227,7 +229,10 @@ private void produceOutput(MMAX2Discourse discourse, WordInformation[] words, JC a.setEnd(endOffset); if (a instanceof ConceptMention) ((ConceptMention) a).setSpecificType(markable.getAttributeValue(markable.getMarkableLevelName())); + else if (a instanceof Sentence) + 
((Sentence)a).setId(String.valueOf(id)); a.addToIndexes(); + ++id; } } for (WordInformation word : words) { diff --git a/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java b/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java index 410b42ed1..9f8c59f56 100644 --- a/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java +++ b/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java @@ -30,13 +30,20 @@ public void testReader() throws Exception { MMAX2Reader.PARAM_UIMA_ANNOTATION_TYPES, new String[]{"de.julielab.jcore.types.Protein", "de.julielab.jcore.types.Sentence"}); assertThat(reader.hasNext()).isTrue(); reader.getNext(jCas.getCas()); + // the text should be tokenized because we did not provide the original text assertThat(jCas.getDocumentText()).startsWith("Characterization of antihuman IFNAR-1 monoclonal antibodies : epitope localization and functional analysis ."); Collection proteins = JCasUtil.select(jCas, Protein.class); assertThat(proteins).hasSize(16); + assertThat(proteins).map(Protein::getCoveredText).contains("IFNAR-1", "type I interferon receptor", "HuIFNAR-1", "Stat"); Collection sentences = JCasUtil.select(jCas, Sentence.class); assertThat(sentences).hasSize(10); + + assertThat(proteins).extracting(Protein::getSpecificType).filteredOn(type -> type.equals("protein")).hasSize(13); + assertThat(proteins).extracting(Protein::getSpecificType).filteredOn(type -> type.equals("protein_complex")).hasSize(2); + assertThat(proteins).extracting(Protein::getSpecificType).filteredOn(type -> type.equals("protein_familiy_or_group")).hasSize(1); + Collection tokens = JCasUtil.select(jCas, Token.class); // check a small sample of tokens that should have been created assertThat(tokens).map(Token::getCoveredText).contains("Characterization", "IFNAR-1", ":", "(", "subunits", "recognition", ".", "HuIFNAR-1"); From f6a8ede41f602e7c68c332cdadfea0ffc72b6d05 Mon Sep 
17 00:00:00 2001 From: khituras Date: Wed, 20 Apr 2022 13:44:31 +0200 Subject: [PATCH 186/269] Add required dependency. --- jcore-bc2gmformat-writer/pom.xml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/jcore-bc2gmformat-writer/pom.xml b/jcore-bc2gmformat-writer/pom.xml index 37c5a1de0..75acd4004 100644 --- a/jcore-bc2gmformat-writer/pom.xml +++ b/jcore-bc2gmformat-writer/pom.xml @@ -36,6 +36,10 @@ jcore-utilities ${jcore-utilities-version} + + de.julielab + julielab-java-utilities + de.julielab jcore-descriptor-creator From fd57ed03176aebf3d149908e75eb374af3efe98f Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 20 Apr 2022 15:47:47 +0200 Subject: [PATCH 187/269] Fix an issue where the sentence ID column required the document ID column to be output. --- .../EntityEvaluatorConsumer.java | 14 ++++---- .../EntityEvaluatorConsumerTest.java | 36 +++++++++++++++++++ 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java index bffd2311d..16ad3fff3 100644 --- a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java +++ b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java @@ -157,13 +157,11 @@ private void addOffsetsColumn(JCas aJCas) { } private void addDocumentIdColumn(JCas aJCas) throws CASException { - if (outputColumnNames.contains(DOCUMENT_ID_COLUMN)) { - Column c = columns.get(DOCUMENT_ID_COLUMN); - if (c == null) - c = new Column(DOCUMENT_ID_COLUMN + ":" + Header.class.getCanonicalName() + "=/docId", null, aJCas.getTypeSystem()); - c = new DocumentIdColumn(c); - columns.put(DOCUMENT_ID_COLUMN, c); - } + Column c = columns.get(DOCUMENT_ID_COLUMN); + if 
(c == null) + c = new Column(DOCUMENT_ID_COLUMN + ":" + Header.class.getCanonicalName() + "=/docId", null, aJCas.getTypeSystem()); + c = new DocumentIdColumn(c); + columns.put(DOCUMENT_ID_COLUMN, c); } private void addDocumentTextSha256Column() { @@ -183,7 +181,7 @@ private void addSentenceIdColumn(JCas aJCas) throws CASException { Column docIdColumn = columns.get(DOCUMENT_ID_COLUMN); String documentId = null; if (docIdColumn != null) - documentId = docIdColumn.getValue(aJCas.getDocumentAnnotationFs(), aJCas).getFirst(); + documentId = docIdColumn.getValue(null, aJCas).getFirst(); Type sentenceType = c.getSingleType(); // put all sentences into an index with an // overlap-comparator - this way the index can be diff --git a/jcore-julielab-entity-evaluator-consumer/src/test/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumerTest.java b/jcore-julielab-entity-evaluator-consumer/src/test/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumerTest.java index ca29657b9..b0589b592 100644 --- a/jcore-julielab-entity-evaluator-consumer/src/test/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumerTest.java +++ b/jcore-julielab-entity-evaluator-consumer/src/test/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumerTest.java @@ -75,6 +75,42 @@ public void testEntityEvaluatorConsumerSingleEntity() throws Exception { assertEquals("document1 document1:0 23 gene", lines.get(0)); } + @Test + public void testEntityEvaluatorConsumerSingleEntity2() throws Exception { + // The same test as above but minus the DocumentId column + JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types", + "de.julielab.jcore.types.jcore-semantics-biology-types", + "de.julielab.jcore.types.jcore-document-meta-types"); + AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class, + PARAM_COLUMN_DEFINITIONS, + new String[] { "geneid:Gene=/resourceEntryList[0]/entryId", 
"name:/:coveredText()" }, + // We here use the default SentenceId column, we did not provide a definition! + PARAM_OUTPUT_COLUMNS, new String[] { SENTENCE_ID_COLUMN, "geneid", "name" }, + PARAM_TYPE_PREFIX, "de.julielab.jcore.types", PARAM_OUTPUT_FILE, "src/test/resources/outfile-test.tsv"); + + jcas.setDocumentText("One gene one sentence."); + Header h = new Header(jcas); + h.setDocId("document1"); + h.addToIndexes(); + Sentence s = new Sentence(jcas, 0, jcas.getDocumentText().length()); + s.setId("sentence1"); + s.addToIndexes(); + Gene g = new Gene(jcas, 4, 8); + GeneResourceEntry re = new GeneResourceEntry(jcas); + re.setEntryId("23"); + FSArray array = new FSArray(jcas, 1); + array.set(0, re); + g.setResourceEntryList(array); + g.addToIndexes(); + + consumer.process(jcas.getCas()); + consumer.collectionProcessComplete(); + + List lines = Files.readLines(new File("src/test/resources/outfile-test.tsv"), Charset.forName("UTF-8")); + assertEquals(1, lines.size()); + assertEquals("document1:0 23 gene", lines.get(0)); + } + @Test public void testEntityEvaluatorConsumerNoEntities() throws Exception { JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types", From 49be6c89573ce03a1360f5a64cb086350c8844d7 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 20 Apr 2022 16:49:37 +0200 Subject: [PATCH 188/269] Add a detailed error message for missing sentences in the JCoReEntityDataset. 
--- .../de/julielab/jcore/banner/dataset/JCoReEntityDataset.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/jcore-banner-ae/src/main/java/de/julielab/jcore/banner/dataset/JCoReEntityDataset.java b/jcore-banner-ae/src/main/java/de/julielab/jcore/banner/dataset/JCoReEntityDataset.java index 1db2578c9..f22ed040c 100644 --- a/jcore-banner-ae/src/main/java/de/julielab/jcore/banner/dataset/JCoReEntityDataset.java +++ b/jcore-banner-ae/src/main/java/de/julielab/jcore/banner/dataset/JCoReEntityDataset.java @@ -65,6 +65,8 @@ public void load(File sentenceFile, File mentionsFile) { mentReader.lines().forEach(ml -> { String[] split = ml.split("\\t"); Sentence sentence = sentences.get(split[0]); + if (sentence == null) + throw new IllegalStateException("The gene mention '" + ml + "' is associated with sentence ID '" + split[0] + "' but such a sentence ID was not found in " + sentenceFile.getAbsolutePath()); int begin = Integer.parseInt(split[1]); int end = Integer.parseInt(split[2]); EntityType label = EntityType.getType(split[3]); From f4a8953adc2dafab4ccf6cccce6c567bc5da6c24 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 20 Apr 2022 16:50:25 +0200 Subject: [PATCH 189/269] Set the default offset scope to documents even when sentence IDs are output.
--- .../jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java index 16ad3fff3..413e8fa87 100644 --- a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java +++ b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java @@ -263,7 +263,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept offsetMode = null == offsetModeStr ? OffsetMode.CharacterSpan : OffsetMode.valueOf(offsetModeStr); if (null == offsetScopeStr) { - offsetScope = outputColumnNames.contains(SENTENCE_ID_COLUMN) ? OffsetScope.Sentence : OffsetScope.Document; + offsetScope = OffsetScope.Document; } else { offsetScope = OffsetScope.valueOf(offsetScopeStr); } From 4dea6a1a10bf2b7142fc9df0e28277a52f636454 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 20 Apr 2022 16:51:10 +0200 Subject: [PATCH 190/269] Revoke the change to add the structured abstract sections to the text. This was only added briefly to re-create the original documents of the ProGene corpus where the headings are part of the texts. 
--- .../reader/xmlmapper/mapper/StructuredAbstractParser.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/StructuredAbstractParser.java b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/StructuredAbstractParser.java index 08f79a85d..ce46c09f6 100644 --- a/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/StructuredAbstractParser.java +++ b/jcore-xml-mapper/src/main/java/de/julielab/jcore/reader/xmlmapper/mapper/StructuredAbstractParser.java @@ -70,8 +70,8 @@ public List parseDocumentPart(VTDNav vn, PartOfDocument docTextPart, int } } // comment in to add the structured abstract section labels to the text, e.g. "AIMS: ...", "BACKGROUND: ..." - if (null != label && !"unlabelled".equalsIgnoreCase(label)) - sb.append(label).append(": "); +// if (null != label && !"unlabelled".equalsIgnoreCase(label)) +// sb.append(label).append(": "); sb.append(abstractSectionText); // if label and nlmCategory are null, there is no section heading; From d311a6a92485cea02a0dfff5d40899b72fce15c3 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 20 Apr 2022 16:51:37 +0200 Subject: [PATCH 191/269] Add a Header annotation containing the document ID. 
--- .../julielab/jcore/cr/mmax2/MMAX2Reader.java | 10 +++--- .../jcore/cr/mmax2/MMAX2ReaderTest.java | 31 +++++++++++++++++-- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java b/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java index ac4e78c59..85634bb56 100644 --- a/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java +++ b/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java @@ -1,9 +1,6 @@ package de.julielab.jcore.cr.mmax2; -import de.julielab.jcore.types.ConceptMention; -import de.julielab.jcore.types.Gene; -import de.julielab.jcore.types.Sentence; -import de.julielab.jcore.types.Token; +import de.julielab.jcore.types.*; import de.julielab.jcore.utility.JCoReAnnotationTools; import de.julielab.jules.mmax.MarkableContainer; import de.julielab.jules.mmax.Statistics; @@ -25,6 +22,7 @@ import org.slf4j.LoggerFactory; import java.io.*; +import java.util.List; import java.util.*; import java.util.function.Predicate; import java.util.stream.Collectors; @@ -249,6 +247,10 @@ else if (a instanceof Sentence) } String textPlain = outPlain.toString(); jCas.setDocumentText(textPlain); + + Header h = new Header(jCas); + h.setDocId(pmid); + h.addToIndexes(); } private void handleOriginalTextInformation(String pmid, WordInformation[] words) throws CollectionException { diff --git a/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java b/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java index 9f8c59f56..65a401a0b 100644 --- a/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java +++ b/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java @@ -1,5 +1,6 @@ package de.julielab.jcore.cr.mmax2; +import de.julielab.jcore.types.Header; import de.julielab.jcore.types.Protein; import 
de.julielab.jcore.types.Sentence; import de.julielab.jcore.types.Token; @@ -23,7 +24,7 @@ public class MMAX2ReaderTest { @Test public void testReader() throws Exception { - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types"); + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); CollectionReader reader = CollectionReaderFactory.createReader("de.julielab.jcore.cr.mmax2.desc.jcore-mmax2-reader", MMAX2Reader.PARAM_INPUT_DIR, Path.of("src", "test", "resources", "input").toString(), MMAX2Reader.PARAM_ANNOTATION_LEVELS, new String[]{"proteins", "sentence"}, @@ -31,6 +32,9 @@ public void testReader() throws Exception { assertThat(reader.hasNext()).isTrue(); reader.getNext(jCas.getCas()); + Header h = JCasUtil.selectSingle(jCas, Header.class); + assertThat(h.getDocId()).isEqualTo("10048764"); + // the text should be tokenized because we did not provide the original text assertThat(jCas.getDocumentText()).startsWith("Characterization of antihuman IFNAR-1 monoclonal antibodies : epitope localization and functional analysis ."); Collection proteins = JCasUtil.select(jCas, Protein.class); @@ -39,6 +43,7 @@ public void testReader() throws Exception { assertThat(proteins).map(Protein::getCoveredText).contains("IFNAR-1", "type I interferon receptor", "HuIFNAR-1", "Stat"); Collection sentences = JCasUtil.select(jCas, Sentence.class); assertThat(sentences).hasSize(10); + assertThat(sentences).extracting(Sentence::getId).containsExactlyInAnyOrder("0", "1", "2", "3", "4", "5", "6", "7", "8", "9"); assertThat(proteins).extracting(Protein::getSpecificType).filteredOn(type -> type.equals("protein")).hasSize(13); assertThat(proteins).extracting(Protein::getSpecificType).filteredOn(type -> type.equals("protein_complex")).hasSize(2); @@ -51,7 +56,7 @@ 
public void testReader() throws Exception { @Test public void testReaderOriginalText() throws Exception { - JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types"); + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); CollectionReader reader = CollectionReaderFactory.createReader("de.julielab.jcore.cr.mmax2.desc.jcore-mmax2-reader", MMAX2Reader.PARAM_INPUT_DIR, Path.of("src", "test", "resources", "input").toString(), MMAX2Reader.PARAM_ORIGINAL_TEXT_FILES, Path.of("src", "test", "resources", "originalText").toString(), @@ -68,4 +73,26 @@ public void testReaderOriginalText() throws Exception { // check a small sample of tokens that should have been created assertThat(tokens).map(Token::getCoveredText).contains("Characterization", "IFNAR-1", ":", "(", "subunits", "recognition", ".", "HuIFNAR-1"); } + + @Test + public void testReader2() throws Exception { + JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types"); + CollectionReader reader = CollectionReaderFactory.createReader("de.julielab.jcore.cr.mmax2.desc.jcore-mmax2-reader", + MMAX2Reader.PARAM_INPUT_DIR, Path.of("src", "test", "resources", "input2").toString(), + MMAX2Reader.PARAM_ANNOTATION_LEVELS, new String[]{"proteins", "sentence"}, + MMAX2Reader.PARAM_UIMA_ANNOTATION_TYPES, new String[]{"de.julielab.jcore.types.Protein", "de.julielab.jcore.types.Sentence"}); + assertThat(reader.hasNext()).isTrue(); + reader.getNext(jCas.getCas()); + + Header h = JCasUtil.selectSingle(jCas, Header.class); + assertThat(h.getDocId()).isEqualTo("10471746"); + + Collection proteins = JCasUtil.select(jCas, Protein.class); + for (var p : proteins) { + 
System.out.println(p.getCoveredText() + ": " + p.getBegin() + "-"+p.getEnd()); + } + Collection sentences = JCasUtil.select(jCas, Sentence.class); + for (var s : sentences) + System.out.println(s.getBegin() + " - " + s.getEnd()); + } } From d2f432628249282b4bc4e486c6ed62b45a71cb1c Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 20 Apr 2022 21:08:53 +0200 Subject: [PATCH 192/269] Add an option to remove overlapping annotations. Take the longest annotation, then. --- .../julielab/jcore/cr/mmax2/MMAX2Reader.java | 45 +++++++++++++++++-- .../jcore/cr/mmax2/MMAX2ReaderTest.java | 33 ++++++++++---- 2 files changed, 67 insertions(+), 11 deletions(-) diff --git a/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java b/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java index 85634bb56..400c752f5 100644 --- a/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java +++ b/jcore-mmax2-reader/src/main/java/de/julielab/jcore/cr/mmax2/MMAX2Reader.java @@ -35,7 +35,10 @@ public class MMAX2Reader extends JCasCollectionReader_ImplBase { public static final String PARAM_ANNOTATION_LEVELS = "AnnotationLevels"; public static final String PARAM_ORIGINAL_TEXT_FILES = "OriginalTextFiles"; public static final String PARAM_UIMA_ANNOTATION_TYPES = "UimaAnnotationTypes"; + public static final String PARAM_REMOVE_OVERLAPPING_SHORTER_ANNOTATIONS = "RemoveOverlappingShorterAnnotations"; private final static Logger log = LoggerFactory.getLogger(MMAX2Reader.class); + @ConfigurationParameter(name = PARAM_REMOVE_OVERLAPPING_SHORTER_ANNOTATIONS, mandatory = false, defaultValue = "false", description = "If set to true, for all overlapping annotations only the longest is kept.") + boolean removeOverlappingShorterAnnotations; @ConfigurationParameter(name = PARAM_INPUT_DIR, description = "Should point to the directory of which the MMAX2 projects are sub directories of.") private String inputDir; @ConfigurationParameter(name = 
PARAM_ANNOTATION_LEVELS, description = "The names of the MMAX2 annotation levels to create annotations for.") @@ -44,7 +47,6 @@ public class MMAX2Reader extends JCasCollectionReader_ImplBase { private String[] uimaTypeNames; @ConfigurationParameter(name = PARAM_ORIGINAL_TEXT_FILES, mandatory = false, description = "The MMAX2 base data consists of tokenized text and does not keep track of the original text. This parameter should point to a directory containing the original text files. The file names should match the MMAX2 project IDs.") private String originalTextFilesDir; - private LinkedList folderList; private String actualPath; private HashMap levels2uimaNames; @@ -62,6 +64,7 @@ public void initialize(UimaContext context) throws ResourceInitializationExcepti annotationLevels = (String[]) context.getConfigParameterValue(PARAM_ANNOTATION_LEVELS); uimaTypeNames = (String[]) getUimaContext().getConfigParameterValue(PARAM_UIMA_ANNOTATION_TYPES); originalTextFilesDir = (String) context.getConfigParameterValue(PARAM_ORIGINAL_TEXT_FILES); + removeOverlappingShorterAnnotations = Optional.ofNullable((Boolean) context.getConfigParameterValue(PARAM_REMOVE_OVERLAPPING_SHORTER_ANNOTATIONS)).orElse(false); actualPath = null; if (annotationLevels.length != uimaTypeNames.length) throw new IllegalArgumentException("The number of annotation levels and the number of UIMA type names must match. 
But the given annotation levels are '" + Arrays.toString(annotationLevels) + "' and the UIMA types names are '" + Arrays.toString(uimaTypeNames) + "'."); @@ -208,8 +211,9 @@ private void produceOutput(MMAX2Discourse discourse, WordInformation[] words, JC outPlain.append(" "); } } + Set ignoredMarkables = getIgnoredMarkables(discourse); for (int i = 0; i < annotationLevels.length; ++i) { - Iterator iterator = discourse.getMarkableLevelByName(annotationLevels[i], false).getMarkables().stream().map(Markable.class::cast).filter(Predicate.not(Markable::isDiscontinuous)).iterator(); + Iterator iterator = discourse.getMarkableLevelByName(annotationLevels[i], false).getMarkables().stream().map(Markable.class::cast).filter(Predicate.not(ignoredMarkables::contains)).filter(Predicate.not(Markable::isDiscontinuous)).iterator(); int id = 0; while (iterator.hasNext()) { Markable markable = iterator.next(); @@ -228,7 +232,7 @@ private void produceOutput(MMAX2Discourse discourse, WordInformation[] words, JC if (a instanceof ConceptMention) ((ConceptMention) a).setSpecificType(markable.getAttributeValue(markable.getMarkableLevelName())); else if (a instanceof Sentence) - ((Sentence)a).setId(String.valueOf(id)); + ((Sentence) a).setId(String.valueOf(id)); a.addToIndexes(); ++id; } @@ -253,6 +257,41 @@ else if (a instanceof Sentence) h.addToIndexes(); } + private Set getIgnoredMarkables(MMAX2Discourse discourse) { + if (!removeOverlappingShorterAnnotations) + return Collections.emptySet(); + Set toIgnore = new HashSet<>(); + for (int i = 0; i < annotationLevels.length; ++i) { + Map> markablesByPos = new HashMap<>(); + Iterator iterator = discourse.getMarkableLevelByName(annotationLevels[i], false).getMarkables().stream().map(Markable.class::cast).filter(Predicate.not(Markable::isDiscontinuous)).iterator(); + while (iterator.hasNext()) { + Markable markable = iterator.next(); + // associate the markable with all the word indices it covers + 
IntStream.rangeClosed(markable.getLeftmostDiscoursePosition(), markable.getRightmostDiscoursePosition()).forEach(j -> markablesByPos.compute(j, (k, v) -> v != null ? v : new HashSet<>()).add(markable)); + } + // now, for each word index, keep only the longest markable + for (Integer pos : markablesByPos.keySet()) { + Set markables = markablesByPos.get(pos); + if (markables.size() > 1) { + int maxSize = 0; + Markable longestMarkable = null; + for (Markable markable : markables) { + // first, we just add all markables to ignore + toIgnore.add(markable); + int markableLength = markable.getRightmostDiscoursePosition() - markable.getLeftmostDiscoursePosition() + 1; + if (markableLength > maxSize) { + maxSize = markableLength; + longestMarkable = markable; + } + } + // now remove only the longest markable - that we want to keep - from the set of ignores markables + toIgnore.remove(longestMarkable); + } + } + } + return toIgnore; + } + private void handleOriginalTextInformation(String pmid, WordInformation[] words) throws CollectionException { if (originalTextFilesDir.length() > 0 && !originalTextFilesDir.endsWith("/")) originalTextFilesDir += File.separator; diff --git a/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java b/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java index 65a401a0b..79b9bfb11 100644 --- a/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java +++ b/jcore-mmax2-reader/src/test/java/de/julielab/jcore/cr/mmax2/MMAX2ReaderTest.java @@ -15,6 +15,7 @@ import java.util.Collection; import static org.assertj.core.api.Assertions.assertThat; + /** * Unit tests for jcore-mmax2-reader. 
* @@ -51,7 +52,7 @@ public void testReader() throws Exception { Collection tokens = JCasUtil.select(jCas, Token.class); // check a small sample of tokens that should have been created - assertThat(tokens).map(Token::getCoveredText).contains("Characterization", "IFNAR-1", ":", "(", "subunits", "recognition", ".", "HuIFNAR-1"); + assertThat(tokens).map(Token::getCoveredText).contains("Characterization", "IFNAR-1", ":", "(", "subunits", "recognition", ".", "HuIFNAR-1"); } @Test @@ -85,14 +86,30 @@ public void testReader2() throws Exception { reader.getNext(jCas.getCas()); Header h = JCasUtil.selectSingle(jCas, Header.class); - assertThat(h.getDocId()).isEqualTo("10471746"); + assertThat(h.getDocId()).isEqualTo("14731280"); Collection proteins = JCasUtil.select(jCas, Protein.class); - for (var p : proteins) { - System.out.println(p.getCoveredText() + ": " + p.getBegin() + "-"+p.getEnd()); - } - Collection sentences = JCasUtil.select(jCas, Sentence.class); - for (var s : sentences) - System.out.println(s.getBegin() + " - " + s.getEnd()); + // there is this one protein seemingly annotated double; while this is more of an error than the real case + // to handle, it was responsible for errors and works for a simple test + long overlappingProteinCount = proteins.stream().filter(p -> p.getBegin() == 95 && p.getEnd() == 99).count(); + assertThat(overlappingProteinCount).isEqualTo(2); + + // now activate the parameter to avoid overlapping annotations + jCas.reset(); + reader = CollectionReaderFactory.createReader("de.julielab.jcore.cr.mmax2.desc.jcore-mmax2-reader", + MMAX2Reader.PARAM_INPUT_DIR, Path.of("src", "test", "resources", "input2").toString(), + MMAX2Reader.PARAM_ANNOTATION_LEVELS, new String[]{"proteins", "sentence"}, + MMAX2Reader.PARAM_UIMA_ANNOTATION_TYPES, new String[]{"de.julielab.jcore.types.Protein", "de.julielab.jcore.types.Sentence"}, + MMAX2Reader.PARAM_REMOVE_OVERLAPPING_SHORTER_ANNOTATIONS, true); + assertThat(reader.hasNext()).isTrue(); + 
reader.getNext(jCas.getCas()); + + + proteins = JCasUtil.select(jCas, Protein.class); + // there shouldn't be an overlap any more + overlappingProteinCount = proteins.stream().filter(p -> p.getBegin() == 95 && p.getEnd() == 99).count(); + assertThat(overlappingProteinCount).isEqualTo(1); } + + } From abccb002d8b3cbce471a97cf7ae0a89eb98fd055 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 21 Apr 2022 08:11:44 +0200 Subject: [PATCH 193/269] Add test data to git. --- .../cr/mmax2/desc/jcore-mmax2-reader.xml | 18 +- .../resources/input2/mmax_23647/Basedata.uri | 1 + .../input2/mmax_23647/Basedata/Basedata.xml | 299 ++++++++++++++++++ .../input2/mmax_23647/Basedata/words.dtd | 3 + .../mmax_23647/Customizations/proteins.xml | 72 +++++ .../mmax_23647/Customizations/sentence.xml | 3 + .../input2/mmax_23647/Markables/markables.dtd | 2 + .../input2/mmax_23647/Markables/proteins.xml | 29 ++ .../input2/mmax_23647/Markables/sentence.xml | 11 + .../input2/mmax_23647/Schemes/proteins.xml | 16 + .../input2/mmax_23647/Schemes/sentence.xml | 3 + .../mmax_23647/Styles/default_style.xsl | 58 ++++ .../input2/mmax_23647/common_paths.xml | 17 + .../resources/input2/mmax_23647/project.mmax | 7 + 14 files changed, 537 insertions(+), 2 deletions(-) create mode 100644 jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata.uri create mode 100644 jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata/Basedata.xml create mode 100644 jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata/words.dtd create mode 100644 jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Customizations/proteins.xml create mode 100644 jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Customizations/sentence.xml create mode 100644 jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/markables.dtd create mode 100644 jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/proteins.xml create mode 100644 
jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/sentence.xml create mode 100644 jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Schemes/proteins.xml create mode 100644 jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Schemes/sentence.xml create mode 100644 jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Styles/default_style.xsl create mode 100644 jcore-mmax2-reader/src/test/resources/input2/mmax_23647/common_paths.xml create mode 100644 jcore-mmax2-reader/src/test/resources/input2/mmax_23647/project.mmax diff --git a/jcore-mmax2-reader/src/main/resources/de/julielab/jcore/cr/mmax2/desc/jcore-mmax2-reader.xml b/jcore-mmax2-reader/src/main/resources/de/julielab/jcore/cr/mmax2/desc/jcore-mmax2-reader.xml index 8f3289029..6d5978b54 100644 --- a/jcore-mmax2-reader/src/main/resources/de/julielab/jcore/cr/mmax2/desc/jcore-mmax2-reader.xml +++ b/jcore-mmax2-reader/src/main/resources/de/julielab/jcore/cr/mmax2/desc/jcore-mmax2-reader.xml @@ -3,10 +3,17 @@ org.apache.uima.java de.julielab.jcore.cr.mmax2.MMAX2Reader - JCoRe MMAX2 reader. + JCoRe MMAX2 reader Collection reader for MMAX2 annotation projects. JULIE Lab Jena, Germany + + RemoveOverlappingShorterAnnotations + If set to true, for all overlapping annotations only the longest is kept. + Boolean + false + false + InputDir Should point to the directory of which the MMAX2 projects are sub directories of. 
@@ -36,7 +43,14 @@ false - + + + RemoveOverlappingShorterAnnotations + + false + + + diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata.uri b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata.uri new file mode 100644 index 000000000..4e6d1a1f3 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata.uri @@ -0,0 +1 @@ +14731280 diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata/Basedata.xml b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata/Basedata.xml new file mode 100644 index 000000000..90e494de3 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata/Basedata.xml @@ -0,0 +1,299 @@ + + + +Multiple +stress +signal +integration +in +the +regulation +of +the +complex +sigma +S +- +dependent +csiD +- +ygaF +- +gabDTP +operon +in +Escherichia +coli +. +The +csiD +- +ygaF +- +gabDTP +region +in +the +Escherichia +coli +genome +represents +a +cluster +of +sigma +S +- +controlled +genes +. +Here +, +we +investigated +promoter +structures +, +sigma +factor +dependencies +, +potential +co +- +regulation +and +environmental +regulatory +patterns +for +all +of +these +genes +. +We +find +that +this +region +constitutes +a +complex +operon +with +expression +being +controlled +by +three +differentially +regulated +promoters +: +(i) +csiDp +, +which +affects +the +expression +of +all +five +genes +, +is +cAMP +- +CRP +/ +sigma +S +- +dependent +and +activated +exclusively +upon +carbon +starvation +and +stationary +phase +; +(ii) +gabDp1 +, +which +is +sigma +S +- +dependent +and +exhibits +multiple +stress +induction +like +sigma +S +itself +; +and +(iii) +gabDp2 +[ +previously +suggested +by +Schneider +, +B.L. +, +Ruback +, +S. +, +Kiupakis +, +A.K. +, +Kasbarian +, +H. +, +Pybus +, +C. +, +and +Reitzer +, +L. +( +2002 +) +J. +Bacteriol. 
+184 +: +6976-6986 +] +, +which +appears +to +be +Nac +/ +sigma +70 +- +controlled +and +to +respond +to +poor +nitrogen +sources +. +In +addition +, +we +identify +a +novel +repressor +, +CsiR +, +which +modulates +csiDp +activity +in +a +temporal +manner +during +early +stationary +phase +. +Finally +, +we +propose +a +physiological +role +for +sigma +S +- +controlled +GabT +/ +D +- +mediated +gamma-aminobutyrate +( +GABA +) +catabolism +and +glutamate +accumulation +in +general +stress +adaptation +. +This +physiological +role +is +reflected +by +the +activation +of +the +operon +- +internal +gabDp1 +promoter +under +the +different +conditions +that +also +induce +sigma +S +, +which +include +shifts +to +acidic +pH +or +high +osmolarity +as +well +as +starvation +or +stationary +phase +. + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata/words.dtd b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata/words.dtd new file mode 100644 index 000000000..a02b470f1 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Basedata/words.dtd @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Customizations/proteins.xml b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Customizations/proteins.xml new file mode 100644 index 000000000..0f4bd71f8 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Customizations/proteins.xml @@ -0,0 +1,72 @@ + + + + + + + + + + + + + + + + + diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Customizations/sentence.xml b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Customizations/sentence.xml new file mode 100644 index 000000000..6fbf9d136 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Customizations/sentence.xml @@ -0,0 +1,3 @@ + + + diff --git 
a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/markables.dtd b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/markables.dtd new file mode 100644 index 000000000..220e8b3c8 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/markables.dtd @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/proteins.xml b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/proteins.xml new file mode 100644 index 000000000..1a5bd6616 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/proteins.xml @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/sentence.xml b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/sentence.xml new file mode 100644 index 000000000..c35553af7 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Markables/sentence.xml @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Schemes/proteins.xml b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Schemes/proteins.xml new file mode 100644 index 000000000..1045dc27e --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Schemes/proteins.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Schemes/sentence.xml b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Schemes/sentence.xml new file mode 100644 index 000000000..f37fbc936 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Schemes/sentence.xml @@ -0,0 +1,3 @@ + + + diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Styles/default_style.xsl 
b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Styles/default_style.xsl new file mode 100644 index 000000000..ab671aa34 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/Styles/default_style.xsl @@ -0,0 +1,58 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/common_paths.xml b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/common_paths.xml new file mode 100644 index 000000000..8f55971b4 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/common_paths.xml @@ -0,0 +1,17 @@ + + + +Basedata/ +Markables/ +Schemes/ +Styles/ +Customizations/ +default_style.xsl + + +proteins.xml +sentence.xml + + + + \ No newline at end of file diff --git a/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/project.mmax b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/project.mmax new file mode 100644 index 000000000..52fc0b1c1 --- /dev/null +++ b/jcore-mmax2-reader/src/test/resources/input2/mmax_23647/project.mmax @@ -0,0 +1,7 @@ + + + +Basedata.xml + + + \ No newline at end of file From 12664a548e8bbae79da694f1911af45f80db6646 Mon Sep 17 00:00:00 2001 From: khituras Date: Wed, 4 May 2022 11:42:29 +0200 Subject: [PATCH 194/269] Allow the BANNERAnnotator to set the componentId via a parameter. 
--- jcore-banner-ae/pom.xml | 4 ++ .../jcore/ae/banner/BANNERAnnotator.java | 6 ++- .../jcore/ae/banner/desc/jcore-banner-ae.xml | 38 +++++++++++-------- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/jcore-banner-ae/pom.xml b/jcore-banner-ae/pom.xml index ec5a25e53..26e4df8e2 100644 --- a/jcore-banner-ae/pom.xml +++ b/jcore-banner-ae/pom.xml @@ -58,6 +58,10 @@ jcore-mallet-2.0.9 2.1.2 + + de.julielab + jcore-descriptor-creator + de.julielab julielab-java-utilities diff --git a/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java b/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java index a29132d5c..43b29b9fd 100644 --- a/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java +++ b/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java @@ -50,6 +50,7 @@ public class BANNERAnnotator extends JCasAnnotator_ImplBase { public static final String PARAM_CONFIG_FILE = "ConfigFile"; public static final String PARAM_TYPE_MAPPING = "TypeMapping"; + public static final String PARAM_COMPONENT_ID = "ComponentId"; private final static Logger log = LoggerFactory.getLogger(BANNERAnnotator.class); private Tokenizer tokenizer; private DictionaryTagger dictionary; @@ -64,6 +65,8 @@ public class BANNERAnnotator extends JCasAnnotator_ImplBase { private String configFilePath; @ConfigurationParameter(name = PARAM_TYPE_MAPPING, mandatory = false, description = "A list of mappings from entity labels to UIMA types in the form