diff --git a/.github/maven-settings.xml b/.github/maven-settings.xml
new file mode 100644
index 000000000..9c8a6c405
--- /dev/null
+++ b/.github/maven-settings.xml
@@ -0,0 +1,24 @@
+<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"
+          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">
+  <profiles>
+    <profile>
+      <id>sonatype-snapshots</id>
+      <repositories>
+        <repository>
+          <id>sonatype-nexus-snapshots</id>
+          <name>Sonatype Nexus Snapshots</name>
+          <url>https://oss.sonatype.org/content/repositories/snapshots</url>
+          <releases>
+            <enabled>false</enabled>
+          </releases>
+          <snapshots>
+            <enabled>true</enabled>
+          </snapshots>
+        </repository>
+      </repositories>
+    </profile>
+  </profiles>
+  <activeProfiles>
+    <activeProfile>sonatype-snapshots</activeProfile>
+  </activeProfiles>
+</settings>
\ No newline at end of file
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
new file mode 100644
index 000000000..1d065c4e8
--- /dev/null
+++ b/.github/workflows/maven.yml
@@ -0,0 +1,33 @@
+# This workflow will build a Java project with Maven
+# For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven
+
+name: Java CI with Maven
+
+on:
+ push:
+ branches: [ master, v2.6 ]
+ pull_request:
+ branches: [ master, v2.6 ]
+
+jobs:
+ build:
+
+ runs-on: ubuntu-latest
+
+ steps:
+ - name: Set up Python 3.7
+ uses: actions/setup-python@v2
+ with:
+ python-version: 3.7
+ - name: Install python dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install flair==0.11.3
+ - uses: actions/checkout@v2
+ - name: Set up JDK 11
+ uses: actions/setup-java@v2
+ with:
+ java-version: '11'
+ distribution: 'adopt'
+ - name: Build with Maven
+ run: mvn -B package --file pom.xml --settings .github/maven-settings.xml
diff --git a/.gitignore b/.gitignore
index 247d87c61..6da01ef44 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,4 @@ target
**/*.iml
/julie-xml-tools.jar
+/jcore-pmc-db-reader/src/test/resources/hiddenConfig
diff --git a/.travis.yml b/.travis.yml
index 208b0219a..bce762cc1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,8 +6,8 @@ addons:
sources:
- deadsnakes
packages:
- - python3.6
- - python3.6-dev
+ - python3.7
+ - python3.7-dev
env:
global:
@@ -19,7 +19,7 @@ env:
- # GPG_KEY_NAME
- secure: pxYxmA/9xS/9DO6rUAhlbAtYQMmG633jSwG8OIVCnnoQSXS4UILJgNl7Q6dQsAuT27tk+/fin0kXTnxWqCe0URb3c3XgNQwfGAuz1JIYVPHvezoDQLLRQA6LRgqd7GuvBDsyXJvBANozGKJYJVfoeT9gqFosFuMdRZ88eQm+ltX7zVKyMiz2rqKYPoSFInNxDGMOaIQ+RZdf8ai8rLY3E11PxsMC0LgypEDbuC7d9Q+Tu89YfUeuRly0hAuxmW++RrMgeeAs/7BndmZqcHVpkrcX6Drq8nZ2cj0ev4IDJelV/Nd17Vjfg7HgfJ4/d9S+PCg4KhvOY/y9Xad8geIIzXLFD9ZgcaK7MT9+BFGYXj7ExizFSc+Ico5Q822RJA1XZWfc/EgnY+7jEZCCMz/ceHx8oSh0ce1VbPl7c+O+jMXUMQC69Gpys57XC48rdPn0bbjc4/jpSOq46Xv7YdcGuA2BcWEEeQ0WAbi9IDcevpCXiZ7kng5hHTCpfaYVhn63KAIAMKf7tu6C78wFZR63F8Gf4x/jKE37QqvHV3uOzD7ar6nTAuy/ukZK0p4zyeIYe25PnS9K4kpolT1I12i7/l/7MO9NPFdB0aOCBHUNPBEkifwceltX6RP4PDIKdtCEQ4vcqrRNvhtAhO9Vo1udkyaeFx5swbY3j11CjzcfrBE=
- # GPG_PASSPHRASE
- - PYTHON=/usr/bin/python3.6
+ - PYTHON=/usr/bin/python3.7
before_install:
- |
@@ -31,11 +31,11 @@ before_install:
if ! find "$HOME/pip-cache" -mindepth 1 -print -quit 2>/dev/null | grep -q .; then
$PYTHON -m pip download --destination-directory="$HOME/pip-cache" flair
fi
- sudo -H $PYTHON -m pip install --find-links="$HOME/pip-cache" flair==0.4.5
+ sudo -H $PYTHON -m pip install --find-links="$HOME/pip-cache" flair==0.6.1 torch==1.7.1
- #./travis-deployment/install-flair-nightly.sh
- export BOTO_CONFIG=/dev/null
install: mvn install -DskipTests=true -Dmaven.javadoc.skip=true -B -V
-script: mvn -T 1C test -B
+script: mvn -T 2C test -B
cache:
directories:
@@ -51,4 +51,4 @@ deploy:
skip_cleanup: true
on:
all_branches: true
- condition: $TRAVIS_BRANCH =~ ^v2.5|master$
\ No newline at end of file
+ condition: $TRAVIS_BRANCH =~ ^v2.6|master$
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 000000000..7e93520be
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,25 @@
+BSD 2-Clause License
+
+Copyright (c) 2022, JULIE Lab
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
index 9035ccbb1..79c1fbc99 100644
--- a/README.md
+++ b/README.md
@@ -12,24 +12,29 @@ In order to automate the builds of complex NLP pipelines and properly represent
A description for each individual component can be found in their respective `README.md`.
### Requirements & Dependencies
-In order to use our components you need at least [JDK 11](https://www.oracle.com/technetwork/java/javase/downloads/jdk11-downloads-5066655.html) (Java SE Development Kit 11), [UIMA 2.10](https://uima.apache.org/index.html) & [Maven 3](https://maven.apache.org/). We develop with the [Eclipse IDE for Java Developers](http://www.eclipse.org/downloads/) and [IntelliJ IDEA](https://www.jetbrains.com/idea/) Java IDEs. If course you're free to try it with different versions or tools than those mentioned, but we can't make promises for a flawless functioning of our components in these cases.
+In order to use our components you need at least [JDK 11](https://www.oracle.com/technetwork/java/javase/downloads/jdk11-downloads-5066655.html) (Java SE Development Kit 11), [UIMA 2.x](https://uima.apache.org/index.html) & [Maven 3](https://maven.apache.org/). We develop with the [Eclipse IDE for Java Developers](http://www.eclipse.org/downloads/) and [IntelliJ IDEA](https://www.jetbrains.com/idea/) Java IDEs. Of course, you're free to try it with different versions or tools than those mentioned, but we can't make promises for a flawless functioning of our components in these cases.
### UIMA's Collection Processing Engine (CPE)
-UIMA features a relatively easy way to combine UIMA components together in order to analyze a collection of artifacts. If you're not firm or willing to deal with Java Code, the usage of a CPE might be the right choice.
+UIMA offers a relatively easy way to combine UIMA components together in order to analyze a collection of artifacts. If you're not familiar with, or would rather not deal with, Java code, the usage of a CPE might be the right choice.
For more detailed information see [UIMA's CPE Documentation](https://uima.apache.org/downloads/releaseDocs/2.1.0-incubating/docs/html/tutorials_and_users_guides/tutorials_and_users_guides.html#ugr.tug.cpe).
-We're also working on a simple [Python script](https://github.com/JULIELab/jcore-misc/tree/master/jcore-cpe-builder) that builds rudimentary and preconfigured CPEs of your choice. It's working but still work in progress so please bear with us and post issues.
+A newer alternative is [UIMA AS](https://uima.apache.org/doc-uimaas-what.html). It is today's officially recommended way to use and scale UIMA pipelines. Our existing CPE infrastructure serves us well, however, so we mostly stick to those for the time being.
+
+### JCoRe UIMA Pipeline Builder
+
+Most CPE configurations consisting of JCoRe components can be easily built using the [JCoRe UIMA Pipeline Builder](https://github.com/JULIELab/jcore-pipeline-modules).
+This is a Java program that offers a simple command line interface for the creation of CPEs. There is also support for UIMA AS.
### Maven Artifacts
If not stated otherwise, all the components found in this project are at least in their latest release version also available as Maven artifacts:
```
de.julielab
- #COMPONENT-NAME
+ COMPONENT-NAME${jcore-version}
```
-Where `#COMPONENT-NAME` is exactly the same as the name on GitHub.
+Where `COMPONENT-NAME` is exactly the same as the name on GitHub.
For instance, to get the Acronym Resolver, include this in your Maven dependencies:
```
diff --git a/jcore-ace-reader/component.meta b/jcore-ace-reader/component.meta
index 65d83f33b..ac1392e63 100644
--- a/jcore-ace-reader/component.meta
+++ b/jcore-ace-reader/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-ace-reader",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe ACE Reader"
}
diff --git a/jcore-ace-reader/pom.xml b/jcore-ace-reader/pom.xml
index fad4ca485..c4fa13273 100644
--- a/jcore-ace-reader/pom.xml
+++ b/jcore-ace-reader/pom.xml
@@ -13,7 +13,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0
@@ -32,8 +32,8 @@
${jcore-types-version}
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-enginede.julielab
diff --git a/jcore-ace-reader/src/main/resources/de/julielab/jcore/reader/ace/desc/jcore-ace-reader.xml b/jcore-ace-reader/src/main/resources/de/julielab/jcore/reader/ace/desc/jcore-ace-reader.xml
index 6d7d29ff9..576236d5c 100644
--- a/jcore-ace-reader/src/main/resources/de/julielab/jcore/reader/ace/desc/jcore-ace-reader.xml
+++ b/jcore-ace-reader/src/main/resources/de/julielab/jcore/reader/ace/desc/jcore-ace-reader.xml
@@ -5,7 +5,7 @@
AceReaderDescriptor automatically generated by uimaFIT
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-ace-reader/src/test/java/de/julielab/jcore/reader/ace/AceReaderTest.java b/jcore-ace-reader/src/test/java/de/julielab/jcore/reader/ace/AceReaderTest.java
index 465a384f7..b6bd606e4 100644
--- a/jcore-ace-reader/src/test/java/de/julielab/jcore/reader/ace/AceReaderTest.java
+++ b/jcore-ace-reader/src/test/java/de/julielab/jcore/reader/ace/AceReaderTest.java
@@ -21,7 +21,6 @@
import de.julielab.jcore.types.ArgumentMention;
import de.julielab.jcore.types.EntityMention;
import de.julielab.jcore.types.ace.*;
-import junit.framework.TestCase;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.metadata.AnalysisEngineMetaData;
import org.apache.uima.cas.CAS;
@@ -38,6 +37,8 @@
import org.apache.uima.util.InvalidXMLException;
import org.apache.uima.util.XMLInputSource;
import org.apache.uima.util.XMLSerializer;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
@@ -50,7 +51,9 @@
import java.util.ArrayList;
import java.util.Iterator;
-public class AceReaderTest extends TestCase {
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class AceReaderTest {
/**
* Path to the MedlineReader descriptor
*/
@@ -65,47 +68,46 @@ public class AceReaderTest extends TestCase {
/**
* Object to be tested
*/
- private CollectionReader aceReader;
+ private static CollectionReader aceReader;
/**
* Auxiliary collection reader
*/
- private CollectionReader testReader;
+ private static CollectionReader testReader;
/**
* CAS array list with CAS objects that where processed by the aceReader
*/
- private ArrayList casArrayList = new ArrayList();
+ private static ArrayList casArrayList = new ArrayList();
/**
* Auxiliary CAS objects
*/
- private CAS aceReaderCas;
+ private static CAS aceReaderCas;
- private CAS testReaderCas;
+ private static CAS testReaderCas;
- private JCas aceReaderJCas;
+ private static JCas aceReaderJCas;
- private JCas testReaderJCas;
+ private static JCas testReaderJCas;
- LOC entity1_1;
+ static LOC entity1_1;
- LOC entity1_2;
+ static LOC entity1_2;
- GPE entity2_1;
+ static GPE entity2_1;
- GPE entity2_2;
+ static GPE entity2_2;
- GPE entity2_3;
+ static GPE entity2_3;
- GPE entity2_4;
+ static GPE entity2_4;
/*----------------------------------------------------------------------------------------------*/
- @Override
- protected void setUp() throws Exception {
+ @BeforeAll
+ protected static void setUp() throws Exception {
aceReader = getCollectionReader(ACE_READER_DESCRIPTOR);
processAllCases();
- super.setUp();
System.out.println("ALL CASes were processed");
} // of setUp
@@ -118,7 +120,7 @@ protected void setUp() throws Exception {
* @throws SAXException
* @throws ParserConfigurationException
*/
- private void processAllCases() throws CASException, SAXException, ParserConfigurationException {
+ private static void processAllCases() throws CASException, SAXException, ParserConfigurationException {
try {
while (aceReader.hasNext()) {
@@ -157,13 +159,13 @@ private void processAllCases() throws CASException, SAXException, ParserConfigur
} // of processAllCases
/*----------------------------------------------------------------------------------------------*/
- private void compareCASes() {
- assertTrue("Invalid source file attributes!", checkSourceFile());
- assertTrue("Invalid generated Jules Components!", checkGeneratedJulesComponents());
+ private static void compareCASes() {
+ assertTrue(checkSourceFile(), "Invalid source file attributes!");
+ assertTrue(checkGeneratedJulesComponents(), "Invalid generated Jules Components!");
} // compareCASes
/*----------------------------------------------------------------------------------------------*/
- private boolean checkGeneratedJulesComponents() {
+ private static boolean checkGeneratedJulesComponents() {
System.out.println("CALL checkGeneratedJulesComponents()");
boolean julesComponentsEqual = true;
@@ -185,7 +187,7 @@ private boolean checkGeneratedJulesComponents() {
} // checkGeneratedJulesComponents
/*----------------------------------------------------------------------------------------------*/
- private boolean checkJulesEntities() {
+ private static boolean checkJulesEntities() {
System.out.println("CALL checkJulesEntities()");
boolean julesEntityEqual = true;
@@ -237,7 +239,7 @@ private boolean checkJulesEntities() {
} // of checkJulesEntities
/*----------------------------------------------------------------------------------------------*/
- private boolean checkJulesRelations() {
+ private static boolean checkJulesRelations() {
System.out.println("CALL checkJulesRelations()");
boolean juleRelationEqual = true;
@@ -286,8 +288,8 @@ private boolean checkJulesRelations() {
} // of checkJulesRelations
/*----------------------------------------------------------------------------------------------*/
- private boolean checkJulesRelationArguments(de.julielab.jcore.types.RelationMention aceReaderRelation,
- de.julielab.jcore.types.RelationMention testReaderRelation) {
+ private static boolean checkJulesRelationArguments(de.julielab.jcore.types.RelationMention aceReaderRelation,
+ de.julielab.jcore.types.RelationMention testReaderRelation) {
System.out.println("CALL checkJulesRelationArguments()");
boolean julesRelationArgumentEqual = true;
@@ -449,7 +451,7 @@ private boolean checkJulesEventArguments(de.julielab.jcore.types.EventMention ac
} // of checkJulesEventArguments
/*----------------------------------------------------------------------------------------------*/
- private boolean checkSourceFile() {
+ private static boolean checkSourceFile() {
boolean sourceFileEqual = true;
Iterator aceReaderIterator = getTypeIterator(aceReaderCas, de.julielab.jcore.types.ace.SourceFile.type);
@@ -499,7 +501,7 @@ private boolean checkSourceFile() {
} // checkSourceFile
/*----------------------------------------------------------------------------------------------*/
- private boolean checkDocument() {
+ private static boolean checkDocument() {
boolean documentEqual = true;
Iterator aceReaderIterator = getTypeIterator(aceReaderCas, de.julielab.jcore.types.ace.Document.type);
@@ -568,7 +570,7 @@ private boolean checkDocument() {
} // of checkDocument
/*----------------------------------------------------------------------------------------------*/
- private boolean checkEvents(Document aceReaderDocument, Document testReaderDocument) {
+ private static boolean checkEvents(Document aceReaderDocument, Document testReaderDocument) {
System.out.println("CALL checkEvents()");
boolean eventEqual = true;
@@ -641,7 +643,7 @@ private boolean checkEvents(Document aceReaderDocument, Document testReaderDocum
} // of checkEvents
/*----------------------------------------------------------------------------------------------*/
- private boolean checkEventMentions(Event aceReaderEvent, Event testReaderEvent) {
+ private static boolean checkEventMentions(Event aceReaderEvent, Event testReaderEvent) {
boolean eventMentionEqual = true;
FSArray aceReaderEventMentionFSArray = aceReaderEvent.getMentions();
@@ -703,7 +705,7 @@ private boolean checkEventMentions(Event aceReaderEvent, Event testReaderEvent)
} // checkEventMentions
/*----------------------------------------------------------------------------------------------*/
- private boolean checkEventMentionArguments(EventMention aceReaderEventMention, EventMention testReaderEventMention) {
+ private static boolean checkEventMentionArguments(EventMention aceReaderEventMention, EventMention testReaderEventMention) {
boolean eventMentionArgumentEqual = true;
FSArray aceReaderEventMentionArgumentFSArray = aceReaderEventMention.getArguments();
@@ -740,7 +742,7 @@ private boolean checkEventMentionArguments(EventMention aceReaderEventMention, E
} // of checkEventMentionArguments
/*----------------------------------------------------------------------------------------------*/
- private boolean checkEventArguments(Event aceReaderEvent, Event testReaderEvent) {
+ private static boolean checkEventArguments(Event aceReaderEvent, Event testReaderEvent) {
boolean eventArgumentEqual = true;
FSArray aceReaderEventArgumentFSArray = aceReaderEvent.getArguments();
@@ -767,7 +769,7 @@ private boolean checkEventArguments(Event aceReaderEvent, Event testReaderEvent)
} // of checkEventArguments
/*----------------------------------------------------------------------------------------------*/
- private boolean checkRelations(Document aceReaderDocument, Document testReaderDocument) {
+ private static boolean checkRelations(Document aceReaderDocument, Document testReaderDocument) {
boolean relationEqual = true;
FSArray aceReaderRelationFSArray = aceReaderDocument.getRelations();
@@ -830,7 +832,7 @@ private boolean checkRelations(Document aceReaderDocument, Document testReaderDo
} // of checkRelations
/*----------------------------------------------------------------------------------------------*/
- private boolean checkRelationMentions(Relation aceReaderRelation, Relation testReaderRelation) {
+ private static boolean checkRelationMentions(Relation aceReaderRelation, Relation testReaderRelation) {
boolean relationMentionEqual = true;
FSArray aceReaderRelationMentionFSArray = aceReaderRelation.getMentions();
@@ -885,8 +887,8 @@ private boolean checkRelationMentions(Relation aceReaderRelation, Relation testR
} // checkRelationMentions
/*----------------------------------------------------------------------------------------------*/
- private boolean checkRelationMentionArguments(RelationMention aceReaderRelationMention,
- RelationMention testReaderRelationMention) {
+ private static boolean checkRelationMentionArguments(RelationMention aceReaderRelationMention,
+ RelationMention testReaderRelationMention) {
boolean relationMentionArgumentEqual = true;
FSArray aceReaderRelationMentionArgumentFSArray = aceReaderRelationMention.getArguments();
@@ -925,7 +927,7 @@ private boolean checkRelationMentionArguments(RelationMention aceReaderRelationM
}
/*----------------------------------------------------------------------------------------------*/
- private boolean checkRelationArguments(Relation aceReaderRelation, Relation testReaderRelation) {
+ private static boolean checkRelationArguments(Relation aceReaderRelation, Relation testReaderRelation) {
boolean relationArgumentEqual = true;
FSArray aceReaderRelationArgumentFSArray = aceReaderRelation.getArguments();
@@ -952,7 +954,7 @@ private boolean checkRelationArguments(Relation aceReaderRelation, Relation test
} // checkRelationArguments
/*----------------------------------------------------------------------------------------------*/
- private boolean checkTimex2(Document aceReaderDocument, Document testReaderDocument) {
+ private static boolean checkTimex2(Document aceReaderDocument, Document testReaderDocument) {
boolean timex2Equal = true;
FSArray aceReaderTimex2FSArray = aceReaderDocument.getTimex2();
@@ -985,7 +987,7 @@ private boolean checkTimex2(Document aceReaderDocument, Document testReaderDocum
} // checkTimex2
/*----------------------------------------------------------------------------------------------*/
- private boolean checkTimex2Mentions(Timex2 aceReaderTimex2, Timex2 testReaderTimex2) {
+ private static boolean checkTimex2Mentions(Timex2 aceReaderTimex2, Timex2 testReaderTimex2) {
boolean timex2MentionEqual = true;
FSArray aceReaderTimex2MentionFSArray = aceReaderTimex2.getMentions();
@@ -1017,7 +1019,7 @@ private boolean checkTimex2Mentions(Timex2 aceReaderTimex2, Timex2 testReaderTim
} // of checkTimex2Mentions
/*----------------------------------------------------------------------------------------------*/
- private boolean checkValues(Document aceReaderDocument, Document testReaderDocument) {
+ private static boolean checkValues(Document aceReaderDocument, Document testReaderDocument) {
boolean valueEqual = true;
FSArray aceReaderValueFSArray = aceReaderDocument.getValues();
@@ -1060,7 +1062,7 @@ private boolean checkValues(Document aceReaderDocument, Document testReaderDocum
} // of checkValues
/*----------------------------------------------------------------------------------------------*/
- private boolean checkValueMentions(Value aceReaderValue, Value testReaderValue) {
+ private static boolean checkValueMentions(Value aceReaderValue, Value testReaderValue) {
boolean valueMentionEqual = true;
FSArray aceReaderValueMentionFSArray = aceReaderValue.getMentions();
@@ -1093,7 +1095,7 @@ private boolean checkValueMentions(Value aceReaderValue, Value testReaderValue)
} // of checkValueMentions
/*----------------------------------------------------------------------------------------------*/
- private boolean checkEntities() {
+ private static boolean checkEntities() {
boolean entityEqual = true;
Iterator aceReaderIterator = getTypeIterator(aceReaderCas, de.julielab.jcore.types.ace.Entity.type);
@@ -1176,7 +1178,7 @@ private boolean checkEntities() {
} // checkEntities
/*----------------------------------------------------------------------------------------------*/
- private boolean checkEntityAttributes(Entity aceReaderEntity, Entity testReaderEntity) {
+ private static boolean checkEntityAttributes(Entity aceReaderEntity, Entity testReaderEntity) {
boolean entityAttributeEqual = true;
FSArray aceReaderEntityAttributeFSArray = aceReaderEntity.getEntity_attributes();
FSArray testReaderEntityAttributeFSArray = testReaderEntity.getEntity_attributes();
@@ -1208,8 +1210,8 @@ private boolean checkEntityAttributes(Entity aceReaderEntity, Entity testReaderE
} // of checkEntityAttributes
/*----------------------------------------------------------------------------------------------*/
- private boolean checkEntityAttributesNames(EntityAttribute aceReaderEntityAttribute,
- EntityAttribute testReaderEntityAttribute) {
+ private static boolean checkEntityAttributesNames(EntityAttribute aceReaderEntityAttribute,
+ EntityAttribute testReaderEntityAttribute) {
boolean entityAttributesNamesEqual = true;
FSArray aceReaderEntityAttributesNamesFSArray = aceReaderEntityAttribute.getNames();
FSArray testReaderEntityAttributesNamesFSArray = testReaderEntityAttribute.getNames();
@@ -1241,7 +1243,7 @@ private boolean checkEntityAttributesNames(EntityAttribute aceReaderEntityAttrib
} // checkEntityAttributesNames
/*----------------------------------------------------------------------------------------------*/
- private boolean checkEntityMentions(Entity aceReaderEntity, Entity testReaderEntity) {
+ private static boolean checkEntityMentions(Entity aceReaderEntity, Entity testReaderEntity) {
boolean entityMentionEqual = true;
FSArray aceReaderEntityMentionFSArray = aceReaderEntity.getEntity_mentions();
FSArray testReaderEntityMentionFSArray = testReaderEntity.getEntity_mentions();
@@ -1309,7 +1311,7 @@ private boolean checkEntityMentions(Entity aceReaderEntity, Entity testReaderEnt
} // of checkEntityMentions
/*----------------------------------------------------------------------------------------------*/
- private void buildSourceFile(JCas jcas) throws SAXException, IOException, ParserConfigurationException {
+ private static void buildSourceFile(JCas jcas) throws SAXException, IOException, ParserConfigurationException {
de.julielab.jcore.types.ace.SourceFile sourceFile = new de.julielab.jcore.types.ace.SourceFile(jcas);
sourceFile.setUri("XIN_ENG_20030624.0085.sgm");
@@ -1329,14 +1331,14 @@ private void buildSourceFile(JCas jcas) throws SAXException, IOException, Parser
} // buildSourceFile
/*----------------------------------------------------------------------------------------------*/
- private void setDocumentText(CAS testReaderCas2, org.w3c.dom.Document sgmDomDocument) {
+ private static void setDocumentText(CAS testReaderCas2, org.w3c.dom.Document sgmDomDocument) {
Node documentNode = sgmDomDocument.getDocumentElement();
String documentText = documentNode.getTextContent();
testReaderCas2.setDocumentText(documentText);
} // of setDocumentText
/*----------------------------------------------------------------------------------------------*/
- private void buildDocument(JCas jcas, SourceFile sourceFile) {
+ private static void buildDocument(JCas jcas, SourceFile sourceFile) {
de.julielab.jcore.types.ace.Document document = new de.julielab.jcore.types.ace.Document(jcas);
document.setDocid("XIN_ENG_20030624.0085");
buildEntities(jcas, document);
@@ -1401,7 +1403,7 @@ private void buildJulesEventArgs(JCas jcas, Transaction event1) {
} // buildJulesEventArgs
/*----------------------------------------------------------------------------------------------*/
- private void buildJulesRelations(JCas jcas, Document document) {
+ private static void buildJulesRelations(JCas jcas, Document document) {
System.out.println("CALL buildJulesRelations()");
PART_WHOLE relation1_1 = new PART_WHOLE(jcas);
relation1_1.setBegin(543);
@@ -1490,7 +1492,7 @@ private void buildJulesRelations(JCas jcas, Document document) {
} // of buildJulesRelations
/*----------------------------------------------------------------------------------------------*/
- private void buildJulesEntities(JCas jcas, Document document) {
+ private static void buildJulesEntities(JCas jcas, Document document) {
System.out.println("CALL buildJulesEntities()");
entity1_1 = new LOC(jcas);
@@ -1562,7 +1564,7 @@ private void buildJulesEntities(JCas jcas, Document document) {
} // of buildJulesEntities
/*----------------------------------------------------------------------------------------------*/
- private void buildEvents(JCas jcas, Document document) {
+ private static void buildEvents(JCas jcas, Document document) {
de.julielab.jcore.types.ace.Event event = new de.julielab.jcore.types.ace.Event(jcas);
event.setGenericity("Specific");
@@ -1583,7 +1585,7 @@ private void buildEvents(JCas jcas, Document document) {
} // of buildEvents
/*----------------------------------------------------------------------------------------------*/
- private void buildEventMentions(JCas jcas, Event event) {
+ private static void buildEventMentions(JCas jcas, Event event) {
de.julielab.jcore.types.ace.EventMention eventMention = new de.julielab.jcore.types.ace.EventMention(jcas);
eventMention.setId("XIN_ENG_20030405.0080-EV2-1");
eventMention.setBegin(625);
@@ -1612,7 +1614,7 @@ private void buildEventMentions(JCas jcas, Event event) {
} // of buildEventMentions
/*----------------------------------------------------------------------------------------------*/
- private void buildEventMentionArguments(JCas jcas, EventMention eventMention) {
+ private static void buildEventMentionArguments(JCas jcas, EventMention eventMention) {
de.julielab.jcore.types.ace.EventMentionArgument eventMentionArgument1 = new de.julielab.jcore.types.ace.EventMentionArgument(
jcas);
eventMentionArgument1.setAce_role("Recipient");
@@ -1637,7 +1639,7 @@ private void buildEventMentionArguments(JCas jcas, EventMention eventMention) {
} // of buildEventMentionArguments
/*----------------------------------------------------------------------------------------------*/
- private void buildEventArguments(JCas jcas, Event event) {
+ private static void buildEventArguments(JCas jcas, Event event) {
de.julielab.jcore.types.ace.EventArgument eventArgument1 = new de.julielab.jcore.types.ace.EventArgument(jcas);
eventArgument1.setAce_role("Recipient");
eventArgument1.setRefid("XIN_ENG_20030405.0080-E1");
@@ -1656,7 +1658,7 @@ private void buildEventArguments(JCas jcas, Event event) {
} // of buildEventArguments
/*----------------------------------------------------------------------------------------------*/
- private void buildRelations(JCas jcas, Document document) {
+ private static void buildRelations(JCas jcas, Document document) {
de.julielab.jcore.types.ace.Relation relation1 = new de.julielab.jcore.types.ace.Relation(jcas);
relation1.setModality("Asserted");
relation1.setTense("Unspecified");
@@ -1685,7 +1687,7 @@ private void buildRelations(JCas jcas, Document document) {
} // of buildRelations
/*----------------------------------------------------------------------------------------------*/
- private void buildRelationMentions2(JCas jcas, Relation relation2) {
+ private static void buildRelationMentions2(JCas jcas, Relation relation2) {
de.julielab.jcore.types.ace.RelationMention relationMention2_1 = new de.julielab.jcore.types.ace.RelationMention(
jcas);
relationMention2_1.setLexical_condition("Preposition");
@@ -1714,7 +1716,7 @@ private void buildRelationMentions2(JCas jcas, Relation relation2) {
} // of buildRelationMentions2
/*----------------------------------------------------------------------------------------------*/
- private void buildRelationMentionArgument2_2(JCas jcas, RelationMention relationMention2_2) {
+ private static void buildRelationMentionArgument2_2(JCas jcas, RelationMention relationMention2_2) {
de.julielab.jcore.types.ace.RelationMentionArgument argument1 = new de.julielab.jcore.types.ace.RelationMentionArgument(
jcas);
argument1.setAce_role("Arg-2");
@@ -1739,7 +1741,7 @@ private void buildRelationMentionArgument2_2(JCas jcas, RelationMention relation
} // of buildRelationMentionArgument2_2
/*----------------------------------------------------------------------------------------------*/
- private void buildRelationMentionArguments2_1(JCas jcas, RelationMention relationMention1) {
+ private static void buildRelationMentionArguments2_1(JCas jcas, RelationMention relationMention1) {
de.julielab.jcore.types.ace.RelationMentionArgument argument1 = new de.julielab.jcore.types.ace.RelationMentionArgument(
jcas);
argument1.setAce_role("Arg-2");
@@ -1764,7 +1766,7 @@ private void buildRelationMentionArguments2_1(JCas jcas, RelationMention relatio
} // of buildRelationMentionArguments2_1
/*----------------------------------------------------------------------------------------------*/
- private void buildRelationArguments2(JCas jcas, Relation relation2) {
+ private static void buildRelationArguments2(JCas jcas, Relation relation2) {
de.julielab.jcore.types.ace.RelationArgument argument1 = new de.julielab.jcore.types.ace.RelationArgument(jcas);
argument1.setAce_role("Arg-2");
argument1.setRefid("XIN_ENG_20030624.0085-E1");
@@ -1782,7 +1784,7 @@ private void buildRelationArguments2(JCas jcas, Relation relation2) {
} // of buildRelationArguments2
/*----------------------------------------------------------------------------------------------*/
- private void buildRelationMentions1(JCas jcas, Relation relation) {
+ private static void buildRelationMentions1(JCas jcas, Relation relation) {
de.julielab.jcore.types.ace.RelationMention relationMention1 = new de.julielab.jcore.types.ace.RelationMention(
jcas);
relationMention1.setLexical_condition("Preposition");
@@ -1811,7 +1813,7 @@ private void buildRelationMentions1(JCas jcas, Relation relation) {
} // buildRelationMentions
/*----------------------------------------------------------------------------------------------*/
- private void buildRelationMentionArguments1_2(JCas jcas, RelationMention relationMention2) {
+ private static void buildRelationMentionArguments1_2(JCas jcas, RelationMention relationMention2) {
de.julielab.jcore.types.ace.RelationMentionArgument argument1 = new de.julielab.jcore.types.ace.RelationMentionArgument(
jcas);
argument1.setAce_role("Arg-1");
@@ -1836,7 +1838,7 @@ private void buildRelationMentionArguments1_2(JCas jcas, RelationMention relatio
} // buildRelationMentionArguments2
/*----------------------------------------------------------------------------------------------*/
- private void buildRelationMentionArguments1_1(JCas jcas, RelationMention relationMention1) {
+ private static void buildRelationMentionArguments1_1(JCas jcas, RelationMention relationMention1) {
de.julielab.jcore.types.ace.RelationMentionArgument argument1 = new de.julielab.jcore.types.ace.RelationMentionArgument(
jcas);
argument1.setAce_role("Arg-1");
@@ -1861,7 +1863,7 @@ private void buildRelationMentionArguments1_1(JCas jcas, RelationMention relatio
} // buildRelationMentionArguments1
/*----------------------------------------------------------------------------------------------*/
- private void buildRelationAgruments1(JCas jcas, Relation relation) {
+ private static void buildRelationAgruments1(JCas jcas, Relation relation) {
de.julielab.jcore.types.ace.RelationArgument argument1 = new de.julielab.jcore.types.ace.RelationArgument(jcas);
argument1.setAce_role("Arg-1");
argument1.setRefid("XIN_ENG_20030624.0085-E1");
@@ -1880,7 +1882,7 @@ private void buildRelationAgruments1(JCas jcas, Relation relation) {
} // buildRelationAgruments
/*----------------------------------------------------------------------------------------------*/
- private void buildTimex2(JCas jcas, Document document) {
+ private static void buildTimex2(JCas jcas, Document document) {
de.julielab.jcore.types.ace.Timex2 timex2_1 = new de.julielab.jcore.types.ace.Timex2(jcas);
timex2_1.setId("XIN_ENG_20030624.0085-T4");
buildTimex2Mentions1(jcas, timex2_1);
@@ -1897,7 +1899,7 @@ private void buildTimex2(JCas jcas, Document document) {
} // buildTimex2
/*----------------------------------------------------------------------------------------------*/
- private void buildTimex2Mentions2(JCas jcas, Timex2 timex2_2) {
+ private static void buildTimex2Mentions2(JCas jcas, Timex2 timex2_2) {
de.julielab.jcore.types.ace.Timex2Mention timex2Mention = new de.julielab.jcore.types.ace.Timex2Mention(jcas);
timex2Mention.setId("XIN_ENG_20030624.0085-T8-1");
timex2Mention.setBegin(1327);
@@ -1911,7 +1913,7 @@ private void buildTimex2Mentions2(JCas jcas, Timex2 timex2_2) {
} // buildTimex2Mentions2
/*----------------------------------------------------------------------------------------------*/
- private void buildTimex2Mentions1(JCas jcas, Timex2 timex2_1) {
+ private static void buildTimex2Mentions1(JCas jcas, Timex2 timex2_1) {
de.julielab.jcore.types.ace.Timex2Mention timex2Mention = new de.julielab.jcore.types.ace.Timex2Mention(jcas);
timex2Mention.setId("XIN_ENG_20030624.0085-T4-1");
timex2Mention.setBegin(327);
@@ -1925,7 +1927,7 @@ private void buildTimex2Mentions1(JCas jcas, Timex2 timex2_1) {
} // buildTimex2Mentions1
/*----------------------------------------------------------------------------------------------*/
- private void buildValues(JCas jcas, Document document) {
+ private static void buildValues(JCas jcas, Document document) {
de.julielab.jcore.types.ace.Value value1 = new de.julielab.jcore.types.ace.Value(jcas);
value1.setAce_type("Numeric");
value1.setAce_subtype("Money");
@@ -1948,7 +1950,7 @@ private void buildValues(JCas jcas, Document document) {
} // buildValues
/*----------------------------------------------------------------------------------------------*/
- private void buildValueMentuions2(JCas jcas, Value value2) {
+ private static void buildValueMentuions2(JCas jcas, Value value2) {
de.julielab.jcore.types.ace.ValueMention valueMention = new de.julielab.jcore.types.ace.ValueMention(jcas);
valueMention.setId("XIN_ENG_20030624.0085-V3-1");
valueMention.setBegin(1079);
@@ -1962,7 +1964,7 @@ private void buildValueMentuions2(JCas jcas, Value value2) {
} // buildValueMentuions2
/*----------------------------------------------------------------------------------------------*/
- private void buildValueMentions1(JCas jcas, Value value1) {
+ private static void buildValueMentions1(JCas jcas, Value value1) {
de.julielab.jcore.types.ace.ValueMention valueMention = new de.julielab.jcore.types.ace.ValueMention(jcas);
valueMention.setId("XIN_ENG_20030624.0085-V2-1");
valueMention.setBegin(826);
@@ -1976,7 +1978,7 @@ private void buildValueMentions1(JCas jcas, Value value1) {
} // buildValueMentions1
/*----------------------------------------------------------------------------------------------*/
- private void buildEntities(JCas jcas, de.julielab.jcore.types.ace.Document document) {
+ private static void buildEntities(JCas jcas, de.julielab.jcore.types.ace.Document document) {
Entity entity1 = new Entity(jcas);
entity1.setAce_class("USP");
entity1.setAce_type("LOC");
@@ -2003,14 +2005,14 @@ private void buildEntities(JCas jcas, de.julielab.jcore.types.ace.Document docum
} // of buildEntities
/*----------------------------------------------------------------------------------------------*/
- private void buildEntityAttributes1(JCas jcas, Entity entity1) {
+ private static void buildEntityAttributes1(JCas jcas, Entity entity1) {
FSArray entityAttributeFSArray = new FSArray(jcas, 0);
entityAttributeFSArray.addToIndexes();
entity1.setEntity_attributes(entityAttributeFSArray);
} // buildEntityAttributes1
/*----------------------------------------------------------------------------------------------*/
- private void buildEntityAttributes2(JCas jcas, Entity entity2) {
+ private static void buildEntityAttributes2(JCas jcas, Entity entity2) {
de.julielab.jcore.types.ace.EntityAttribute entityAttribute = new de.julielab.jcore.types.ace.EntityAttribute(
jcas);
@@ -2024,7 +2026,7 @@ private void buildEntityAttributes2(JCas jcas, Entity entity2) {
} // ofbuildEntityAttributes2
/*----------------------------------------------------------------------------------------------*/
- private void buildEntityAttributeNames(JCas jcas, de.julielab.jcore.types.ace.EntityAttribute entityAttribute) {
+ private static void buildEntityAttributeNames(JCas jcas, de.julielab.jcore.types.ace.EntityAttribute entityAttribute) {
FSArray nameFSArray = new FSArray(jcas, 4);
de.julielab.jcore.types.ace.Name entityAttributeName1 = new de.julielab.jcore.types.ace.Name(jcas);
@@ -2060,7 +2062,7 @@ private void buildEntityAttributeNames(JCas jcas, de.julielab.jcore.types.ace.En
} // buildEntityAttributeNames
/*----------------------------------------------------------------------------------------------*/
- private void buildEntityMentions1(JCas jcas, Entity entity) {
+ private static void buildEntityMentions1(JCas jcas, Entity entity) {
de.julielab.jcore.types.ace.EntityMention entityMention1 = new de.julielab.jcore.types.ace.EntityMention(jcas);
entityMention1.setMention_ldctype("PTV");
entityMention1.setMention_type("PRO");
@@ -2101,7 +2103,7 @@ private void buildEntityMentions1(JCas jcas, Entity entity) {
} // of buildEntityMentions
/*----------------------------------------------------------------------------------------------*/
- private void buildEntityMentions2(JCas jcas, Entity entity2) {
+ private static void buildEntityMentions2(JCas jcas, Entity entity2) {
de.julielab.jcore.types.ace.EntityMention entityMention1 = new de.julielab.jcore.types.ace.EntityMention(jcas);
entityMention1.setLdcatr("FALSE");
entityMention1.setAce_role("LOC");
@@ -2180,6 +2182,7 @@ private void buildEntityMentions2(JCas jcas, Entity entity2) {
/**
* Test if method getNextCas() has done its job
*/
+ @Test
public void testGetNextCas() {
System.out.println("CALL testGetNextCas");
checkDocumentText();
@@ -2195,7 +2198,7 @@ public void checkDocumentText() {
for (int i = 0; i < casArrayList.size(); i++) {
String text = casArrayList.get(i).getDocumentText();
- assertTrue(((text == null) ? "null" : text), (text != null) && (!text.equals("")));
+ assertTrue((text != null) && (!text.equals("")), ((text == null) ? "null" : text));
} // of for
} // of checkDocumentText
@@ -2209,7 +2212,7 @@ public void checkDocumentText() {
* the type
* @return the iterator
*/
- private Iterator getTypeIterator(CAS cas, int type) {
+ private static Iterator getTypeIterator(CAS cas, int type) {
Iterator iterator = null;
try {
@@ -2221,7 +2224,7 @@ private Iterator getTypeIterator(CAS cas, int type) {
} // getTypeIterator
/*----------------------------------------------------------------------------------------------*/
- private void writeCasToXMI(CAS cas, int docs) throws CASException, IOException, SAXException {
+ private static void writeCasToXMI(CAS cas, int docs) throws CASException, IOException, SAXException {
JFSIndexRepository indexes = cas.getJCas().getJFSIndexRepository();
Iterator documentIter = indexes.getAnnotationIndex(Document.type).iterator();
diff --git a/jcore-ace-reader/src/test/resources/de/julielab/jcore/reader/ace/data/out/PLACEHOLDER b/jcore-ace-reader/src/test/resources/de/julielab/jcore/reader/ace/data/out/PLACEHOLDER
new file mode 100644
index 000000000..e69de29bb
diff --git a/jcore-acronym-ae/component.meta b/jcore-acronym-ae/component.meta
index 4ccd014c0..b7c013133 100644
--- a/jcore-acronym-ae/component.meta
+++ b/jcore-acronym-ae/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-acronym-ae",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe Acronym Tagger"
}
diff --git a/jcore-acronym-ae/pom.xml b/jcore-acronym-ae/pom.xml
index df40261b4..b5e1c0d89 100644
--- a/jcore-acronym-ae/pom.xml
+++ b/jcore-acronym-ae/pom.xml
@@ -14,7 +14,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0
@@ -38,8 +38,8 @@
${jcore-utilities-version}
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-engine
diff --git a/jcore-acronym-ae/src/main/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java b/jcore-acronym-ae/src/main/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java
index ad7877e80..a8e588af9 100644
--- a/jcore-acronym-ae/src/main/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java
+++ b/jcore-acronym-ae/src/main/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotator.java
@@ -158,12 +158,9 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept
LOGGER.info(" done");
- } catch (AnnotatorContextException e) {
- throw new ResourceInitializationException();
- } catch (AnnotatorConfigurationException e) {
- throw new ResourceInitializationException();
- } catch (ResourceProcessException e) {
- throw new ResourceInitializationException();
+ } catch (AnnotatorContextException | AnnotatorConfigurationException | ResourceProcessException e) {
+ LOGGER.error("Could not initialize acronym annotator", e);
+ throw new ResourceInitializationException(e);
}
}
@@ -237,19 +234,21 @@ public void process(JCas aJCas) {
annotate(sentenceText, aJCas, sentence.getBegin());
}
- // if extra annotation is whished, do so :-)
+ // if extra annotation is wished, do so :-)
if (consistencyAnno) {
ConsistencyAnnotator ca = new ConsistencyAnnotator();
ca.consistencyAnnotate(aJCas);
}
-
+
if (postprocessing) {
Postprocessing.doPostprocessing(aJCas);
}
-
+
} catch (StringIndexOutOfBoundsException e) {
LOGGER.error("typical Error in AcronymAnnotator.process() : StringIndexOutOfBounds");
+ } catch (Throwable t) {
+ LOGGER.error("Acronym resolution error: ", t);
}
}
@@ -557,10 +556,6 @@ private int findFullformStart(String potFF, String acro) {
/**
* looks for the 'best' position in the sentence to start looking for a fullform
*
- * @param sentence
- * @param acroStart
- * @param maxTokens
- * @return
*/
private int getPotFullformStart(String sentence, int acroStart, int acroLength) {
diff --git a/jcore-acronym-ae/src/main/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronym-ae.xml b/jcore-acronym-ae/src/main/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronym-ae.xml
index f31cada2f..df6b3d9cc 100755
--- a/jcore-acronym-ae/src/main/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronym-ae.xml
+++ b/jcore-acronym-ae/src/main/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronym-ae.xml
@@ -6,7 +6,7 @@
JCoRe AcronymAnnotator
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-acronym-ae/src/test/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotatorTest.java b/jcore-acronym-ae/src/test/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotatorTest.java
index c2c74ba6e..3721ee562 100644
--- a/jcore-acronym-ae/src/test/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotatorTest.java
+++ b/jcore-acronym-ae/src/test/java/de/julielab/jcore/ae/acronymtagger/main/AcronymAnnotatorTest.java
@@ -18,7 +18,6 @@
import de.julielab.jcore.types.Abbreviation;
import de.julielab.jcore.types.AbbreviationLongform;
import de.julielab.jcore.types.Sentence;
-import junit.framework.TestCase;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.cas.CAS;
@@ -35,7 +34,7 @@
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.util.InvalidXMLException;
import org.apache.uima.util.XMLInputSource;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -45,12 +44,14 @@
import java.util.ArrayList;
import java.util.Collection;
+import static org.junit.jupiter.api.Assertions.*;
+
/**
* The AcronymAnnotatorTest class
*
* @author jwermter
*/
-public class AcronymAnnotatorTest extends TestCase {
+public class AcronymAnnotatorTest {
private static final String DOCUMENT_TEXT = "[TAZ]Die Firma Kohl-kopf (FK-K) hat für die Straßenverkehrsordnung (StVO) "
+ "in der Bundesrepublik Deutschland(BRD) einen hochintelligenten Manager für die Chefetage "
@@ -73,6 +74,7 @@ public class AcronymAnnotatorTest extends TestCase {
private static final String ALL_TYPES_NAME = "de.julielab.jcore.types.jcore-all-types";
+ @Test
public void testProcess() throws ResourceInitializationException, InvalidXMLException, IOException, CASException {
CAS cas = CasCreationUtils.createCas(
diff --git a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/JulesToolsAEDescriptor.xml b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/JulesToolsAEDescriptor.xml
index 9aa0a7e09..05179e6b2 100644
--- a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/JulesToolsAEDescriptor.xml
+++ b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/JulesToolsAEDescriptor.xml
@@ -6,7 +6,7 @@
JulesToolsDescriptor
- 2.5.1-SNAPSHOT
+ 2.6.0
diff --git a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronymtagger-test.xml b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronymtagger-test.xml
index 8e179d4c3..f9a981135 100755
--- a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronymtagger-test.xml
+++ b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/desc/jcore-acronymtagger-test.xml
@@ -6,7 +6,7 @@
JCoRe AcronymAnnotator
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/types/StemNetSemanticsTypeSystem.xml b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/types/StemNetSemanticsTypeSystem.xml
index fd197d12f..d918bfcba 100644
--- a/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/types/StemNetSemanticsTypeSystem.xml
+++ b/jcore-acronym-ae/src/test/resources/de/julielab/jcore/ae/acronymtagger/types/StemNetSemanticsTypeSystem.xml
@@ -2,7 +2,7 @@
StemNetSemanticsTypeSystem
-2.5.1-SNAPSHOT
+2.6.0http://www.julielab.de
diff --git a/jcore-acronym-writer/component.meta b/jcore-acronym-writer/component.meta
index b0999bc38..7cdcd3451 100644
--- a/jcore-acronym-writer/component.meta
+++ b/jcore-acronym-writer/component.meta
@@ -2,7 +2,7 @@
"categories": [
"consumer"
],
- "description": "Writes acronyms annotations from the CAS to a text file format.",
+ "description": "Writes acronym annotations from the CAS to a text file format.",
"descriptors": [
{
"category": "consumer",
@@ -13,8 +13,8 @@
"group": "general",
"maven-artifact": {
"artifactId": "jcore-acronym-writer",
- "groupId": "de.julielab.jcore.consumer.acronyms",
- "version": "2.5.0-SNAPSHOT"
+ "groupId": "de.julielab",
+ "version": "2.6.0"
},
"name": "JCoRe Acronym Writer"
}
diff --git a/jcore-acronym-writer/pom.xml b/jcore-acronym-writer/pom.xml
index e01349996..287448025 100644
--- a/jcore-acronym-writer/pom.xml
+++ b/jcore-acronym-writer/pom.xml
@@ -5,12 +5,11 @@
4.0.0jcore-acronym-writerjar
- de.julielab.jcore.consumer.acronymsde.julielabjcore-base
- 2.5.0-SNAPSHOT
+ 2.6.0
@@ -39,11 +38,11 @@
de.julielabjcore-types
- ${jcore-version}
+ ${jcore-types-version}
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-engineJCoRe Acronym Writer
@@ -58,5 +57,5 @@
https://github.com/JULIELab/jcore-base/tree/master/jcore-acronym-writer
- Writes acronyms annotations from the CAS to a text file format.
+ Writes acronym annotations from the CAS to a text file format.
diff --git a/jcore-acronym-writer/src/main/java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java b/jcore-acronym-writer/src/main/java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java
index b1aabca29..ddc1ba416 100644
--- a/jcore-acronym-writer/src/main/java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java
+++ b/jcore-acronym-writer/src/main/java/de/julielab/jcore/consumer/acronyms/AcronymWriter.java
@@ -15,6 +15,8 @@
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
@@ -24,7 +26,7 @@
@ResourceMetaData(name = "JCoRe Acronym Writer", description = "Writes acronym annotation to a text file.")
public class AcronymWriter extends JCasAnnotator_ImplBase {
-
+private final static Logger log = LoggerFactory.getLogger(AcronymWriter.class);
public static final String PARAM_OUTPUTFILE = "OutputFile";
@ConfigurationParameter(name = PARAM_OUTPUTFILE)
@@ -38,12 +40,15 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept
try {
os = FileUtilities.getOutputStreamToFile(new File(outputFile));
} catch (IOException e) {
+ log.error("Could not initialize acronym writer", e);
throw new ResourceInitializationException(e);
}
+ log.trace("AcronymWriter successfully initialized.");
}
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
+ log.trace("Processing with AcronymWriter");
try {
String pubmedId = JCoReTools.getDocId(jcas);
FSIterator it = jcas.getAnnotationIndex(Abbreviation.type).iterator();
@@ -70,7 +75,10 @@ public void process(JCas jcas) throws AnalysisEngineProcessException {
++abbrCount;
}
} catch (CASRuntimeException | IOException e) {
+ log.error("Exception while writing acronyms", e);
throw new AnalysisEngineProcessException(e);
+ } catch (Throwable t) {
+ log.error("Exception while writing acronyms", t);
}
}
diff --git a/jcore-acronym-writer/src/main/resources/de/julielab/jcore/consumer/acronyms/desc/jcore-acronym-writer.xml b/jcore-acronym-writer/src/main/resources/de/julielab/jcore/consumer/acronyms/desc/jcore-acronym-writer.xml
index 5f3073b02..31ce7af9a 100644
--- a/jcore-acronym-writer/src/main/resources/de/julielab/jcore/consumer/acronyms/desc/jcore-acronym-writer.xml
+++ b/jcore-acronym-writer/src/main/resources/de/julielab/jcore/consumer/acronyms/desc/jcore-acronym-writer.xml
@@ -2,11 +2,11 @@
org.apache.uima.javatrue
- de.julielab.jcore.consumer.acronyms.AcronymWriter
+ de.julielab.jcore.consumer.acronyms.AcronymWriterJCoRe Acronym WriterWrites acronym annotation to a text file.
- 2.5.1-SNAPSHOT
+ 2.6.0OutputFile
diff --git a/jcore-acronym-writer/src/test/java/de/julielab/jcore/consumer/acronyms/AcronymWriterTest.java b/jcore-acronym-writer/src/test/java/de/julielab/jcore/consumer/acronyms/AcronymWriterTest.java
index 243f4481a..c63bfd442 100644
--- a/jcore-acronym-writer/src/test/java/de/julielab/jcore/consumer/acronyms/AcronymWriterTest.java
+++ b/jcore-acronym-writer/src/test/java/de/julielab/jcore/consumer/acronyms/AcronymWriterTest.java
@@ -1,5 +1,5 @@
-package de.julielab.jcore.consumer.acronyms;
+package de.julielab.jcore.consumer.acronyms;
/**
* Unit tests for jcore-acronym-writer.
diff --git a/jcore-annotation-adder-ae/README.md b/jcore-annotation-adder-ae/README.md
index bf3d32b2c..c3e1f9fe0 100644
--- a/jcore-annotation-adder-ae/README.md
+++ b/jcore-annotation-adder-ae/README.md
@@ -28,11 +28,11 @@ For document class annotations, no offset mode is required, obviously. Whether t
**3. External Resource Dependencies**
-This component requires an external resource given with the `AnnotationSource` key. This dependency definition is present in the provided default descriptor.
+This component requires an external resource given with the `AnnotationSource` key. This dependency definition is pre-configured in the provided default descriptor and must be adapted to point to the correct annotation source.
The external dependency may currently be a file which is read completely into an in-memory map by the `de.julielab.jcore.ae.annotationadder.annotationsources.InMemoryFileTextAnnotationProvider` class for textual annotations with offsets or by the `de.julielab.jcore.ae.annotationadder.annotationsources.InMemoryFileDocumentClassAnnotationProvider` class for document classes. Both provider classes implement the required external resource interface `de.julielab.jcore.ae.annotationadder.annotationsources.AnnotationProvider`.
-Other approaches, that are possible easier on the resources - might be implemented if necessary.
+Other approaches that are possibly easier on the resources might be implemented if necessary.
Currently, the external resource definition looks as follows:
diff --git a/jcore-annotation-adder-ae/component.meta b/jcore-annotation-adder-ae/component.meta
index 500127938..fe12dbf50 100644
--- a/jcore-annotation-adder-ae/component.meta
+++ b/jcore-annotation-adder-ae/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-annotation-adder-ae",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe Annotation Adder"
}
diff --git a/jcore-annotation-adder-ae/pom.xml b/jcore-annotation-adder-ae/pom.xml
index 1473a562b..7cdc4c465 100644
--- a/jcore-annotation-adder-ae/pom.xml
+++ b/jcore-annotation-adder-ae/pom.xml
@@ -9,7 +9,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0
@@ -41,8 +41,8 @@
${jcore-types-version}
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-enginede.julielab
@@ -53,6 +53,11 @@
commons-codec1.13
+
+ com.h2database
+ h2
+ 2.1.214
+ JCoRe Annotation Adder
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java
index b31fc7d05..ceaac7535 100644
--- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotator.java
@@ -6,6 +6,7 @@
import de.julielab.jcore.utility.JCoReTools;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.fit.descriptor.ResourceMetaData;
@@ -39,6 +40,7 @@ public enum OffsetMode {CHARACTER, TOKEN}
@ConfigurationParameter(name = PARAM_PREVENT_PROCESSED_MARK, mandatory = false, description = "This setting is only in effect if an input format is used that contains document text SHA256 digests while also writing the annotation results into a JeDIS database. If then a CAS document text, to which annotations should be added, does not match the digest given by an annotation, this CAS will not marked as being finished processing by DBCheckpointAE that may follow in the pipeline. The idea is that the mismatched documents require a reprocessing of the original annotation creation algorithm because their text has been changed relative to the annotation on file. By not setting the document as being finished processed, it is straightforward to process only those documents again that failed to add one or multiple annotations.")
private boolean preventProcessedOnDigestMismatch;
+
private List annotationAdders = Arrays.asList(new TextAnnotationListAdder(), new DocumentClassAnnotationAdder());
/**
@@ -49,6 +51,7 @@ public enum OffsetMode {CHARACTER, TOKEN}
public void initialize(final UimaContext aContext) throws ResourceInitializationException {
offsetMode = OffsetMode.valueOf(Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_OFFSET_MODE)).orElse(OffsetMode.CHARACTER.name()));
defaultUimaType = (String) aContext.getConfigParameterValue(PARAM_DEFAULT_UIMA_TYPE);
+ preventProcessedOnDigestMismatch = Optional.ofNullable((Boolean) aContext.getConfigParameterValue(PARAM_PREVENT_PROCESSED_MARK)).orElse(false);
try {
annotationProvider = (AnnotationProvider extends AnnotationData>) aContext.getResourceObject(KEY_ANNOTATION_SOURCE);
} catch (ResourceAccessException e) {
@@ -65,23 +68,29 @@ public void initialize(final UimaContext aContext) throws ResourceInitialization
* is where the actual work happens.
*/
@Override
- public void process(final JCas aJCas) {
- final String docId = JCoReTools.getDocId(aJCas);
- if (docId == null)
- log.error("The current document does not have a header. Cannot add external annotations.");
- final AnnotationData annotations = annotationProvider.getAnnotations(docId);
- final AnnotationAdderHelper helper = new AnnotationAdderHelper();
- if (annotations != null) {
- boolean success = false;
- int adderNum = 0;
- // We are now iterating through the available annotation adders for the one that handles the obtained annotation data
- while (adderNum < annotationAdders.size() && !(success = annotationAdders.get(adderNum).addAnnotations(annotations, helper, adderConfiguration, aJCas, preventProcessedOnDigestMismatch))) {
- ++adderNum;
+ public void process(final JCas aJCas) throws AnalysisEngineProcessException {
+ try {
+ final String docId = JCoReTools.getDocId(aJCas);
+ if (docId == null)
+ log.error("The current document does not have a header. Cannot add external annotations.");
+ final AnnotationData annotations = annotationProvider.getAnnotations(docId);
+ final AnnotationAdderHelper helper = new AnnotationAdderHelper();
+ if (annotations != null) {
+ log.trace("Found annotations for document ID {}.", docId);
+ boolean success = false;
+ int adderNum = 0;
+ // We are now iterating through the available annotation adders for the one that handles the obtained annotation data
+ while (adderNum < annotationAdders.size() && !(success = annotationAdders.get(adderNum).addAnnotations(annotations, helper, adderConfiguration, aJCas, preventProcessedOnDigestMismatch))) {
+ ++adderNum;
+ }
+ if (!success)
+ throw new IllegalArgumentException("There was no annotation adder to handle the annotation data of class " + annotations.getClass().getCanonicalName());
+ } else {
+ log.debug("No external annotations were delivered for document ID {}", docId);
}
- if (!success)
- throw new IllegalArgumentException("There was no annotation adder to handle the annotation data of class " + annotations.getClass().getCanonicalName());
- } else {
- log.debug("No external annotations were delivered for document ID {}", docId);
+ } catch (Throwable t) {
+ log.error("Could not add annotations due to exception.", t);
+ throw new AnalysisEngineProcessException(t);
}
}
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelper.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelper.java
index 831ecb280..219d4d286 100644
--- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelper.java
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelper.java
@@ -1,31 +1,41 @@
package de.julielab.jcore.ae.annotationadder;
+import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation;
import de.julielab.jcore.ae.annotationadder.annotationrepresentations.TextAnnotation;
import de.julielab.jcore.types.Sentence;
import de.julielab.jcore.types.Token;
+import org.apache.commons.lang3.StringUtils;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import java.util.stream.Collectors;
/**
* Caches information for the current document.
*/
public class AnnotationAdderHelper {
+ private final static Logger log = LoggerFactory.getLogger(AnnotationAdderHelper.class);
// Required for token-offsets
private List tokenList;
private Map> tokensBySentences;
private Matcher wsFinder = Pattern.compile("\\s").matcher("");
private Matcher nonWsMatcher = Pattern.compile("[^\\s]+").matcher("");
-
+ /**
+ * Caches methods for feature
+ */
+ private Map featureSetters;
public void setAnnotationOffsetsRelativeToDocument(Annotation annotation, TextAnnotation a, AnnotationAdderConfiguration configuration) throws CASException, AnnotationOffsetException {
if (configuration.getOffsetMode() == AnnotationAdderAnnotator.OffsetMode.CHARACTER) {
@@ -68,8 +78,10 @@ public void setAnnotationOffsetsRelativeToSentence(Sentence sentence, Annotation
List tokenList = tokensBySentences.get(sentence);
int startTokenNum = a.getStart();
int endTokenNum = a.getEnd();
- if (startTokenNum < 1 || startTokenNum > tokenList.size())
+ if (startTokenNum < 1 || startTokenNum > tokenList.size()) {
+ log.error("Cannot create entity because of a token offset mismatch. The entity should start at token {} and end at {}. But there are only {} tokens available: {}", startTokenNum, endTokenNum, tokenList.size(), tokenList.stream().map(Annotation::getCoveredText).collect(Collectors.joining(" ")));
throw new AnnotationOffsetException("The current annotation to add to the CAS starts at token " + startTokenNum + " which does not fit to the range of tokens in the sentence with ID " + sentence.getId() + " which is 1 - " + tokenList.size());
+ }
if (endTokenNum < 1 || endTokenNum > tokenList.size())
throw new AnnotationOffsetException("The current annotation to add to the CAS ends at token " + endTokenNum + " which does not fit to the range of tokens in the sentence with ID " + sentence.getId() + " which is 1 - " + tokenList.size());
if (endTokenNum < startTokenNum)
@@ -134,4 +146,47 @@ public List createTokenList(JCas jCas, AnnotationAdderConfiguration confi
}
return tokenList;
}
+
+ public void setAnnotationPayloadsToFeatures(Annotation annotation, ExternalTextAnnotation a) {
+ final TypeSystem ts = annotation.getCAS().getTypeSystem();
+ Collection keys = a.getPayloadKeys();
+ if (!keys.isEmpty())
+ featureSetters = new HashMap<>();
+ try {
+ for (String key : keys) {
+ Object value = a.getPayload(key);
+ Method setter = featureSetters.get(key);
+ if (setter == null) {
+ Class> valueClass = convertUimaTypeToJavaType(ts.getType(annotation.getClass().getCanonicalName()).getFeatureByBaseName(key).getRange());
+ setter = annotation.getClass().getMethod("set" + StringUtils.capitalize(key), valueClass);
+ featureSetters.put(key, setter);
+ }
+ // We do this because it is possible a string feature could have values that are actually numbers.
+ // The automatic type detection of some formats will read those as numbers so we might need to
+ // convert here.
+ if (setter.getParameterTypes()[0].equals(String.class))
+ value = String.valueOf(value);
+ setter.invoke(annotation, value);
+ }
+ } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) {
+ e.printStackTrace();
+ }
+ }
+
+ private Class> convertUimaTypeToJavaType(Type type) {
+ switch (type.getName()) {
+ case "uima.cas.String":
+ return String.class;
+ case "uima.cas.Integer":
+ return int.class;
+ case "uima.cas.Double":
+ return double.class;
+ case "uima.cas.Boolean":
+ return boolean.class;
+ case "uima.cas.Long":
+ return long.class;
+ default:
+ throw new IllegalArgumentException("Unsupported type for arbitrary feature-based input columns: " + type);
+ }
+ }
}
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java
index d249cf906..7626dce18 100644
--- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/TextAnnotationListAdder.java
@@ -5,7 +5,6 @@
import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation;
import de.julielab.jcore.types.ext.DBProcessingMetaData;
import de.julielab.jcore.utility.JCoReAnnotationTools;
-import de.julielab.jcore.utility.JCoReTools;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.uima.cas.CASException;
@@ -36,14 +35,18 @@ public boolean addAnnotations(AnnotationData data, AnnotationAdderHelper helper,
String jCasDocTextSha = null;
boolean shaMismatchWasReported = false;
for (ExternalTextAnnotation a : annotationList) {
- String uimaType = a.getUimaType() == null ? configuration.getDefaultUimaType() : a.getUimaType();
- if (uimaType == null)
+ String uimaType;
+ if (a.getUimaType() != null && jCas.getTypeSystem().getType(a.getUimaType()) != null)
+ uimaType = a.getUimaType();
+ else if (configuration.getDefaultUimaType() != null)
+ uimaType = configuration.getDefaultUimaType();
+ else
throw new IllegalArgumentException("Missing annotation type: Neither the annotation of document " + a.getDocumentId() + " with offsets " + a.getStart() + "-" + a.getEnd() + " provides a type nor is the default type set.");
if (jCas.getTypeSystem().getType(uimaType) == null)
throw new IllegalArgumentException("The entity annotation type " + uimaType + " does not exist in the type system.");
try {
// The sha check is supposed to compare the document text on which the annotation was made with the
- // document text the current CAS has. If the differ, the annotations will most likely have
+ // document text the current CAS has. If they differ, the annotations will most likely have
// offset discrepancies which is why they won't be added and a warning will be issued.
final String shaFromAnnotation = (String) a.getPayload("sha");
boolean shaMatches = true;
@@ -59,15 +62,18 @@ public boolean addAnnotations(AnnotationData data, AnnotationAdderHelper helper,
// that the SHA was the same as it was at time of the original entity tagging.
if (a.getStart() >= 0) {
final Annotation annotation = JCoReAnnotationTools.getAnnotationByClassName(jCas, uimaType);
+ if (annotation instanceof de.julielab.jcore.types.Annotation)
+ ((de.julielab.jcore.types.Annotation)annotation).setComponentId(AnnotationAdderAnnotator.class.getSimpleName());
helper.setAnnotationOffsetsRelativeToDocument(annotation, a, configuration);
+ helper.setAnnotationPayloadsToFeatures(annotation, a);
+ log.trace("Adding annotation of type {} with offsets {}-{} to document with ID {}", uimaType, annotation.getBegin(), annotation.getEnd(), annotationList.getDocId());
annotation.addToIndexes();
} else {
log.trace("ExternalAnnotation for document {} has no entity offsets or offsets < 0, not adding anything to the CAS.", a.getDocumentId());
}
} else {
if (!shaMismatchWasReported) {
- final String docId = JCoReTools.getDocId(jCas);
- log.warn("The document with ID '{}' has a differing document text hash from a given annotation. The annotation will not be added to the document. Annotation hash: {}, current document text hash: {}", docId, shaFromAnnotation, jCasDocTextSha);
+ log.warn("The document with ID '{}' has a differing document text hash from a given annotation. The annotation will not be added to the document. Annotation hash: {}, current document text hash: {}", annotationList.getDocId(), shaFromAnnotation, jCasDocTextSha);
shaMismatchWasReported = true;
if (preventProcessedOnDigestMismatch) {
try {
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/AnnotationFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/AnnotationFormat.java
index cb28d7d9f..d6d791256 100644
--- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/AnnotationFormat.java
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/AnnotationFormat.java
@@ -2,6 +2,37 @@
import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationData;
+import java.util.List;
+
public interface AnnotationFormat {
T parse(String data);
+
+ void hasHeader(boolean withHeader);
+
+ String[] getHeader();
+
+ List> getColumnDataTypes();
+
+ void setColumnNames(String[] header);
+
+ int getDocumentIdColumnIndex();
+
+ default Class> determineDataType(String value) {
+ Class> dataType = String.class;
+ try {
+ Integer.parseInt(value);
+ dataType = Integer.class;
+ } catch (NumberFormatException e) {
+ try {
+ Double.parseDouble(value);
+ dataType = Double.class;
+ } catch (NumberFormatException e2) {
+ if (value.equalsIgnoreCase("false") || value.equalsIgnoreCase("true")) {
+ dataType = Boolean.class;
+ }
+ }
+ }
+ return dataType;
+ }
+
}
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/DocumentClassAnnotationFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/DocumentClassAnnotationFormat.java
index 6376e803d..115d8de94 100644
--- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/DocumentClassAnnotationFormat.java
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/DocumentClassAnnotationFormat.java
@@ -2,6 +2,10 @@
import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalDocumentClassAnnotation;
+import java.util.List;
+
+import static de.julielab.jcore.ae.annotationadder.annotationsources.TextAnnotationProvider.COL_DOC_ID;
+
public class DocumentClassAnnotationFormat implements AnnotationFormat {
@Override
public ExternalDocumentClassAnnotation parse(String data) {
@@ -14,7 +18,32 @@ public ExternalDocumentClassAnnotation parse(String data) {
String docId = record[1];
String documentClass = record[2].intern();
String componentId = record[3].intern();
- String type = null;
return new ExternalDocumentClassAnnotation(docId, documentClass, confidence, componentId);
}
+
+ @Override
+ public void hasHeader(boolean withHeader) {
+ // does nothing right now
+ }
+
+ @Override
+ public String[] getHeader() {
+ return new String[]{"confidence", COL_DOC_ID, "documentClass", "componentId"};
+ }
+
+ @Override
+ public List> getColumnDataTypes() {
+ return List.of(Double.class, String.class, String.class, String.class);
+ }
+
+ @Override
+ public void setColumnNames(String[] header) {
+ // does nothing right now
+ }
+
+ @Override
+ public int getDocumentIdColumnIndex() {
+ return 1;
+ }
+
}
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/FeatureBasedTSVFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/FeatureBasedTSVFormat.java
new file mode 100644
index 000000000..1e83dc73d
--- /dev/null
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/FeatureBasedTSVFormat.java
@@ -0,0 +1,104 @@
+package de.julielab.jcore.ae.annotationadder.annotationformat;
+
+import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static de.julielab.jcore.ae.annotationadder.annotationsources.TextAnnotationProvider.COL_UIMA_TYPE;
+
+public class FeatureBasedTSVFormat implements AnnotationFormat {
+ private final static Logger log = LoggerFactory.getLogger(FeatureBasedTSVFormat.class);
+ private String[] header;
+ private boolean withHeader;
+ private Integer uimaTypeIndex;
+ private List> columnDataTypes;
+
+ @Override
+ public ExternalTextAnnotation parse(String data) {
+ if (data == null || data.startsWith("#"))
+ return null;
+ final String[] record = data.split("\t");
+ if (record.length < 3)
+ throw new IllegalArgumentException("Expected at least a 3-column format providing document ID, begin and end offset for the annotation but got " + record.length + " columns: " + data);
+ if (withHeader && header == null) {
+ header = record;
+ return null;
+ }
+ if (columnDataTypes == null)
+ columnDataTypes = new ArrayList<>(header.length);
+ if (uimaTypeIndex == null) {
+ uimaTypeIndex = -1;
+ for (int i = 0; i < header.length; i++) {
+ if (header[i].equals(COL_UIMA_TYPE))
+ uimaTypeIndex = i;
+ }
+ if (uimaTypeIndex == 0)
+ throw new IllegalArgumentException("Found the uima_type column at index 0. However, the first column is reserved for the document ID.");
+ }
+ if (columnDataTypes.isEmpty())
+ determineColumnDataTypes(record);
+ String docId = record[0];
+ String type = uimaTypeIndex >= 0 ? record[uimaTypeIndex] : null;
+ ExternalTextAnnotation externalTextAnnotation = new ExternalTextAnnotation(docId, 0, 0, type);
+ externalTextAnnotation.setPayloadFeatureValues(true);
+ for (int i = 1; i < Math.min(header.length, record.length); i++) {
+ String featureName = header[i];
+ String columnValue = record[i];
+ if (!featureName.equals(COL_UIMA_TYPE))
+ externalTextAnnotation.addPayload(featureName, convertValueToFieldDataType(columnValue, i));
+ }
+
+ return externalTextAnnotation;
+ }
+
+ private Object convertValueToFieldDataType(String columnValue, int columnIndex) {
+ final Class> columnDataType = columnDataTypes.get(columnIndex);
+ if (columnDataType.equals(Integer.class))
+ return Integer.parseInt(columnValue);
+ else if (columnDataType.equals(Double.class))
+ return Double.parseDouble(columnValue);
+ else if (columnDataType.equals(Boolean.class))
+ return Boolean.parseBoolean(columnValue);
+ return columnValue.intern();
+ }
+
+ private void determineColumnDataTypes(String[] record) {
+ for (int i = 0; i < record.length; i++) {
+ String value = record[i];
+ Class> dataType = determineDataType(value);
+ columnDataTypes.add(dataType);
+ }
+ log.info("Identified the data types of columns {} as {}", header, columnDataTypes);
+ }
+
+
+ @Override
+ public void hasHeader(boolean withHeader) {
+ this.withHeader = withHeader;
+ }
+
+ @Override
+ public String[] getHeader() {
+ return header;
+ }
+
+ @Override
+ public List> getColumnDataTypes() {
+ if (columnDataTypes == null)
+ throw new IllegalStateException("The column data types are not yet set. This call must come after the first line of data has been read.");
+ return columnDataTypes;
+ }
+
+ @Override
+ public void setColumnNames(String[] header) {
+ this.header = header;
+ }
+
+ @Override
+ public int getDocumentIdColumnIndex() {
+ return 0;
+ }
+}
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormat.java
index b35e4f26c..a47bc5d55 100644
--- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormat.java
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormat.java
@@ -2,7 +2,17 @@
import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static de.julielab.jcore.ae.annotationadder.annotationsources.TextAnnotationProvider.*;
+
public class SimpleTSVEntityAnnotationFormat implements AnnotationFormat {
+ private String[] header;
+ private boolean withHeader;
+ private List> columnDataTypes;
+
@Override
public ExternalTextAnnotation parse(String data) {
if (data == null || data.startsWith("#"))
@@ -10,12 +20,63 @@ public ExternalTextAnnotation parse(String data) {
final String[] record = data.split("\t");
if (record.length < 3)
throw new IllegalArgumentException("Expected a 3 or 4-column format providing document ID, begin, end and UIMA type (optional if the default type is set to the AnnotationAdderAnnotator) for the annotation but got " + record.length + " columns: " + data);
+ if (withHeader && header == null) {
+ header = record;
+ return null;
+ }
+ boolean columnDataTypesWasNull = columnDataTypes == null;
+ if (columnDataTypesWasNull) {
+ columnDataTypes = Stream.of(String.class, Integer.class, Integer.class).collect(Collectors.toList());
+ }
String docId = record[0];
int begin = Integer.parseInt(record[1]);
int end = Integer.parseInt(record[2]);
String type = null;
- if (record.length > 3)
+ if (record.length > 3) {
type = record[3];
- return new ExternalTextAnnotation(docId, begin, end, type);
+ if (columnDataTypesWasNull)
+ columnDataTypes.add(String.class);
+ }
+ if (header == null && record.length <= 3)
+ header = new String[]{COL_DOC_ID, COL_BEGIN, COL_END, COL_UIMA_TYPE};
+ ExternalTextAnnotation externalTextAnnotation = new ExternalTextAnnotation(docId, begin, end, type);
+ if (record.length > 4) {
+ if (header != null) {
+ for (int i = 4; i < record.length; i++) {
+ externalTextAnnotation.addPayload(header[i], record[i]);
+ if (columnDataTypesWasNull) {
+ columnDataTypes.add(determineDataType(record[i]));
+ }
+ }
+ }
+ }
+ return externalTextAnnotation;
+ }
+
+ @Override
+ public void hasHeader(boolean withHeader) {
+ this.withHeader = withHeader;
+ }
+
+ @Override
+ public String[] getHeader() {
+ return header;
+ }
+
+ @Override
+ public List> getColumnDataTypes() {
+ if (columnDataTypes == null)
+ throw new IllegalStateException("The column data types are not yet set. This call must come after the first line of data has been read.");
+ return columnDataTypes;
+ }
+
+ @Override
+ public void setColumnNames(String[] header) {
+ this.header = header;
+ }
+
+ @Override
+ public int getDocumentIdColumnIndex() {
+ return 0;
}
}
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityWithDocumentTextShaAnnotationFormat.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityWithDocumentTextShaAnnotationFormat.java
index f46893595..39bdf0016 100644
--- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityWithDocumentTextShaAnnotationFormat.java
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityWithDocumentTextShaAnnotationFormat.java
@@ -2,7 +2,12 @@
import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation;
+import java.util.List;
+
+import static de.julielab.jcore.ae.annotationadder.annotationsources.TextAnnotationProvider.*;
+
public class SimpleTSVEntityWithDocumentTextShaAnnotationFormat implements AnnotationFormat {
+ private List> columnDataTypes;
@Override
public ExternalTextAnnotation parse(String data) {
if (data == null || data.startsWith("#"))
@@ -17,8 +22,37 @@ public ExternalTextAnnotation parse(String data) {
String type = null;
if (record.length > 4)
type = record[4].intern();
+ if (columnDataTypes==null)
+ columnDataTypes = List.of(String.class, Integer.class, Integer.class, String.class, String.class);
final ExternalTextAnnotation externalTextAnnotation = new ExternalTextAnnotation(docId, begin, end, type);
externalTextAnnotation.addPayload("sha", sha);
return externalTextAnnotation;
}
+
+ @Override
+ public void hasHeader(boolean withHeader) {
+ // does nothing right now
+ }
+
+ @Override
+ public String[] getHeader() {
+ return new String[]{COL_DOC_ID, COL_BEGIN, COL_END, "sha", COL_UIMA_TYPE};
+ }
+
+ @Override
+ public List> getColumnDataTypes() {
+ if (columnDataTypes == null)
+ throw new IllegalStateException("The column data types are not yet set. This call must come after the first line of data has been read.");
+ return columnDataTypes;
+ }
+
+ @Override
+ public void setColumnNames(String[] header) {
+ // does nothing right now
+ }
+
+ @Override
+ public int getDocumentIdColumnIndex() {
+ return 0;
+ }
}
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/AnnotationList.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/AnnotationList.java
index afa5e074d..44da0c57c 100644
--- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/AnnotationList.java
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/AnnotationList.java
@@ -1,8 +1,34 @@
package de.julielab.jcore.ae.annotationadder.annotationrepresentations;
import java.util.ArrayList;
+import java.util.Collection;
public class AnnotationList extends ArrayList implements AnnotationData {
+ @Override
+ public boolean add(T t) {
+ setDocId(t.getDocumentId());
+ return super.add(t);
+ }
+
+ @Override
+ public void add(int index, T element) {
+ setDocId(element.getDocumentId());
+ super.add(index, element);
+ }
+
+ @Override
+ public boolean addAll(Collection extends T> c) {
+ if (c != null)
+ c.stream().findAny().ifPresent(annotation -> setDocId(annotation.getDocumentId()));
+ return super.addAll(c);
+ }
+
+ @Override
+ public boolean addAll(int index, Collection extends T> c) {
+ if (c != null)
+ c.stream().findAny().ifPresent(annotation -> setDocId(annotation.getDocumentId()));
+ return super.addAll(index, c);
+ }
private String docId;
@@ -11,11 +37,12 @@ public String getDocId() {
}
public void setDocId(String docId) {
+ if (docId != null && this.docId != null && !docId.equals(this.docId))
+ throw new IllegalArgumentException("This annotation list already contains annotations for document with ID " + this.docId + " but the document ID should now be set to " + docId + ".");
this.docId = docId;
}
@Override
-
public String getDocumentId() {
return docId;
}
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/ExternalTextAnnotation.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/ExternalTextAnnotation.java
index bd1408f47..cd43296f0 100644
--- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/ExternalTextAnnotation.java
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationrepresentations/ExternalTextAnnotation.java
@@ -1,13 +1,18 @@
package de.julielab.jcore.ae.annotationadder.annotationrepresentations;
+import java.util.Collection;
+import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
+import static de.julielab.jcore.ae.annotationadder.annotationsources.TextAnnotationProvider.*;
+
public class ExternalTextAnnotation implements TextAnnotation {
private String documentId;
private int start;
private int end;
private String uimaType;
+ private boolean payloadFeatureValues;
private Map payload;
public ExternalTextAnnotation(String documentId, int start, int end, String uimaType) {
@@ -56,7 +61,30 @@ public void addPayload(String key, Object value) {
payload.put(key, value);
}
+ public Map getAllFieldValuesAsMap() {
+ final Map values = new HashMap<>();
+ values.put(COL_BEGIN, start);
+ values.put(COL_END, end);
+ values.put(COL_UIMA_TYPE, uimaType);
+ values.put(COL_DOC_ID, documentId);
+ if (payload != null)
+ values.putAll(payload);
+ return values;
+ }
+
public Object getPayload(String key) {
return payload != null ? payload.get(key) : null;
}
+
+ public Collection getPayloadKeys() {
+ return payload != null ? payload.keySet() : Collections.emptySet();
+ }
+
+ public boolean isPayloadFeatureValues() {
+ return payloadFeatureValues;
+ }
+
+ public void setPayloadFeatureValues(boolean payloadFeatureValues) {
+ this.payloadFeatureValues = payloadFeatureValues;
+ }
}
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/AnnotationSource.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/AnnotationSource.java
index d7a1daad9..5a18be30e 100644
--- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/AnnotationSource.java
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/AnnotationSource.java
@@ -3,7 +3,12 @@
import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationData;
import org.apache.uima.resource.DataResource;
+import java.io.IOException;
+import java.net.URI;
+
public interface AnnotationSource {
- void initialize(DataResource dataResource);
+ void loadAnnotations(URI annotationUri) throws IOException;
+
+ void initialize(DataResource dataResource) throws IOException;
T getAnnotations(String id);
}
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java
deleted file mode 100644
index 4e6ba0a88..000000000
--- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/FileAnnotationSource.java
+++ /dev/null
@@ -1,44 +0,0 @@
-package de.julielab.jcore.ae.annotationadder.annotationsources;
-
-import de.julielab.java.utilities.FileUtilities;
-import de.julielab.jcore.ae.annotationadder.annotationformat.AnnotationFormat;
-import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationData;
-import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationList;
-import org.apache.uima.resource.DataResource;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.IOException;
-import java.util.Map;
-import java.util.stream.Collectors;
-
-public class FileAnnotationSource implements AnnotationSource> {
- private final static Logger log = LoggerFactory.getLogger(FileAnnotationSource.class);
- private AnnotationFormat format;
- private Map> entitiesByDocId;
-
- public FileAnnotationSource(AnnotationFormat format) {
- this.format = format;
- }
-
- public void loadAnnotations(File annotationfile) {
- try (BufferedReader br = FileUtilities.getReaderFromFile(annotationfile)) {
- entitiesByDocId = br.lines().map(format::parse).collect(Collectors.groupingBy(AnnotationData::getDocumentId, Collectors.toCollection(AnnotationList::new)));
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- @Override
- public void initialize(DataResource dataResource) {
- log.info("Loading entity annotations from {}", dataResource.getUri());
- loadAnnotations(new File(dataResource.getUri()));
- }
-
- @Override
- public AnnotationList getAnnotations(String id) {
- return entitiesByDocId.get(id);
- }
-}
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/H2AnnotationSource.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/H2AnnotationSource.java
new file mode 100644
index 000000000..326c7746c
--- /dev/null
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/H2AnnotationSource.java
@@ -0,0 +1,229 @@
+package de.julielab.jcore.ae.annotationadder.annotationsources;
+
+import de.julielab.java.utilities.UriUtilities;
+import de.julielab.jcore.ae.annotationadder.annotationformat.AnnotationFormat;
+import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationData;
+import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationList;
+import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalDocumentClassAnnotation;
+import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.uima.resource.DataResource;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.*;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static de.julielab.jcore.ae.annotationadder.annotationsources.TextAnnotationProvider.*;
+
+public class H2AnnotationSource implements AnnotationSource> {
+ private final static Logger log = LoggerFactory.getLogger(H2AnnotationSource.class);
+ private AnnotationFormat format;
+ private Path h2DbPath;
+ private Statement queryStmt;
+ private Class> annotationDataClass;
+
+ public H2AnnotationSource(AnnotationFormat format) {
+ this.format = format;
+ if (format.getHeader() == null)
+ throw new IllegalArgumentException("To use the H2AnnotationSource, the input format must define the column headers. The employed format " + format + " does not specify them itself. Thus, the header must be specified in the component descriptor external resource definition.");
+ try {
+ Class.forName("org.h2.Driver");
+ } catch (ClassNotFoundException e) {
+ log.error("Could not load the h2 Driver through 'Class.forName(\"org.h2.Driver\")'.");
+ throw new IllegalStateException(e);
+ }
+ }
+
+ @Override
+ public void loadAnnotations(URI annotationUri) throws IOException {
+ final Path annotationFilePath = annotationUri.toString().contains("file:"+File.separator) ? Path.of(annotationUri) : Path.of(annotationUri.toString().replace("file:", ""));
+ h2DbPath = annotationFilePath.isAbsolute() ? Path.of(annotationFilePath + ".h2") : Path.of("."+ File.separator+annotationFilePath+".h2");
+ if (!Files.exists(h2DbPath) || Files.getLastModifiedTime(annotationFilePath).toMillis() < Files.getLastModifiedTime(h2DbPath).toMillis()) {
+ log.info("Source annotation file {} is newer than database file {}. Creating a new database.", annotationFilePath, h2DbPath);
+ Files.list(h2DbPath.getParent()).filter(p -> p.toString().startsWith(h2DbPath.toString())).forEach(p -> FileUtils.deleteQuietly(p.toFile()));
+ try (Connection conn = DriverManager.
+ getConnection("jdbc:h2:" + h2DbPath, "sa", "")) {
+ conn.setAutoCommit(false);
+ PreparedStatement ps = null;
+ Map columnIndexes = new HashMap<>();
+ try (BufferedReader br = UriUtilities.getReaderFromUri(annotationUri)) {
+ final Iterator iterator = br.lines().map(format::parse).filter(Objects::nonNull).iterator();
+ boolean firstDataItem = true;
+ int psSize = 0;
+ int linesRead = 0;
+ while (iterator.hasNext()) {
+ ++linesRead;
+ T annotationData = iterator.next();
+ // We need to create the table after the retrieval of the first annotation item because the
+ // format parser derives the data types from the data
+ if (firstDataItem) {
+ for (int i = 0; i < format.getHeader().length; i++) {
+ if (format.getHeader()[i].equals("begin"))
+ format.getHeader()[i] = COL_BEGIN;
+ else if (format.getHeader()[i].equals("end"))
+ format.getHeader()[i] = COL_END;
+ }
+ IntStream.range(0, format.getHeader().length).forEach(i -> columnIndexes.put(format.getHeader()[i], i));
+ annotationDataClass = annotationData.getClass();
+ createAnnotationTable(conn, annotationData);
+ String insertionSql = "INSERT INTO annotations VALUES (" + IntStream.range(0, format.getHeader().length).mapToObj(i -> "?").collect(Collectors.joining(",")) + ")";
+ ps = conn.prepareStatement(insertionSql);
+ firstDataItem = false;
+ }
+ if (annotationData instanceof ExternalDocumentClassAnnotation)
+ throw new NotImplementedException("ExternalDocumentClassAnnotation data is currently not supported by the H2AnnotationSource.");
+ ExternalTextAnnotation textAnnotation = (ExternalTextAnnotation) annotationData;
+ final Map fieldValues = textAnnotation.getAllFieldValuesAsMap();
+ for (String columnName : format.getHeader()) {
+ ps.setObject(columnIndexes.get(columnName) + 1, fieldValues.get(columnName));
+ }
+ ps.addBatch();
+ ++psSize;
+ if (psSize % 50 == 0) {
+ ps.executeBatch();
+ }
+ if (psSize % 10000 == 0 && log.isTraceEnabled()) {
+ int numRows = getCount(conn, "SELECT count(*) FROM annotations");
+ int numDocIds = getCount(conn, "SELECT count(DISTINCT docId) FROM annotations");
+ log.trace("Loaded {} entity annotations for {} document IDs.", numRows, numDocIds);
+ }
+ if (linesRead % 10000 == 0 && log.isTraceEnabled()) {
+ log.trace("Read {} lines from input {}", linesRead, annotationUri);
+ }
+ }
+ if (psSize > 0)
+ ps.executeBatch();
+ }
+ if (log.isTraceEnabled()) {
+ int numRows = getCount(conn, "SELECT count(*) FROM annotations");
+ int numDocIds = getCount(conn, "SELECT count(DISTINCT docId) FROM annotations");
+ log.trace("Loaded {} entity annotations for {} document IDs.", numRows, numDocIds);
+ }
+ conn.commit();
+ } catch (SQLException e) {
+ log.error("Could not create H2 database at {}", h2DbPath);
+ throw new IllegalStateException(e);
+ }
+ }
+ }
+
+ private int getCount(Connection conn, String sql) {
+ try {
+ final ResultSet rs = conn.createStatement().executeQuery(sql);
+ if (rs.next())
+ return rs.getInt(1);
+ } catch (SQLException e) {
+ log.error("Could not count rows via SQL query {}", sql, e);
+ throw new IllegalStateException(e);
+ }
+ return 0;
+ }
+
+ private void createAnnotationTable(Connection conn, T annotationData) throws SQLException {
+ final Statement stmt = conn.createStatement();
+ String tableCreationSql = getTableCreationSql(format.getHeader(), format.getColumnDataTypes(), annotationData);
+ try {
+ stmt.execute(tableCreationSql);
+ } catch (SQLException e) {
+ log.error("Could not create the annotation SQL table with command {}", tableCreationSql, e);
+ throw new IllegalStateException(e);
+ }
+ final String indexCreationSql = "CREATE INDEX annotations_doc_id_idx ON annotations (" + format.getHeader()[format.getDocumentIdColumnIndex()] + ")";
+ try {
+ stmt.execute(indexCreationSql);
+ } catch (SQLException e) {
+ log.error("Could not create index on document ID column which should be found at index {} of the header {} with SQL {}.", format.getDocumentIdColumnIndex(), format.getHeader(), indexCreationSql, e);
+ throw new IllegalStateException(e);
+ }
+ }
+
+ private String getTableCreationSql(String[] header, List<Class<?>> columnDataTypes, T annotationData) {
+ StringBuilder sb = new StringBuilder();
+ sb.append("CREATE TABLE annotations (");
+ for (int i = 0; i < header.length; i++) {
+ String columnName = header[i];
+ Class<?> dataType = columnDataTypes.get(i);
+ String dbDataType = getDbDataType(dataType);
+ sb.append(columnName).append(" ").append(dbDataType);
+ if (i < header.length - 1)
+ sb.append(",");
+ }
+ sb.append(")");
+ return sb.toString();
+ }
+
+ private String getDbDataType(Class<?> dataType) {
+ if (dataType.equals(Integer.class))
+ return "INT";
+ else if (dataType.equals(Double.class))
+ return "DOUBLE";
+ else if (dataType.equals(Boolean.class))
+ return "BOOL";
+ return "VARCHAR";
+ }
+
+ @Override
+ public void initialize(DataResource dataResource) throws IOException {
+ log.info("Loading entity annotations from {}", dataResource.getUri());
+ loadAnnotations(dataResource.getUri());
+ }
+
+ @Override
+ public AnnotationList getAnnotations(String id) {
+ try {
+ if (queryStmt == null) {
+ Connection queryConn = DriverManager.
+ getConnection("jdbc:h2:" + h2DbPath, "sa", "");
+ queryStmt = queryConn.createStatement();
+ }
+ } catch (SQLException e) {
+ log.error("Could not connect to database at {}", h2DbPath, e);
+ throw new IllegalStateException(e);
+ }
+ final String sql = "SELECT * FROM annotations WHERE docId='" + id + "'";
+ try {
+ final ResultSet rs = queryStmt.executeQuery(sql);
+ final AnnotationList annotationList = new AnnotationList<>();
+ while (rs.next()) {
+ T textAnnotation = null;
+ if (annotationDataClass == null)
+ throw new IllegalStateException("The annotation data class should have been recorded when data was read from file but it is null.");
+ try {
+ if (annotationDataClass.equals(ExternalTextAnnotation.class))
+ textAnnotation = (T) annotationDataClass.getConstructor(String.class, int.class, int.class, String.class).newInstance(rs.getString(COL_DOC_ID), rs.getInt(COL_BEGIN), rs.getInt(COL_END), rs.getString(COL_UIMA_TYPE));
+ else
+ throw new NotImplementedException("The annotation class " + annotationDataClass + " is currently not supported by the H2AnnotationSource.");
+ } catch (Exception e) {
+ log.error("Could not create instance of annotation data class {}", annotationDataClass, e);
+ }
+ for (String columnName : format.getHeader()) {
+ final Object value = rs.getObject(columnName);
+ if (value != null && textAnnotation instanceof ExternalTextAnnotation && !columnName.equals(COL_UIMA_TYPE) && !columnName.equals(COL_DOC_ID)) {
+ ExternalTextAnnotation a = (ExternalTextAnnotation) textAnnotation;
+ String payLoadKey = columnName;
+ if(payLoadKey.equals(COL_BEGIN))
+ payLoadKey = "begin";
+ else if (payLoadKey.equals(COL_END))
+ payLoadKey = "end";
+ a.addPayload(payLoadKey, value);
+ }
+ }
+ annotationList.add(textAnnotation);
+ }
+ return annotationList;
+ } catch (SQLException e) {
+ log.error("Could not retrieve annotation values from the H2 database via SQL query '{}'", sql, e);
+ throw new IllegalStateException(e);
+ }
+ }
+}
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/H2TextAnnotationProvider.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/H2TextAnnotationProvider.java
new file mode 100644
index 000000000..a70c3af5f
--- /dev/null
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/H2TextAnnotationProvider.java
@@ -0,0 +1,17 @@
+package de.julielab.jcore.ae.annotationadder.annotationsources;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class H2TextAnnotationProvider extends TextAnnotationProvider {
+ private final static Logger log = LoggerFactory.getLogger(H2TextAnnotationProvider.class);
+ @Override
+ void initializeAnnotationSource() {
+ annotationSource = new H2AnnotationSource<>(format);
+ }
+
+ @Override
+ Logger getLogger() {
+ return log;
+ }
+}
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryAnnotationSource.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryAnnotationSource.java
new file mode 100644
index 000000000..f82929792
--- /dev/null
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryAnnotationSource.java
@@ -0,0 +1,46 @@
+package de.julielab.jcore.ae.annotationadder.annotationsources;
+
+import de.julielab.java.utilities.UriUtilities;
+import de.julielab.jcore.ae.annotationadder.annotationformat.AnnotationFormat;
+import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationData;
+import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationList;
+import org.apache.uima.resource.DataResource;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.net.URI;
+import java.util.Map;
+import java.util.Objects;
+import java.util.stream.Collectors;
+
+public class InMemoryAnnotationSource implements AnnotationSource> {
+ private final static Logger log = LoggerFactory.getLogger(InMemoryAnnotationSource.class);
+ private AnnotationFormat format;
+ private Map> entitiesByDocId;
+
+ public InMemoryAnnotationSource(AnnotationFormat format) {
+ this.format = format;
+ }
+
+ @Override
+ public void loadAnnotations(URI annotationUri) throws IOException {
+ try (BufferedReader br = UriUtilities.getReaderFromUri(annotationUri)) {
+ entitiesByDocId = br.lines().map(format::parse).filter(Objects::nonNull).collect(Collectors.groupingBy(AnnotationData::getDocumentId, Collectors.toCollection(AnnotationList::new)));
+ }
+ if (log.isTraceEnabled())
+ log.trace("Loaded {} entity annotations for {} document IDs.", entitiesByDocId.values().stream().flatMap(AnnotationList::stream).count(), entitiesByDocId.size());
+ }
+
+ @Override
+ public void initialize(DataResource dataResource) throws IOException {
+ log.info("Loading entity annotations from {}", dataResource.getUri());
+ loadAnnotations(dataResource.getUri());
+ }
+
+ @Override
+ public AnnotationList getAnnotations(String id) {
+ return entitiesByDocId.get(id);
+ }
+}
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileDocumentClassAnnotationProvider.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileDocumentClassAnnotationProvider.java
index ab95d5759..69e91f14a 100644
--- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileDocumentClassAnnotationProvider.java
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileDocumentClassAnnotationProvider.java
@@ -6,6 +6,8 @@
import org.apache.uima.resource.DataResource;
import org.apache.uima.resource.ResourceInitializationException;
+import java.io.IOException;
+
public class InMemoryFileDocumentClassAnnotationProvider implements AnnotationProvider {
private AnnotationSource> annotationSource;
@@ -17,8 +19,12 @@ public AnnotationList getAnnotations(String id)
@Override
public void load(DataResource dataResource) throws ResourceInitializationException {
// This logic could be made configurable if required so in the future.
- annotationSource = new FileAnnotationSource(new DocumentClassAnnotationFormat());
- annotationSource.initialize(dataResource);
+ annotationSource = new InMemoryAnnotationSource(new DocumentClassAnnotationFormat());
+ try {
+ annotationSource.initialize(dataResource);
+ } catch (IOException e) {
+ throw new ResourceInitializationException(e);
+ }
}
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java
index 6de11f4d3..950069570 100644
--- a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/InMemoryFileTextAnnotationProvider.java
@@ -1,42 +1,17 @@
package de.julielab.jcore.ae.annotationadder.annotationsources;
-import de.julielab.jcore.ae.annotationadder.annotationformat.AnnotationFormat;
-import de.julielab.jcore.ae.annotationadder.annotationformat.SimpleTSVEntityAnnotationFormat;
-import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationList;
-import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation;
-import org.apache.uima.resource.DataResource;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.resource.metadata.ConfigurationParameterSettings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.lang.reflect.InvocationTargetException;
-import java.util.Optional;
-
-public class InMemoryFileTextAnnotationProvider implements AnnotationProvider {
- public static final String PARAM_ANNOTATION_FORMAT = "AnnotationFormatClass";
+public class InMemoryFileTextAnnotationProvider extends TextAnnotationProvider {
private final static Logger log = LoggerFactory.getLogger(InMemoryFileTextAnnotationProvider.class);
- private AnnotationSource annotationSource;
-
@Override
- public AnnotationList getAnnotations(String id) {
- return annotationSource.getAnnotations(id);
+ void initializeAnnotationSource() {
+ annotationSource = new InMemoryAnnotationSource<>(format);
}
@Override
- public void load(DataResource dataResource) throws ResourceInitializationException {
- final ConfigurationParameterSettings parameterSettings = dataResource.getMetaData().getConfigurationParameterSettings();
- final String formatClassName = (String) Optional.ofNullable(parameterSettings.getParameterValue(PARAM_ANNOTATION_FORMAT)).orElse(SimpleTSVEntityAnnotationFormat.class.getCanonicalName());
- AnnotationFormat format;
- try {
- format = (AnnotationFormat) Class.forName(formatClassName).getDeclaredConstructor().newInstance();
- } catch (NoSuchMethodException | InvocationTargetException | InstantiationException | IllegalAccessException | ClassNotFoundException e) {
- log.error("Could not instantiate class {}", formatClassName);
- throw new ResourceInitializationException(e);
- }
- annotationSource = new FileAnnotationSource(format);
- annotationSource.initialize(dataResource);
+ Logger getLogger() {
+ return log;
}
-
-
}
diff --git a/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/TextAnnotationProvider.java b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/TextAnnotationProvider.java
new file mode 100644
index 000000000..007ac0bae
--- /dev/null
+++ b/jcore-annotation-adder-ae/src/main/java/de/julielab/jcore/ae/annotationadder/annotationsources/TextAnnotationProvider.java
@@ -0,0 +1,58 @@
+package de.julielab.jcore.ae.annotationadder.annotationsources;
+
+import de.julielab.jcore.ae.annotationadder.annotationformat.AnnotationFormat;
+import de.julielab.jcore.ae.annotationadder.annotationformat.SimpleTSVEntityAnnotationFormat;
+import de.julielab.jcore.ae.annotationadder.annotationrepresentations.AnnotationList;
+import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation;
+import org.apache.uima.resource.DataResource;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.metadata.ConfigurationParameterSettings;
+import org.slf4j.Logger;
+
+import java.io.IOException;
+import java.lang.reflect.InvocationTargetException;
+import java.util.Optional;
+
+public abstract class TextAnnotationProvider implements AnnotationProvider {
+ public static final String PARAM_ANNOTATION_FORMAT = "AnnotationFormatClass";
+ public static final String PARAM_INPUT_HAS_HEADER = "InputHasHeader";
+ public static final String PARAM_COLUMN_NAMES = "ColumnNames";
+ public static final String COL_DOC_ID = "docId";
+ public static final String COL_BEGIN = "beginOffset";
+ public static final String COL_END = "endOffset";
+ public static final String COL_UIMA_TYPE = "uimaType";
+ protected Logger log;
+ protected AnnotationSource> annotationSource;
+ protected AnnotationFormat format;
+
+ @Override
+ public AnnotationList getAnnotations(String id) {
+ return annotationSource.getAnnotations(id);
+ }
+
+ abstract void initializeAnnotationSource();
+
+ abstract Logger getLogger();
+
+ @Override
+ public void load(DataResource dataResource) throws ResourceInitializationException {
+ final ConfigurationParameterSettings parameterSettings = dataResource.getMetaData().getConfigurationParameterSettings();
+ final String formatClassName = (String) Optional.ofNullable(parameterSettings.getParameterValue(PARAM_ANNOTATION_FORMAT)).orElse(SimpleTSVEntityAnnotationFormat.class.getCanonicalName());
+ final boolean hasHeader = (boolean) Optional.ofNullable(parameterSettings.getParameterValue(PARAM_INPUT_HAS_HEADER)).orElse(false);
+ final String[] columnNames = (String[])parameterSettings.getParameterValue(PARAM_COLUMN_NAMES);
+ try {
+ format = (AnnotationFormat) Class.forName(formatClassName).getDeclaredConstructor().newInstance();
+ format.hasHeader(hasHeader);
+ format.setColumnNames(columnNames);
+ } catch (NoSuchMethodException | InvocationTargetException | InstantiationException | IllegalAccessException | ClassNotFoundException e) {
+ getLogger().error("Could not instantiate class {}", formatClassName);
+ throw new ResourceInitializationException(e);
+ }
+ initializeAnnotationSource();
+ try {
+ annotationSource.initialize(dataResource);
+ } catch (IOException e) {
+ throw new ResourceInitializationException(e);
+ }
+ }
+}
diff --git a/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml b/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml
index fcd2c1d27..585e4eeb7 100644
--- a/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml
+++ b/jcore-annotation-adder-ae/src/main/resources/de/julielab/jcore/ae/annotationadder/desc/jcore-annotation-adder-ae.xml
@@ -6,7 +6,7 @@
JCoRe Annotation AdderThis component helps to import annotations made on the exact CAS document text by an external process back into the CAS. To this end, the component is prepared to read several data formats. Currently, simple offset-based annotations are supported with configurable UIMA types. The component supports character and token based offsets.
- 2.5.1-SNAPSHOT
+ 2.6.0OffsetMode
@@ -79,6 +79,20 @@
falsefalse
+
+ InputHasHeader
+ Indicates whether the input TSV file has a header line.
+ Boolean
+ false
+ false
+
+
+ ColumnNames
+ For column formats without a header. Required when the columns should be mapped to annotation type features. Then, the headers must correspond to the feature names and are case sensitive. When specified, the number of elements for this parameter must equal the number of columns in the input file. Then, the i-th parameter value will be set as the name of the i-th column.
+ String
+ true
+ false
+
diff --git a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java
index 65c0de306..d0be14929 100644
--- a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java
+++ b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderAnnotatorTest.java
@@ -1,9 +1,12 @@
package de.julielab.jcore.ae.annotationadder;
+import de.julielab.jcore.ae.annotationadder.annotationsources.H2TextAnnotationProvider;
import de.julielab.jcore.ae.annotationadder.annotationsources.InMemoryFileDocumentClassAnnotationProvider;
import de.julielab.jcore.ae.annotationadder.annotationsources.InMemoryFileTextAnnotationProvider;
+import de.julielab.jcore.ae.annotationadder.annotationsources.TextAnnotationProvider;
import de.julielab.jcore.types.*;
+import org.apache.commons.io.FileUtils;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
@@ -12,10 +15,13 @@
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ExternalResourceDescription;
+import org.apache.uima.resource.SharedResourceObject;
import org.assertj.core.data.Offset;
-import org.junit.Test;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Test;
import java.io.File;
+import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
@@ -25,10 +31,27 @@
*
*/
public class AnnotationAdderAnnotatorTest{
+
+ @AfterEach
+ public void cleanup() {
+ Path h2DbPath = Path.of("src", "test", "resources", "geneannotations_character_offsets.tsv.h2.mv.db");
+ FileUtils.deleteQuietly(h2DbPath.toFile());
+ }
+
+ @Test
+ public void testCharacterOffsetsInMemory() throws Exception {
+ testCharacterOffsets(InMemoryFileTextAnnotationProvider.class);
+ }
+
@Test
- public void testCharacterOffsets() throws Exception {
+ public void testCharacterOffsetsH2DB() throws Exception {
+ testCharacterOffsets(H2TextAnnotationProvider.class);
+ }
+
+
+ public void testCharacterOffsets(Class<? extends SharedResourceObject> annotationProviderClass) throws Exception {
final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types");
- final ExternalResourceDescription externalResourceDescription = ExternalResourceFactory.createExternalResourceDescription(InMemoryFileTextAnnotationProvider.class, new File("src/test/resources/geneannotations_character_offsets.tsv"));
+ final ExternalResourceDescription externalResourceDescription = ExternalResourceFactory.createExternalResourceDescription(annotationProviderClass, new File("src/test/resources/geneannotations_character_offsets.tsv"), TextAnnotationProvider.PARAM_COLUMN_NAMES, new String[]{"docId", "begin", "end", "uimaType", "confidence", "specificType"});
final AnalysisEngine engine = AnalysisEngineFactory.createEngine(AnnotationAdderAnnotator.class, AnnotationAdderAnnotator.KEY_ANNOTATION_SOURCE, externalResourceDescription);
// Test doc1 (two gene annotations)
jCas.setDocumentText("BRCA PRKII are the genes of this sentence.");
@@ -47,7 +70,100 @@ public void testCharacterOffsets() throws Exception {
assertThat(genes.get(1).getBegin()).isEqualTo(5);
assertThat(genes.get(1).getEnd()).isEqualTo(10);
- // Test doc2 (no gene annotations)
+ // Test doc2 (no gene annotations, there will be a warning on DEBUG level)
+ jCas.reset();
+ jCas.setDocumentText("There are no gene mentions in here");
+ Header h2 = new Header(jCas);
+ h2.setDocId("doc2");
+ h2.addToIndexes();
+ engine.process(jCas);
+ assertThat(JCasUtil.exists(jCas, Gene.class)).isFalse();
+
+ // Test doc3 (one gene annotation)
+ jCas.reset();
+ jCas.setDocumentText("PRKAVI does not exist, I think. But this is just a test so it doesn't matter.");
+ Header h3 = new Header(jCas);
+ h3.setDocId("doc3");
+ h3.addToIndexes();
+ engine.process(jCas);
+ final Gene gene = JCasUtil.selectSingle(jCas, Gene.class);
+ assertThat(gene.getBegin()).isEqualTo(0);
+ assertThat(gene.getEnd()).isEqualTo(6);
+ }
+
+ @Test
+ public void testPayload() throws Exception {
+ final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types");
+ final ExternalResourceDescription externalResourceDescription = ExternalResourceFactory.createExternalResourceDescription(InMemoryFileTextAnnotationProvider.class, new File("src/test/resources/geneannotations_character_offsets_payload.tsv"), InMemoryFileTextAnnotationProvider.PARAM_INPUT_HAS_HEADER, true);
+ final AnalysisEngine engine = AnalysisEngineFactory.createEngine(AnnotationAdderAnnotator.class, AnnotationAdderAnnotator.KEY_ANNOTATION_SOURCE, externalResourceDescription);
+ // Test doc1 (two gene annotations)
+ jCas.setDocumentText("BRCA PRKII are the genes of this sentence.");
+ final Header h = new Header(jCas);
+ h.setDocId("doc1");
+ h.addToIndexes();
+
+ engine.process(jCas);
+
+ final List genes = new ArrayList<>(JCasUtil.select(jCas, Gene.class));
+ assertThat(genes).hasSize(2);
+
+ assertThat(genes.get(0).getBegin()).isEqualTo(0);
+ assertThat(genes.get(0).getEnd()).isEqualTo(4);
+ assertThat(genes.get(0).getSpecificType()).isEqualTo("protein");
+
+ assertThat(genes.get(1).getBegin()).isEqualTo(5);
+ assertThat(genes.get(1).getEnd()).isEqualTo(10);
+ assertThat(genes.get(1).getSpecificType()).isEqualTo("dna");
+
+ // Test doc2 (no gene annotations, there will be a warning on DEBUG level)
+ jCas.reset();
+ jCas.setDocumentText("There are no gene mentions in here");
+ Header h2 = new Header(jCas);
+ h2.setDocId("doc2");
+ h2.addToIndexes();
+ engine.process(jCas);
+ assertThat(JCasUtil.exists(jCas, Gene.class)).isFalse();
+
+ // Test doc3 (one gene annotation)
+ jCas.reset();
+ jCas.setDocumentText("PRKAVI does not exist, I think. But this is just a test so it doesn't matter.");
+ Header h3 = new Header(jCas);
+ h3.setDocId("doc3");
+ h3.addToIndexes();
+ engine.process(jCas);
+ final Gene gene = JCasUtil.selectSingle(jCas, Gene.class);
+ assertThat(gene.getBegin()).isEqualTo(0);
+ assertThat(gene.getEnd()).isEqualTo(6);
+ assertThat(gene.getComponentId()).isEqualTo("GoldData");
+ }
+
+ @Test
+ public void testHeaderParameter() throws Exception {
+ final JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.jcore-document-meta-types");
+ final ExternalResourceDescription externalResourceDescription = ExternalResourceFactory.createExternalResourceDescription(InMemoryFileTextAnnotationProvider.class, new File("src/test/resources/geneannotations_character_offsets.tsv"), InMemoryFileTextAnnotationProvider.PARAM_COLUMN_NAMES, new String[]{"docId", "begin", "end", "uimaType", "specificType", "componentId"});
+ final AnalysisEngine engine = AnalysisEngineFactory.createEngine(AnnotationAdderAnnotator.class, AnnotationAdderAnnotator.KEY_ANNOTATION_SOURCE, externalResourceDescription);
+ // Test doc1 (two gene annotations)
+ jCas.setDocumentText("BRCA PRKII are the genes of this sentence.");
+ final Header h = new Header(jCas);
+ h.setDocId("doc1");
+ h.addToIndexes();
+
+ engine.process(jCas);
+
+ final List genes = new ArrayList<>(JCasUtil.select(jCas, Gene.class));
+ assertThat(genes).hasSize(2);
+
+ assertThat(genes.get(0).getBegin()).isEqualTo(0);
+ assertThat(genes.get(0).getEnd()).isEqualTo(4);
+ assertThat(genes.get(0).getSpecificType()).isEqualTo("0.1234");
+ assertThat(genes.get(0).getComponentId()).isEqualTo("additionalColumn2");
+
+ assertThat(genes.get(1).getBegin()).isEqualTo(5);
+ assertThat(genes.get(1).getEnd()).isEqualTo(10);
+ assertThat(genes.get(1).getSpecificType()).isEqualTo("0.1234");
+ assertThat(genes.get(1).getComponentId()).isEqualTo("additionalColumn2");
+
+ // Test doc2 (no gene annotations, there will be a warning on DEBUG level)
jCas.reset();
jCas.setDocumentText("There are no gene mentions in here");
Header h2 = new Header(jCas);
@@ -66,6 +182,8 @@ public void testCharacterOffsets() throws Exception {
final Gene gene = JCasUtil.selectSingle(jCas, Gene.class);
assertThat(gene.getBegin()).isEqualTo(0);
assertThat(gene.getEnd()).isEqualTo(6);
+ assertThat(gene.getSpecificType()).isEqualTo("0.1234");
+ assertThat(gene.getComponentId()).isEqualTo("additionalColumn2");
}
@Test
diff --git a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelperTest.java b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelperTest.java
new file mode 100644
index 000000000..bcb96ec08
--- /dev/null
+++ b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/AnnotationAdderHelperTest.java
@@ -0,0 +1,24 @@
+package de.julielab.jcore.ae.annotationadder;
+
+import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation;
+import de.julielab.jcore.types.Gene;
+import org.apache.uima.UIMAException;
+import org.apache.uima.fit.factory.JCasFactory;
+import org.apache.uima.jcas.JCas;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+class AnnotationAdderHelperTest {
+
+ @Test
+ void setAnnotationPayloadsToFeatures() throws UIMAException {
+ JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-biology-types");
+ Gene gene = new Gene(jCas);
+ ExternalTextAnnotation extAnnotation = new ExternalTextAnnotation("1", 0, 1, "dummy");
+ extAnnotation.addPayload("specificType", "protein");
+ AnnotationAdderHelper helper = new AnnotationAdderHelper();
+ helper.setAnnotationPayloadsToFeatures(gene, extAnnotation);
+ assertEquals("protein", gene.getSpecificType());
+ }
+}
\ No newline at end of file
diff --git a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/FeatureBasedTSVFormatTest.java b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/FeatureBasedTSVFormatTest.java
new file mode 100644
index 000000000..74e086220
--- /dev/null
+++ b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/FeatureBasedTSVFormatTest.java
@@ -0,0 +1,27 @@
+package de.julielab.jcore.ae.annotationadder.annotationformat;
+
+import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+class FeatureBasedTSVFormatTest {
+
+ @Test
+ void parse() {
+ FeatureBasedTSVFormat format = new FeatureBasedTSVFormat();
+ format.hasHeader(true);
+ // should be ignored
+ assertNull(format.parse("# comment"));
+ // should be stored as header but not return something
+ assertNull(format.parse("docId\tbegin\tend\tcomponentId\tuimaType\tspecificType"));
+ ExternalTextAnnotation extAnnotation = format.parse("123\t0\t5\tGoldAnnotation\tde.julielab.jcore.types.Gene\tprotein");
+ assertEquals("123", extAnnotation.getDocumentId());
+ assertEquals(0, extAnnotation.getStart());
+ assertEquals(5, extAnnotation.getEnd());
+ assertEquals("de.julielab.jcore.types.Gene", extAnnotation.getUimaType());
+ assertEquals("protein", extAnnotation.getPayload("specificType"));
+ assertEquals("GoldAnnotation", extAnnotation.getPayload("componentId"));
+ }
+}
\ No newline at end of file
diff --git a/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormatTest.java b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormatTest.java
new file mode 100644
index 000000000..848526c03
--- /dev/null
+++ b/jcore-annotation-adder-ae/src/test/java/de/julielab/jcore/ae/annotationadder/annotationformat/SimpleTSVEntityAnnotationFormatTest.java
@@ -0,0 +1,27 @@
+package de.julielab.jcore.ae.annotationadder.annotationformat;
+
+import de.julielab.jcore.ae.annotationadder.annotationrepresentations.ExternalTextAnnotation;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+class SimpleTSVEntityAnnotationFormatTest {
+
+ @Test
+ void parse() {
+ SimpleTSVEntityAnnotationFormat format = new SimpleTSVEntityAnnotationFormat();
+ format.hasHeader(true);
+ // should be ignored
+ assertNull(format.parse("# comment"));
+ // should be stored as header but not return something
+ assertNull(format.parse("docId\tbegin\tend\ttype\tspecificType\tcomponentId"));
+ ExternalTextAnnotation extAnnotation = format.parse("123\t0\t5\tde.julielab.jcore.types.Gene\tprotein\tGoldAnnotation");
+ assertEquals("123", extAnnotation.getDocumentId());
+ assertEquals(0, extAnnotation.getStart());
+ assertEquals(5, extAnnotation.getEnd());
+ assertEquals("de.julielab.jcore.types.Gene", extAnnotation.getUimaType());
+ assertEquals("protein", extAnnotation.getPayload("specificType"));
+ assertEquals("GoldAnnotation", extAnnotation.getPayload("componentId"));
+ }
+}
\ No newline at end of file
diff --git a/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets.tsv b/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets.tsv
index a3b4799ab..1f1f04a44 100644
--- a/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets.tsv
+++ b/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets.tsv
@@ -1,3 +1,3 @@
-doc1 0 4 de.julielab.jcore.types.Gene
-doc1 5 10 de.julielab.jcore.types.Gene
-doc3 0 6 de.julielab.jcore.types.Gene
\ No newline at end of file
+doc1 0 4 de.julielab.jcore.types.Gene 0.1234 additionalColumn2
+doc1 5 10 de.julielab.jcore.types.Gene 0.1234 additionalColumn2
+doc3 0 6 de.julielab.jcore.types.Gene 0.1234 additionalColumn2
\ No newline at end of file
diff --git a/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets_payload.tsv b/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets_payload.tsv
new file mode 100644
index 000000000..7606678d6
--- /dev/null
+++ b/jcore-annotation-adder-ae/src/test/resources/geneannotations_character_offsets_payload.tsv
@@ -0,0 +1,4 @@
+docId begin end uimaType specificType componentId
+doc1 0 4 de.julielab.jcore.types.Gene protein GoldData
+doc1 5 10 de.julielab.jcore.types.Gene dna GoldData
+doc3 0 6 de.julielab.jcore.types.Gene gene GoldData
\ No newline at end of file
diff --git a/jcore-annotation-removal-ae/LICENSE b/jcore-annotation-removal-ae/LICENSE
new file mode 100644
index 000000000..fbbd41e05
--- /dev/null
+++ b/jcore-annotation-removal-ae/LICENSE
@@ -0,0 +1,26 @@
+BSD 2-Clause License
+
+Copyright (c) 2017, JULIE Lab
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/jcore-annotation-removal-ae/README.md b/jcore-annotation-removal-ae/README.md
new file mode 100644
index 000000000..563b7ad3e
--- /dev/null
+++ b/jcore-annotation-removal-ae/README.md
@@ -0,0 +1,34 @@
+# JCoRe Annotation Removal AE
+
+**Descriptor Path**:
+```
+de.julielab.jcore.ae.annotationremoval.desc.jcore-annotation-removal-ae
+```
+
+Removes annotations from the CAS that belong to one of the types specified as a parameter value in the descriptor.
+
+
+
+**1. Parameters**
+
+| Parameter Name | Parameter Type | Mandatory | Multivalued | Description |
+|----------------|----------------|-----------|-------------|-------------|
+| AnnotationTypes | String | true | true | List of qualified UIMA type names for which all annotations should be removed from each CAS. |
+
+
+**2. Predefined Settings**
+
+| Parameter Name | Parameter Syntax | Example |
+|----------------|------------------|---------|
+| AnnotationTypes | Fully qualified UIMA type names | `de.julielab.jcore.types.Token` |
+
+
+**3. Capabilities**
+
+| Type | Input | Output |
+|------|:-----:|:------:|
+| de.julielab.jcore.types.TYPE | | `+` |
+| de.julielab.jcore.types.ace.TYPE | `+` | |
+
+
+[1] Some Literature?
diff --git a/jcore-annotation-removal-ae/component.meta b/jcore-annotation-removal-ae/component.meta
new file mode 100644
index 000000000..04e9d8c1e
--- /dev/null
+++ b/jcore-annotation-removal-ae/component.meta
@@ -0,0 +1,20 @@
+{
+ "categories": [
+ "ae"
+ ],
+ "description": "Removes annotations from the CAS that belong to one of the types specified as a parameter value in the descriptor.",
+ "descriptors": [
+ {
+ "category": "ae",
+ "location": "de.julielab.jcore.ae.annotationremoval.desc.jcore-annotation-removal-ae"
+ }
+ ],
+ "exposable": true,
+ "group": "general",
+ "maven-artifact": {
+ "artifactId": "jcore-annotation-removal-ae",
+ "groupId": "de.julielab",
+ "version": "2.6.0"
+ },
+ "name": "JCoRe Annotation Removal AE"
+}
diff --git a/jcore-annotation-removal-ae/pom.xml b/jcore-annotation-removal-ae/pom.xml
new file mode 100644
index 000000000..e434a54b2
--- /dev/null
+++ b/jcore-annotation-removal-ae/pom.xml
@@ -0,0 +1,55 @@
+
+
+
+ 4.0.0
+ jcore-annotation-removal-ae
+ jar
+ de.julielab
+
+
+ de.julielab
+ jcore-base
+ 2.6.0
+
+
+
+
+
+
+ ch.qos.logback
+ logback-classic
+ test
+
+
+ org.slf4j
+ slf4j-api
+
+
+ de.julielab
+ jcore-types
+ ${jcore-types-version}
+
+
+ de.julielab
+ jcore-descriptor-creator
+
+
+ org.junit.jupiter
+ junit-jupiter-engine
+
+
+ JCoRe Annotation Removal AE
+
+ JULIE Lab Jena, Germany
+ http://www.julielab.de
+
+ https://github.com/JULIELab/jcore-base/tree/master/jcore-annotation-removal-ae
+ Removes annotations from the CAS that belong to one of the types specified as a parameter value in the descriptor.
+
+
+ BSD 2-Clause
+ https://opensource.org/licenses/BSD-2-Clause
+
+
+
diff --git a/jcore-annotation-removal-ae/src/main/java/de/julielab/jcore/ae/annotationremoval/AnnotationRemovalAnnotator.java b/jcore-annotation-removal-ae/src/main/java/de/julielab/jcore/ae/annotationremoval/AnnotationRemovalAnnotator.java
new file mode 100644
index 000000000..019f06e02
--- /dev/null
+++ b/jcore-annotation-removal-ae/src/main/java/de/julielab/jcore/ae/annotationremoval/AnnotationRemovalAnnotator.java
@@ -0,0 +1,51 @@
+
+package de.julielab.jcore.ae.annotationremoval;
+
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.uima.cas.Type;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.descriptor.ResourceMetaData;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.List;
+
+@ResourceMetaData(name="JCoRe Annotation Removal AE", description = "Removes annotations from the CAS that belong to one of the types specified as a parameter value in the descriptor.", vendor = "JULIE Lab Jena, Germany")
+public class AnnotationRemovalAnnotator extends JCasAnnotator_ImplBase {
+ public static final String PARAM_ANNOTATION_TYPES = "AnnotationTypes";
+ private final static Logger log = LoggerFactory.getLogger(AnnotationRemovalAnnotator.class);
+
+ @ConfigurationParameter(name=PARAM_ANNOTATION_TYPES, description="List of qualified UIMA type names for which all annotations should be removed from each CAS.")
+ private String[] annotationTypesForRemoval;
+
+ /**
+ * This method is called a single time by the framework at component
+ * creation. Here, descriptor parameters are read and initial setup is done.
+ */
+ @Override
+ public void initialize(final UimaContext aContext) throws ResourceInitializationException {
+ annotationTypesForRemoval = (String[]) aContext.getConfigParameterValue(PARAM_ANNOTATION_TYPES);
+ if (annotationTypesForRemoval == null || annotationTypesForRemoval.length == 0)
+ throw new ResourceInitializationException(new IllegalArgumentException("The list of annotations for removal, given through parameter '" + PARAM_ANNOTATION_TYPES + "' is empty."));
+ }
+
+ /**
+ * This method is called for each document going through the component. This
+ * is where the actual work happens.
+ */
+ @Override
+ public void process(final JCas aJCas) {
+ List<Annotation> removalList = new ArrayList<>();
+ for (String annotationTypeName : annotationTypesForRemoval) {
+ final Type type = aJCas.getTypeSystem().getType(annotationTypeName);
+ aJCas.getAnnotationIndex(type).forEach(removalList::add);
+ }
+ removalList.forEach(Annotation::removeFromIndexes);
+ }
+
+}
diff --git a/jcore-annotation-removal-ae/src/main/resources/de/julielab/jcore/ae/annotationremoval/desc/jcore-annotation-removal-ae.xml b/jcore-annotation-removal-ae/src/main/resources/de/julielab/jcore/ae/annotationremoval/desc/jcore-annotation-removal-ae.xml
new file mode 100644
index 000000000..3cebc1704
--- /dev/null
+++ b/jcore-annotation-removal-ae/src/main/resources/de/julielab/jcore/ae/annotationremoval/desc/jcore-annotation-removal-ae.xml
@@ -0,0 +1,34 @@
+
+
+ org.apache.uima.java
+ true
+ de.julielab.jcore.ae.annotationremoval.AnnotationRemovalAnnotator
+
+ JCoRe Annotation Removal AE
+ Removes annotations from the CAS that belong to one of the types specified as a parameter value in the descriptor.
+ 2.6.0
+ JULIE Lab Jena, Germany
+
+
+ AnnotationTypes
+ List of qualified UIMA type names for which all annotations should be removed from each CAS.
+ String
+ true
+ true
+
+
+
+
+
+
+
+
+
+
+
+ true
+ true
+ false
+
+
+
\ No newline at end of file
diff --git a/jcore-annotation-removal-ae/src/test/java/de/julielab/jcore/ae/annotationremoval/AnnotationRemovalAnnotatorTest.java b/jcore-annotation-removal-ae/src/test/java/de/julielab/jcore/ae/annotationremoval/AnnotationRemovalAnnotatorTest.java
new file mode 100644
index 000000000..a401c969f
--- /dev/null
+++ b/jcore-annotation-removal-ae/src/test/java/de/julielab/jcore/ae/annotationremoval/AnnotationRemovalAnnotatorTest.java
@@ -0,0 +1,60 @@
+package de.julielab.jcore.ae.annotationremoval;
+
+import de.julielab.jcore.types.Gene;
+import de.julielab.jcore.types.Sentence;
+import de.julielab.jcore.types.Token;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+
+/**
+ * Unit tests for jcore-annotation-removal-ae.
+ */
+public class AnnotationRemovalAnnotatorTest {
+ private final static Logger log = LoggerFactory.getLogger(AnnotationRemovalAnnotatorTest.class);
+
+ @Test
+ public void testAnnotator() throws Exception {
+ final AnalysisEngine engine = AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.annotationremoval.desc.jcore-annotation-removal-ae",
+ AnnotationRemovalAnnotator.PARAM_ANNOTATION_TYPES, new String[]{"de.julielab.jcore.types.Token", "de.julielab.jcore.types.Gene"});
+ final JCas jCas = engine.newJCas();
+ jCas.setDocumentText("There is a gene in this sentence.");
+ addTokens(jCas);
+ new Sentence(jCas, 0, jCas.getDocumentText().length()).addToIndexes();
+ new Gene(jCas, 11, 15).addToIndexes();
+
+ // Check that the annotations we just created are actually there.
+ assertFalse(JCasUtil.select(jCas, Sentence.class).isEmpty());
+ assertFalse(JCasUtil.select(jCas, Token.class).isEmpty());
+ assertFalse(JCasUtil.select(jCas, Gene.class).isEmpty());
+
+ engine.process(jCas);
+
+ // And now check that the annotation that should be removed are really gone.
+ assertFalse(JCasUtil.select(jCas, Sentence.class).isEmpty());
+ assertTrue(JCasUtil.select(jCas, Token.class).isEmpty());
+ assertTrue(JCasUtil.select(jCas, Gene.class).isEmpty());
+ }
+
+ private void addTokens(JCas jCas) {
+ Matcher alphanumericalTokens = Pattern.compile("[A-Za-z0-9]+").matcher(jCas.getDocumentText());
+ while (alphanumericalTokens.find()) {
+ new Token(jCas, alphanumericalTokens.start(), alphanumericalTokens.end()).addToIndexes();
+ }
+ Matcher punctuation = Pattern.compile("\\p{Punct}").matcher(jCas.getDocumentText());
+ while (punctuation.find()) {
+ new Token(jCas, punctuation.start(), punctuation.end()).addToIndexes();
+ }
+ }
+}
diff --git a/jcore-banner-ae/component.meta b/jcore-banner-ae/component.meta
index 8785baa0c..4ba9b7c9e 100644
--- a/jcore-banner-ae/component.meta
+++ b/jcore-banner-ae/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-banner-ae",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe Banner"
}
diff --git a/jcore-banner-ae/pom.xml b/jcore-banner-ae/pom.xml
index 6b10e4221..a4f8e8d32 100644
--- a/jcore-banner-ae/pom.xml
+++ b/jcore-banner-ae/pom.xml
@@ -37,6 +37,10 @@
log4jlog4j
+
+ junit
+ junit
+
@@ -54,20 +58,24 @@
jcore-mallet-2.0.92.1.2
+
+ de.julielab
+ jcore-descriptor-creator
+ de.julielabjulielab-java-utilities
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-enginede.julielabjcore-base
- 2.5.1-SNAPSHOT
- ..
+ 2.6.0
+ ../pom.xml
@@ -75,4 +83,22 @@
https://opensource.org/licenses/BSD-2-Clause
+
+
+
+ maven-dependency-plugin
+
+
+ prepare-package
+
+ copy-dependencies
+
+
+ ${project.build.directory}/lib
+
+
+
+
+
+
diff --git a/jcore-banner-ae/src/main/java/banner/annotation/BEAT.java b/jcore-banner-ae/src/main/java/banner/annotation/BEAT.java
index 113f6139f..38281692f 100644
--- a/jcore-banner-ae/src/main/java/banner/annotation/BEAT.java
+++ b/jcore-banner-ae/src/main/java/banner/annotation/BEAT.java
@@ -18,8 +18,8 @@
import java.awt.*;
import java.awt.event.*;
import java.io.IOException;
-import java.util.*;
import java.util.List;
+import java.util.*;
public class BEAT extends JFrame implements ActionListener, CaretListener
{
diff --git a/jcore-banner-ae/src/main/java/banner/tagging/FeatureSet.java b/jcore-banner-ae/src/main/java/banner/tagging/FeatureSet.java
index df6548577..009154e3c 100644
--- a/jcore-banner-ae/src/main/java/banner/tagging/FeatureSet.java
+++ b/jcore-banner-ae/src/main/java/banner/tagging/FeatureSet.java
@@ -111,9 +111,9 @@ private SerialPipes createPipe(TagFormat format, Lemmatiser lemmatiser, dragon.n
//siddhartha added these;
pipes.add(simFindFilename == null ? new Noop() : new SimFind(simFindFilename));
-// pipes.add(new ChemicalSuffix("CHEM_SUFF="));
-// pipes.add(new MentionTypeHint("MENTION_TYPE="));
-// pipes.add(new ProteinSymbols("PROT_SYM="));
+ pipes.add(new ChemicalSuffix("CHEM_SUFF="));
+ pipes.add(new MentionTypeHint("MENTION_TYPE="));
+ pipes.add(new ProteinSymbols("PROT_SYM="));
pipes.add(new OffsetConjunctions(new int[][] { { -2 }, { -1 }, { 1 }, { 2 } }));
pipes.add(new TokenSequence2FeatureVectorSequence(true, true));
diff --git a/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java b/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java
index 1c28c28b0..e5cb62761 100644
--- a/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java
+++ b/jcore-banner-ae/src/main/java/banner/tagging/pipe/LemmaPOS.java
@@ -43,31 +43,37 @@ public LemmaPOS(Lemmatiser lemmatiser, Tagger posTagger) {
public void setLemmatiser(Lemmatiser lemmatiser) {
initResourcesMap();
getResources().lemmatiser = lemmatiser;
+ System.out.println("Setting lemmatiser to " + Thread.currentThread() + " in object " + this);
}
public void setPosTagger(Tagger posTagger) {
initResourcesMap();
getResources().posTagger = posTagger;
+ System.out.println("Setting PoS Tagger to " + Thread.currentThread() + " in object " + this);
}
- private void initResourcesMap() {
+ synchronized private void initResourcesMap() {
if (resourcesByThread == null)
resourcesByThread = new HashMap<>();
}
private Resources getResources() {
- return resourcesByThread.compute(Thread.currentThread(), (t, r) -> {
- Resources ret = r;
- if (ret == null)
- ret = new Resources();
- return ret;
- });
+ Thread currentThread = Thread.currentThread();
+ Resources resources = resourcesByThread.get(currentThread);
+ if (resources == null) {
+ resources = new Resources();
+ synchronized (resourcesByThread) {
+// System.out.println("Creating resources for thread " + currentThread);
+ resourcesByThread.put(currentThread, resources);
+ }
+ }
+ return resources;
}
@Override
public Instance pipe(Instance carrier) {
if (expectLemmatiser != (getResources().lemmatiser != null))
- throw new IllegalStateException("Model was trained with lemmatiser; not present in current config");
+ throw new IllegalStateException("Model was trained with lemmatiser; not present in current config; resource map: " + resourcesByThread + ", current thread: " + Thread.currentThread());
if (expectPOSTagger != (getResources().posTagger != null))
throw new IllegalStateException("Model was trained with POS tagger; not present in current config");
// TODO Add prefix ability
@@ -112,5 +118,14 @@ public Instance pipe(Instance carrier) {
private class Resources {
public Lemmatiser lemmatiser;
public Tagger posTagger;
+
+ @Override
+ public String toString() {
+ return "Resources{" +
+ "lemmatiser=" + lemmatiser +
+ ", posTagger=" + posTagger +
+ ", idHashCode= " + System.identityHashCode(this) +
+ '}';
+ }
}
}
diff --git a/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java b/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java
index 1f6077e17..43b29b9fd 100644
--- a/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java
+++ b/jcore-banner-ae/src/main/java/de/julielab/jcore/ae/banner/BANNERAnnotator.java
@@ -9,8 +9,10 @@
import banner.types.Mention;
import banner.types.Sentence;
import de.julielab.jcore.types.EntityMention;
+import de.julielab.jcore.types.pubmed.InternalReference;
import de.julielab.jcore.utility.JCoReAnnotationTools;
import de.julielab.jcore.utility.JCoReTools;
+import de.julielab.jcore.utility.index.JCoReOverlapAnnotationIndex;
import dragon.nlp.tool.Tagger;
import dragon.nlp.tool.lemmatiser.EngLemmatiser;
import org.apache.commons.configuration.ConfigurationException;
@@ -34,6 +36,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
+import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
@@ -47,6 +50,7 @@ public class BANNERAnnotator extends JCasAnnotator_ImplBase {
public static final String PARAM_CONFIG_FILE = "ConfigFile";
public static final String PARAM_TYPE_MAPPING = "TypeMapping";
+ public static final String PARAM_COMPONENT_ID = "ComponentId";
private final static Logger log = LoggerFactory.getLogger(BANNERAnnotator.class);
private Tokenizer tokenizer;
private DictionaryTagger dictionary;
@@ -61,6 +65,8 @@ public class BANNERAnnotator extends JCasAnnotator_ImplBase {
private String configFilePath;
@ConfigurationParameter(name = PARAM_TYPE_MAPPING, mandatory = false, description = "A list of mappings from entity labels to UIMA types in the form
-
- de.julielab.jcore.types.Sentence
+ de.julielab.jcore.types.Sentence
- de.julielab.jcore.types.Gene
+ de.julielab.jcore.types.Gene
-
+ truetruefalse
-
\ No newline at end of file
diff --git a/jcore-banner-ae/src/main/resources/desc/BANNERAE.xml b/jcore-banner-ae/src/main/resources/desc/BANNERAE.xml
index 28c2a1499..fb2981574 100644
--- a/jcore-banner-ae/src/main/resources/desc/BANNERAE.xml
+++ b/jcore-banner-ae/src/main/resources/desc/BANNERAE.xml
@@ -6,7 +6,7 @@
BANNERAE
- 2.5.1-SNAPSHOT
+ 2.6.0
diff --git a/jcore-banner-ae/src/main/resources/desc/bannerTS.xml b/jcore-banner-ae/src/main/resources/desc/bannerTS.xml
index d25adc102..a78fd02a0 100644
--- a/jcore-banner-ae/src/main/resources/desc/bannerTS.xml
+++ b/jcore-banner-ae/src/main/resources/desc/bannerTS.xml
@@ -2,7 +2,7 @@
bannerTSbasic typesystem started by sid
- 2.5.1-SNAPSHOT
+ 2.6.0
diff --git a/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java b/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java
index 12e9e2776..ed1ce4cee 100644
--- a/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java
+++ b/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/BANNERAnnotatorTest.java
@@ -12,20 +12,21 @@
import de.julielab.jcore.types.Gene;
import de.julielab.jcore.types.Sentence;
+import de.julielab.jcore.types.pubmed.InternalReference;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
-import static org.junit.Assert.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
public class BANNERAnnotatorTest {
private final static Logger log = LoggerFactory.getLogger(BANNERAnnotatorTest.class);
@@ -34,7 +35,8 @@ public void testProcess() throws Exception {
// just tag a single sentence with a test model that actually used that sentence as training data.
JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types",
"de.julielab.jcore.types.jcore-document-meta-types",
- "de.julielab.jcore.types.jcore-semantics-biology-types");
+ "de.julielab.jcore.types.jcore-semantics-biology-types",
+ "de.julielab.jcore.types.jcore-document-structure-pubmed-types");
// this is sentence P00055040A0000 from the test BC2GM train data
jcas.setDocumentText(
"Ten out-patients with pustulosis palmaris et plantaris were examined with direct immunofluorescence (IF) technique for deposition of fibrinogen, fibrin or its degradation products (FR-antigen) in affected and unaffected skin, together with heparin-precipitable fraction (HPF), cryoglobulin and total plasma fibrinogen in the blood.");
@@ -59,6 +61,40 @@ public void testProcess() throws Exception {
assertEquals("fibrinogen", geneList.get(4).getCoveredText());
}
+ @Test
+ public void testInternalReferenceExclusion() throws Exception {
+ // Internal references in papers, e.g. for bibliography, often appear as numbers. If such a number is
+ // directly appended to a gene name, it is mostly included into the gene name by BANNER.
+ // Thus, such reference spans are removed afterwards in the annotator and this test is checking that it works.
+ JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types",
+ "de.julielab.jcore.types.jcore-document-meta-types",
+ "de.julielab.jcore.types.jcore-semantics-biology-types",
+ "de.julielab.jcore.types.jcore-document-structure-pubmed-types");
+ // this is sentence P00055040A0000 from the test BC2GM train data EXCEPT the '19' following 'fibrinogen' which
+ // is our internal reference for this test.
+ jcas.setDocumentText(
+ "Ten out-patients with pustulosis palmaris et plantaris were examined with direct immunofluorescence (IF) technique for deposition of fibrinogen19, fibrin or its degradation products (FR-antigen) in affected and unaffected skin, together with heparin-precipitable fraction (HPF), cryoglobulin and total plasma fibrinogen in the blood.");
+ new Sentence(jcas, 0, jcas.getDocumentText().length()).addToIndexes();
+ new InternalReference(jcas, 143, 145).addToIndexes();
+ AnalysisEngine bannerAe = AnalysisEngineFactory.createEngine(BANNERAnnotator.class,
+ BANNERAnnotator.PARAM_CONFIG_FILE, "src/test/resources/banner_ae_test.xml", BANNERAnnotator.PARAM_TYPE_MAPPING, new String[] {"GENE=de.julielab.jcore.types.Gene"});
+ bannerAe.process(jcas);
+
+ // expected result from the GENE.eval.small file:
+ // P00055040A0000|116 125|fibrinogen
+ // P00055040A0000|127 132|fibrin
+ // P00055040A0000|158 167|FR-antigen
+ // P00055040A0000|243 254|cryoglobulin
+ // P00055040A0000|269 278|fibrinogen
+ // However, we ignore the offsets because the eval offsets ignore white spaces
+ List geneList = new ArrayList(JCasUtil.select(jcas, Gene.class));
+ assertEquals("fibrinogen", geneList.get(0).getCoveredText());
+ assertEquals("fibrin", geneList.get(1).getCoveredText());
+ assertEquals("FR-antigen", geneList.get(2).getCoveredText());
+ assertEquals("cryoglobulin", geneList.get(3).getCoveredText());
+ assertEquals("fibrinogen", geneList.get(4).getCoveredText());
+ }
+
@Test
public void testMultithreading() throws Exception {
List ts = new ArrayList<>();
@@ -77,7 +113,8 @@ private void tagalot() throws UIMAException {
// just tag a single sentence with a test model that actually used that sentence as training data.
JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types",
"de.julielab.jcore.types.jcore-document-meta-types",
- "de.julielab.jcore.types.jcore-semantics-biology-types");
+ "de.julielab.jcore.types.jcore-semantics-biology-types",
+ "de.julielab.jcore.types.jcore-document-structure-pubmed-types");
// this is sentence P00055040A0000 from the test BC2GM train data
jcas.setDocumentText(
"Maintenance of skeletal muscle mass is regulated by the balance between anabolic and catabolic processes. Mammalian target of rapamycin (mTOR) is an evolutionarily conserved serine/threonine kinase, and is known to play vital roles in protein synthesis. Recent findings have continued to refine our understanding of the function of mTOR in maintaining skeletal muscle mass. mTOR controls the anabolic and catabolic signaling of skeletal muscle mass, resulting in the modulation of muscle hypertrophy and muscle wastage. This review will highlight the fundamental role of mTOR in skeletal muscle growth by summarizing the phenotype of skeletal-specific mTOR deficiency. In addition, the evidence that mTOR is a dual regulator of anabolism and catabolism in skeletal muscle mass will be discussed. A full understanding of mTOR signaling in the maintenance of skeletal muscle mass could help to develop mTOR-targeted therapeutics to prevent muscle wasting.");
diff --git a/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/ModelTrainTest.java b/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/ModelTrainTest.java
index 7604ae62f..9d5d4958c 100644
--- a/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/ModelTrainTest.java
+++ b/jcore-banner-ae/src/test/java/de/julielab/jcore/ae/banner/ModelTrainTest.java
@@ -12,11 +12,11 @@
import banner.eval.BANNER;
import org.apache.commons.configuration.XMLConfiguration;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import java.io.File;
-import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertTrue;
public class ModelTrainTest {
@Test
diff --git a/jcore-banner-ae/src/test/java/de/julielab/jcore/banner/dataset/JCoReEntityDatasetTest.java b/jcore-banner-ae/src/test/java/de/julielab/jcore/banner/dataset/JCoReEntityDatasetTest.java
index 35925ad84..843106130 100644
--- a/jcore-banner-ae/src/test/java/de/julielab/jcore/banner/dataset/JCoReEntityDatasetTest.java
+++ b/jcore-banner-ae/src/test/java/de/julielab/jcore/banner/dataset/JCoReEntityDatasetTest.java
@@ -11,14 +11,14 @@
package de.julielab.jcore.banner.dataset;
import banner.tokenization.SimpleTokenizer;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import java.io.File;
import java.util.Set;
import java.util.stream.Collectors;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
public class JCoReEntityDatasetTest {
@Test
diff --git a/jcore-bc2gm-reader/component.meta b/jcore-bc2gm-reader/component.meta
index 748123c36..49b7f8c1a 100644
--- a/jcore-bc2gm-reader/component.meta
+++ b/jcore-bc2gm-reader/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-bc2gm-reader",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe BioCreative II Gene Mention Reader"
}
diff --git a/jcore-bc2gm-reader/pom.xml b/jcore-bc2gm-reader/pom.xml
index 1ec0602a9..9cc023682 100644
--- a/jcore-bc2gm-reader/pom.xml
+++ b/jcore-bc2gm-reader/pom.xml
@@ -9,7 +9,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0
diff --git a/jcore-bc2gm-reader/src/main/resources/de/julielab/jcore/reader/bc2gm/desc/jcore-bc2gm-reader.xml b/jcore-bc2gm-reader/src/main/resources/de/julielab/jcore/reader/bc2gm/desc/jcore-bc2gm-reader.xml
index 04e62abd2..7b932fbf9 100644
--- a/jcore-bc2gm-reader/src/main/resources/de/julielab/jcore/reader/bc2gm/desc/jcore-bc2gm-reader.xml
+++ b/jcore-bc2gm-reader/src/main/resources/de/julielab/jcore/reader/bc2gm/desc/jcore-bc2gm-reader.xml
@@ -5,7 +5,7 @@
JCoRe BioCreative II Gene Mention readerThis component reads gene annotated sentences in the BioCreative II Gene Mention challenge format. Each CAS will contain one annotated sentence.
- 2.5.1-SNAPSHOT
+ 2.6.0SentencesFile
diff --git a/jcore-bc2gmformat-writer/component.meta b/jcore-bc2gmformat-writer/component.meta
index 384a54b21..ee98994c8 100644
--- a/jcore-bc2gmformat-writer/component.meta
+++ b/jcore-bc2gmformat-writer/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-bc2gmformat-writer",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe BioCreative II Gene Mention Format Writer"
}
diff --git a/jcore-bc2gmformat-writer/pom.xml b/jcore-bc2gmformat-writer/pom.xml
index c68e9f170..2f531a820 100644
--- a/jcore-bc2gmformat-writer/pom.xml
+++ b/jcore-bc2gmformat-writer/pom.xml
@@ -9,7 +9,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0
@@ -28,14 +28,18 @@
${jcore-types-version}
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-enginede.julielabjcore-utilities${jcore-utilities-version}
+
+ de.julielab
+ julielab-java-utilities
+ de.julielabjcore-descriptor-creator
diff --git a/jcore-bc2gmformat-writer/src/main/resources/de/julielab/jcore/consumer/bc2gmformat/desc/jcore-bc2gmformat-writer.xml b/jcore-bc2gmformat-writer/src/main/resources/de/julielab/jcore/consumer/bc2gmformat/desc/jcore-bc2gmformat-writer.xml
index 2e122f8b6..0504d2b1b 100644
--- a/jcore-bc2gmformat-writer/src/main/resources/de/julielab/jcore/consumer/bc2gmformat/desc/jcore-bc2gmformat-writer.xml
+++ b/jcore-bc2gmformat-writer/src/main/resources/de/julielab/jcore/consumer/bc2gmformat/desc/jcore-bc2gmformat-writer.xml
@@ -6,7 +6,7 @@
JCoRe BioCreative II Gene Mention Format writerThis component writes gene annotations in the CAS to the format employed by the BioCreative II Gene Mention challenge.
- 2.5.1-SNAPSHOT
+ 2.6.0OutputDirectory
diff --git a/jcore-bc2gmformat-writer/src/test/java/de/julielab/jcore/consumer/bc2gmformat/BC2GMFormatWriterTest.java b/jcore-bc2gmformat-writer/src/test/java/de/julielab/jcore/consumer/bc2gmformat/BC2GMFormatWriterTest.java
index 41faec637..3752d67b5 100644
--- a/jcore-bc2gmformat-writer/src/test/java/de/julielab/jcore/consumer/bc2gmformat/BC2GMFormatWriterTest.java
+++ b/jcore-bc2gmformat-writer/src/test/java/de/julielab/jcore/consumer/bc2gmformat/BC2GMFormatWriterTest.java
@@ -2,13 +2,13 @@
package de.julielab.jcore.consumer.bc2gmformat;
import org.apache.uima.fit.factory.UimaContextFactory;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import java.lang.reflect.Method;
import java.util.TreeMap;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
/**
diff --git a/jcore-biolemmatizer-ae/component.meta b/jcore-biolemmatizer-ae/component.meta
index 66fd947c5..4e79fc201 100644
--- a/jcore-biolemmatizer-ae/component.meta
+++ b/jcore-biolemmatizer-ae/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-biolemmatizer-ae",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe BioLemmatizer"
}
diff --git a/jcore-biolemmatizer-ae/pom.xml b/jcore-biolemmatizer-ae/pom.xml
index bf56276d0..62e6a6234 100644
--- a/jcore-biolemmatizer-ae/pom.xml
+++ b/jcore-biolemmatizer-ae/pom.xml
@@ -8,7 +8,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0
@@ -31,7 +31,11 @@
biolemmatizer-core1.2
- junitjunit
+
+ org.junit.jupiter
+ junit-jupiter-engine
+
+ JCoRe BioLemmatizerJULIE Lab Jena, Germany
@@ -43,8 +47,14 @@
BioNLP Repository
- http://svn.code.sf.net/p/bionlp/code/repo
+ https://svn.code.sf.net/p/bionlp/code/repo
+
+
+ maven.aksw.internal
+ AKSW Internal Release Repository
+ https://maven.aksw.org/repository/internal
+
diff --git a/jcore-biolemmatizer-ae/src/main/resources/de/julielab/jcore/ae/biolemmatizer/desc/jcore-biolemmatizer-ae.xml b/jcore-biolemmatizer-ae/src/main/resources/de/julielab/jcore/ae/biolemmatizer/desc/jcore-biolemmatizer-ae.xml
index 27b446003..9acb95f57 100644
--- a/jcore-biolemmatizer-ae/src/main/resources/de/julielab/jcore/ae/biolemmatizer/desc/jcore-biolemmatizer-ae.xml
+++ b/jcore-biolemmatizer-ae/src/main/resources/de/julielab/jcore/ae/biolemmatizer/desc/jcore-biolemmatizer-ae.xml
@@ -6,14 +6,25 @@
BioLemmatizer
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
-
+
+
+
+ de.julielab.jcore.types.Token
+ de.julielab.jcore.types.PennBioIEPOSTag
+
+
+ de.julielab.jcore.types.Lemma
+
+
+
+ truetrue
diff --git a/jcore-biolemmatizer-ae/src/test/java/de/julielab/jcore/ae/biolemmatizer/BioLemmatizerTest.java b/jcore-biolemmatizer-ae/src/test/java/de/julielab/jcore/ae/biolemmatizer/BioLemmatizerTest.java
index ada58be07..241aadaee 100644
--- a/jcore-biolemmatizer-ae/src/test/java/de/julielab/jcore/ae/biolemmatizer/BioLemmatizerTest.java
+++ b/jcore-biolemmatizer-ae/src/test/java/de/julielab/jcore/ae/biolemmatizer/BioLemmatizerTest.java
@@ -10,10 +10,10 @@
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.tcas.Annotation;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
/**
* Unit tests for jcore-de.julielab.jcore.ae.biolemmatizer-ae.
* @author
diff --git a/jcore-bionlpformat-consumer/component.meta b/jcore-bionlpformat-consumer/component.meta
index e4c0dedc0..e13edd578 100644
--- a/jcore-bionlpformat-consumer/component.meta
+++ b/jcore-bionlpformat-consumer/component.meta
@@ -22,7 +22,7 @@
"maven-artifact": {
"artifactId": "jcore-bionlpformat-consumer",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe BioNLP Format Consumer"
}
diff --git a/jcore-bionlpformat-consumer/pom.xml b/jcore-bionlpformat-consumer/pom.xml
index bf58e21a4..676993028 100644
--- a/jcore-bionlpformat-consumer/pom.xml
+++ b/jcore-bionlpformat-consumer/pom.xml
@@ -6,7 +6,7 @@
jcore-basede.julielab
- 2.5.1-SNAPSHOT
+ 2.6.0
@@ -29,8 +29,8 @@
test
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-engineJCoRe BioNLP Format Consumer
diff --git a/jcore-bionlpformat-consumer/src/main/java/de/julielab/jcore/consumer/bionlpformat/main/SegmentConsumer.java b/jcore-bionlpformat-consumer/src/main/java/de/julielab/jcore/consumer/bionlpformat/main/SegmentConsumer.java
index 287a79921..f09c3a48a 100644
--- a/jcore-bionlpformat-consumer/src/main/java/de/julielab/jcore/consumer/bionlpformat/main/SegmentConsumer.java
+++ b/jcore-bionlpformat-consumer/src/main/java/de/julielab/jcore/consumer/bionlpformat/main/SegmentConsumer.java
@@ -22,9 +22,6 @@
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.springframework.context.annotation.Configuration;
import java.io.*;
import java.util.Iterator;
diff --git a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-biomedical-sharedtask.xml b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-biomedical-sharedtask.xml
index 45463be92..5b908ba63 100644
--- a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-biomedical-sharedtask.xml
+++ b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-biomedical-sharedtask.xml
@@ -5,7 +5,7 @@
JCoRe BioNLP Event Consumer
- 2.5.1-SNAPSHOT
+ 2.6.0
diff --git a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-medical.xml b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-medical.xml
index 5ebfec59f..0cb5ea0e1 100644
--- a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-medical.xml
+++ b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-medical.xml
@@ -5,7 +5,7 @@
JCoRe BioNLP Format Event Consumer (Medical)
- 2.5.1-SNAPSHOT
+ 2.6.0
diff --git a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-segment.xml b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-segment.xml
index dc654b37b..57287e038 100644
--- a/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-segment.xml
+++ b/jcore-bionlpformat-consumer/src/main/resources/de/julielab/jcore/consumer/bionlpformat/desc/jcore-bionlpformat-consumer-segment.xml
@@ -7,7 +7,7 @@
JCoRe BioNLP Format Segment Consumer
- 2.5.1-SNAPSHOT
+ 2.6.0
diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/main/EventConsumerTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/main/EventConsumerTest.java
index 6668a969d..8a6659cfb 100644
--- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/main/EventConsumerTest.java
+++ b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/main/EventConsumerTest.java
@@ -13,16 +13,16 @@
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
public class EventConsumerTest {
private static final String EVENT_E8 = "E8 Phosphorylation:T14 Theme:T17 Site:T13";
@@ -44,7 +44,7 @@ public class EventConsumerTest {
private AnalysisEngine consumer;
private FilenameFilter filter;
- @Before
+ @BeforeEach
public void setUp() throws Exception {
cas = JCasFactory.createJCas("src/test/resources/types/jcore-all-types");
consumer = AnalysisEngineFactory.createEngine(BioEventConsumer.class,
@@ -113,7 +113,7 @@ public boolean accept(File file, String name) {
};
}
- @After
+ @AfterEach
public void tearDown() {
File dataDirectory = new File(TARGET_DIRECTORY);
diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/DocumentWriterTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/DocumentWriterTest.java
index bdd89cc38..12e2baa53 100644
--- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/DocumentWriterTest.java
+++ b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/DocumentWriterTest.java
@@ -10,8 +10,8 @@
import de.julielab.jcore.types.Title;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
-import org.junit.Before;
-import org.junit.Test;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.io.Writer;
@@ -28,7 +28,7 @@ public class DocumentWriterTest {
private DocumentWriter documentWriter;
private Writer writer;
- @Before
+ @BeforeEach
public void setUp() throws Exception{
cas = JCasFactory.createJCas("src/test/resources/types/jcore-all-types");
cas.setDocumentText(DOCUMENT_TITLE + "\n" + DOCUMENT_ABSTRACT);
diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EntityWriterTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EntityWriterTest.java
index d98cb4722..29cd9e064 100644
--- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EntityWriterTest.java
+++ b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EntityWriterTest.java
@@ -9,15 +9,15 @@
import de.julielab.jcore.types.EntityMention;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
-import org.junit.Before;
-import org.junit.Test;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.io.Writer;
import static org.easymock.classextension.EasyMock.*;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
public class EntityWriterTest {
private static final String ENTITY_T13 = "T13 Entity 322 330 tyrosine\n";
@@ -30,7 +30,7 @@ public class EntityWriterTest {
private Writer writer;
private EntityMention entityT13;
- @Before
+ @BeforeEach
public void setUp() throws Exception{
cas = JCasFactory.createJCas("src/test/resources/types/jcore-semantics-biology-types");
diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventTriggerWriterTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventTriggerWriterTest.java
index 317dd0cef..2a04a48f1 100644
--- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventTriggerWriterTest.java
+++ b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventTriggerWriterTest.java
@@ -9,15 +9,15 @@
import de.julielab.jcore.types.EventTrigger;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
-import org.junit.Before;
-import org.junit.Test;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.io.Writer;
import static org.easymock.classextension.EasyMock.*;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
public class EventTriggerWriterTest {
private static final String TRIGGER_T1 = "T1 Negative_regulation 12 19 inhibit\n";
@@ -28,7 +28,7 @@ public class EventTriggerWriterTest {
private Writer writer;
private EventTrigger triggerT1;
- @Before
+ @BeforeEach
public void setUp() throws Exception{
cas = JCasFactory.createJCas("src/test/resources/types/jcore-semantics-biology-types");
diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventWriterTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventWriterTest.java
index 5d8b717cf..58052dc0b 100644
--- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventWriterTest.java
+++ b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/EventWriterTest.java
@@ -10,8 +10,8 @@
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
-import org.junit.Before;
-import org.junit.Test;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
import java.io.FileWriter;
import java.io.IOException;
@@ -36,7 +36,7 @@ public class EventWriterTest {
private Gene proteinT17;
private EntityMention entityT13;
- @Before
+ @BeforeEach
public void setUp() throws Exception{
cas = JCasFactory.createJCas("src/test/resources/types/jcore-semantics-biology-types");
diff --git a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/ProteinWriterTest.java b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/ProteinWriterTest.java
index 3871f07ff..2cdc5be50 100644
--- a/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/ProteinWriterTest.java
+++ b/jcore-bionlpformat-consumer/src/test/java/de/julielab/jcore/consumer/bionlpformat/utils/ProteinWriterTest.java
@@ -9,16 +9,16 @@
import de.julielab.jcore.types.Gene;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
-import org.junit.Before;
-import org.junit.Test;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import static org.easymock.classextension.EasyMock.*;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
public class ProteinWriterTest {
@@ -35,7 +35,7 @@ public class ProteinWriterTest {
private static final String DOCUMENT_TEXT = "Interferons inhibit activation of STAT6 by interleukin 4 in human monocytes by inducing SOCS-1 gene expression.\n" +
"Interferons (IFNs) inhibit induction by IL-4 of multiple genes in human monocytes. However, the mechanism by which IFNs mediate this inhibition has not been defined. IL-4 activates gene expression by inducing tyrosine phosphorylation, homodimerization, and nuclear translocation of the latent transcription factor, STAT6 (signal transducer and activator of transcription-6). STAT6-responsive elements are characteristically present in the promoters of IL-4-inducible genes. Because STAT6 activation is essential for IL-4-induced gene expression, we examined the ability of type I and type II IFNs to regulate activation of STAT6 by IL-4 in primary human monocytes. Pretreatment of monocytes with IFN-beta or IFN-gamma, but not IL-1, IL-2, macrophage colony-stimulating factor, granulocyte/macrophage colony-stimulating factor, IL-6, or transforming growth factor beta suppressed activation of STAT6 by IL-4. This inhibition was associated with decreased tyrosine phosphorylation and nuclear translocation of STAT6 and was not evident unless the cells were preincubated with IFN for at least 1 hr before IL-4 stimulation. Furthermore, inhibition by IFN could be blocked by cotreatment with actinomycin D and correlated temporally with induction of the JAK/STAT inhibitory gene, SOCS-1. Forced expression of SOCS-1 in a macrophage cell line, RAW264, markedly suppressed trans-activation of an IL-4-inducible reporter as well as IL-6- and IFN-gamma-induced reporter gene activity. These findings demonstrate that IFNs inhibit IL-4-induced activation of STAT6 and STAT6-dependent gene expression, at least in part, by inducing expression of SOCS-1.";
- @Before
+ @BeforeEach
public void setUp() throws Exception{
cas = JCasFactory.createJCas("src/test/resources/types/jcore-semantics-biology-types");
diff --git a/jcore-bionlpformat-consumer/src/test/resources/types/jcore-all-types.xml b/jcore-bionlpformat-consumer/src/test/resources/types/jcore-all-types.xml
index 670239d8d..76d19c9c8 100644
--- a/jcore-bionlpformat-consumer/src/test/resources/types/jcore-all-types.xml
+++ b/jcore-bionlpformat-consumer/src/test/resources/types/jcore-all-types.xml
@@ -2,7 +2,7 @@
JCoRe All TypesThis is just a convenience file, assembling all JCoRe types
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-bionlpformat-consumer/src/test/resources/types/jcore-semantics-biology-types.xml b/jcore-bionlpformat-consumer/src/test/resources/types/jcore-semantics-biology-types.xml
index 0f6fca3ac..a525162fe 100644
--- a/jcore-bionlpformat-consumer/src/test/resources/types/jcore-semantics-biology-types.xml
+++ b/jcore-bionlpformat-consumer/src/test/resources/types/jcore-semantics-biology-types.xml
@@ -2,7 +2,7 @@
JCoRe Semantics Biology TypesThe type system contains types of the biomedical domain.
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-bionlpformat-reader/component.meta b/jcore-bionlpformat-reader/component.meta
index 6f10e9e95..60e877ec5 100644
--- a/jcore-bionlpformat-reader/component.meta
+++ b/jcore-bionlpformat-reader/component.meta
@@ -22,7 +22,7 @@
"maven-artifact": {
"artifactId": "jcore-bionlpformat-reader",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe BioNLP Format Reader"
}
diff --git a/jcore-bionlpformat-reader/pom.xml b/jcore-bionlpformat-reader/pom.xml
index 862c09d97..1c966f9e8 100644
--- a/jcore-bionlpformat-reader/pom.xml
+++ b/jcore-bionlpformat-reader/pom.xml
@@ -6,7 +6,7 @@
jcore-basede.julielab
- 2.5.1-SNAPSHOT
+ 2.6.0
@@ -41,8 +41,8 @@
${jcore-utilities-version}
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-engine
diff --git a/jcore-bionlpformat-reader/src/main/java/de/julielab/jcore/reader/bionlpformat/utils/AnnotationFileMapper.java b/jcore-bionlpformat-reader/src/main/java/de/julielab/jcore/reader/bionlpformat/utils/AnnotationFileMapper.java
index 70efe8571..5a265d736 100644
--- a/jcore-bionlpformat-reader/src/main/java/de/julielab/jcore/reader/bionlpformat/utils/AnnotationFileMapper.java
+++ b/jcore-bionlpformat-reader/src/main/java/de/julielab/jcore/reader/bionlpformat/utils/AnnotationFileMapper.java
@@ -17,8 +17,8 @@
import java.io.BufferedReader;
import java.io.IOException;
- import java.util.*;
import java.util.List;
+ import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
diff --git a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-biomedical-sharedtask.xml b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-biomedical-sharedtask.xml
index ccd6c46f6..66a5945ca 100644
--- a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-biomedical-sharedtask.xml
+++ b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-biomedical-sharedtask.xml
@@ -5,7 +5,7 @@
JCoRe BioNLP Event Reader
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-medical.xml b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-medical.xml
index 74cdb9e62..602240c4e 100644
--- a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-medical.xml
+++ b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-medical.xml
@@ -5,7 +5,7 @@
BioNLP Format Reader Medical
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-segment.xml b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-segment.xml
index aea0bc469..7ed45b45a 100644
--- a/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-segment.xml
+++ b/jcore-bionlpformat-reader/src/main/resources/de/julielab/jcore/reader/bionlpformat/desc/jcore-bionlpformat-reader-segment.xml
@@ -5,7 +5,7 @@
BioNLP Format Reader Segment
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/CoreferenceReadingTest.java b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/CoreferenceReadingTest.java
index ce2926f00..1b2a68ac9 100644
--- a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/CoreferenceReadingTest.java
+++ b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/CoreferenceReadingTest.java
@@ -16,8 +16,8 @@
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
-import org.junit.Ignore;
-import org.junit.Test;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import java.io.FileOutputStream;
@@ -25,7 +25,7 @@
import java.io.OutputStream;
// Ignore because the data path does generally not exist; a fix should only contain some test data, not the whole dataset
-@Ignore
+@Disabled
public class CoreferenceReadingTest {
@Test
public void testCoreferenceReading() throws UIMAException, IOException,
diff --git a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/EventReaderTest.java b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/EventReaderTest.java
index 9c7aea226..68c64fc94 100644
--- a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/EventReaderTest.java
+++ b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/main/EventReaderTest.java
@@ -17,23 +17,23 @@
import org.apache.uima.jcas.JCas;
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.util.XMLInputSource;
-import org.junit.Before;
-import org.junit.Ignore;
-import org.junit.Test;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
import java.util.Set;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
// This test's EventReaderTest.xml descriptor points to local directories of Ekaterina Buyko and as such, the test doesn't work this way. However it might, if the data is made available as proper test data.
-@Ignore
+@Disabled
public class EventReaderTest {
private static final String DESCRIPTOR_FILE = "src/test/resources/de/julielab/jcore/reader/bionlpformat/desc/EventReaderTest.xml";
private CollectionReader collectionReader;
- @Before
+ @BeforeEach
public void setUp() throws Exception {
CollectionReaderDescription readerDescription = (CollectionReaderDescription) UIMAFramework
.getXMLParser().parseCollectionReaderDescription(
diff --git a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AbstractFileMapperTest.java b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AbstractFileMapperTest.java
index 25685ec01..c95a9d148 100644
--- a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AbstractFileMapperTest.java
+++ b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AbstractFileMapperTest.java
@@ -18,15 +18,15 @@
import org.apache.uima.jcas.JCas;
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.util.XMLInputSource;
-import org.junit.Before;
-import org.junit.Test;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
import java.io.BufferedReader;
import static org.easymock.EasyMock.expect;
import static org.easymock.classextension.EasyMock.*;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
public class AbstractFileMapperTest {
@@ -34,7 +34,7 @@ public class AbstractFileMapperTest {
private JCas cas;
private TextFileMapper abstractFileMapper;
- @Before
+ @BeforeEach
public void setUp() throws Exception {
CollectionReaderDescription readerDescription = UIMAFramework.getXMLParser()
.parseCollectionReaderDescription(new XMLInputSource(DESCRIPTOR_FILE));
diff --git a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AnnotationFileMapperTest.java b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AnnotationFileMapperTest.java
index 46bf09ee6..85b582ed0 100644
--- a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AnnotationFileMapperTest.java
+++ b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/AnnotationFileMapperTest.java
@@ -28,8 +28,8 @@
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.util.XMLInputSource;
-import org.junit.Before;
-import org.junit.Test;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
import java.io.BufferedReader;
import java.util.HashMap;
@@ -37,8 +37,8 @@
import static org.easymock.EasyMock.expect;
import static org.easymock.classextension.EasyMock.*;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
@@ -68,7 +68,7 @@ public class AnnotationFileMapperTest {
private Gene t3;
private Map mappedProteins;
- @Before
+ @BeforeEach
public void setUp() throws Exception {
CollectionReaderDescription readerDescription = (CollectionReaderDescription) UIMAFramework.getXMLParser().parseCollectionReaderDescription(new XMLInputSource(DESCRIPTOR_FILE));
CollectionReader collectionReader = UIMAFramework.produceCollectionReader(readerDescription);
diff --git a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/OntoFormatReaderTest.java b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/OntoFormatReaderTest.java
index 2abfcc03d..24a3d7805 100644
--- a/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/OntoFormatReaderTest.java
+++ b/jcore-bionlpformat-reader/src/test/java/de/julielab/jcore/reader/bionlp09event/utils/OntoFormatReaderTest.java
@@ -11,7 +11,7 @@
package de.julielab.jcore.reader.bionlp09event.utils;
import de.julielab.jcore.reader.bionlpformat.utils.OntoFormatReader;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import java.io.File;
import java.io.IOException;
diff --git a/jcore-bionlpformat-reader/src/test/resources/de/julielab/jcore/reader/bionlpformat/desc/EventReaderTest.xml b/jcore-bionlpformat-reader/src/test/resources/de/julielab/jcore/reader/bionlpformat/desc/EventReaderTest.xml
index 38ed5aed3..33b41c2c8 100644
--- a/jcore-bionlpformat-reader/src/test/resources/de/julielab/jcore/reader/bionlpformat/desc/EventReaderTest.xml
+++ b/jcore-bionlpformat-reader/src/test/resources/de/julielab/jcore/reader/bionlpformat/desc/EventReaderTest.xml
@@ -5,7 +5,7 @@
EventReader
- 2.5.1-SNAPSHOT
+ 2.6.0
diff --git a/jcore-biosem-ae/component.meta b/jcore-biosem-ae/component.meta
index dd5fcf39d..08c8a1bba 100644
--- a/jcore-biosem-ae/component.meta
+++ b/jcore-biosem-ae/component.meta
@@ -9,7 +9,7 @@
"maven-artifact": {
"artifactId": "jcore-biosem-ae",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe BioSem Event Annotator"
}
diff --git a/jcore-biosem-ae/pom.xml b/jcore-biosem-ae/pom.xml
index ece3b845a..7a667db9b 100644
--- a/jcore-biosem-ae/pom.xml
+++ b/jcore-biosem-ae/pom.xml
@@ -5,7 +5,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0jcore-biosem-aeJCoRe BioSem Event Annotator
@@ -32,13 +32,19 @@
de.julielabjcore-bionlpformat-reader
- 2.5.1-SNAPSHOT
+ 2.6.0testde.julielabbiosem-event-extractor1.1.7
+
+
+ commons-cli
+ commons-cli
+
+ de.julielab
@@ -48,12 +54,12 @@
de.julielabjcore-bionlpformat-consumer
- 2.5.1-SNAPSHOT
+ 2.6.0test
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-engine
diff --git a/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java b/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java
index 9a9f16a35..8a42dd9dc 100644
--- a/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java
+++ b/jcore-biosem-ae/src/main/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotator.java
@@ -1,11 +1,11 @@
-/**
- *
+/**
+ *
* Copyright (c) 2017, JULIE Lab.
- * All rights reserved. This program and the accompanying materials
+ * All rights reserved. This program and the accompanying materials
* are made available under the terms of the BSD-2-Clause License
*
- * Author:
- *
+ * Author:
+ *
* Description:
**/
package de.julielab.jcore.ae.biosem;
@@ -17,7 +17,10 @@
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException;
import org.apache.uima.cas.FSIterator;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
+import org.apache.uima.fit.descriptor.ResourceMetaData;
+import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.tcas.Annotation;
@@ -31,22 +34,27 @@
import utils.BioSemException;
import utils.DBUtils;
-import java.util.*;
import java.util.List;
+import java.util.*;
import java.util.Map.Entry;
+@ResourceMetaData(name="JCoRe BioSem Event Annotator", description = "Adds annotations for event triggers and events according to the BioNLP Shared Task event definition.")
+@TypeCapability(inputs = {"de.julielab.jcore.types.Gene"}, outputs = {"de.julielab.jcore.types.EventTrigger", "de.julielab.jcore.types.EventMention"})
public class BioSemEventAnnotator extends JCasAnnotator_ImplBase {
private final static Logger log = LoggerFactory.getLogger(BioSemEventAnnotator.class);
+ public static final String PARAM_COMPONENT_ID = "ComponentId";
public final static String RESOURCE_TRAINED_DB = "TrainedDB";
private DataLoader loader;
private DBUtils trainedDb;
- @ExternalResource(key = RESOURCE_TRAINED_DB, mandatory = true)
+ @ExternalResource(key = RESOURCE_TRAINED_DB)
private DBUtilsProvider dbUtilsProvider;
+ @ConfigurationParameter(name=PARAM_COMPONENT_ID, mandatory = false, defaultValue = "BioSemEventAnnotator", description = "Optional. If set, the 'componentId' feature of the created annotations will be set to the value of this parameter.")
+ private String componentId;
private EventExtraction xtr;
@@ -64,6 +72,7 @@ public class BioSemEventAnnotator extends JCasAnnotator_ImplBase {
public void initialize(UimaContext aContext) throws ResourceInitializationException {
super.initialize(aContext);
try {
+ componentId = (String) aContext.getConfigParameterValue(PARAM_COMPONENT_ID);
dbUtilsProvider = (DBUtilsProvider) aContext.getResourceObject(RESOURCE_TRAINED_DB);
trainedDb = dbUtilsProvider.getTrainedDatabase();
} catch (ResourceAccessException e) {
@@ -198,6 +207,7 @@ private EventMention addEventToIndexes(PData event, Map proteinMap
PData eventArg1 = event.getPdata1();
PData eventArg2 = event.getPdata2();
uimaEvent = new EventMention(aJCas, begin, end);
+ uimaEvent.setComponentId(componentId);
uimaEvent.setId(event.PID);
uimaEvent.setSpecificType(uimaTrigger.getSpecificType());
uimaEvent.setTrigger(uimaTrigger);
@@ -227,7 +237,7 @@ private EventMention addEventToIndexes(PData event, Map proteinMap
}
/**
- *
+ *
* @param uimaEvent
* The UIMA event annotation to add a new argument to
* @param bioSemArg
@@ -279,6 +289,7 @@ private void addUimaEventArgument(EventMention uimaEvent, Object bioSemArg, int
// if we don't want to use the writer).
protein.setSpecificType("protein");
uimaArg = new ArgumentMention(aJCas, protein.getBegin(), protein.getEnd());
+ uimaArg.setComponentId(componentId);
uimaArg.setRef(protein);
uimaArg.setRole(determineArgumentRole(uimaEvent, uimaArg, argPos));
} else if (bioSemArg instanceof PData) {
@@ -293,9 +304,10 @@ private void addUimaEventArgument(EventMention uimaEvent, Object bioSemArg, int
}
if (null == uimaEventArg) {
throw new IllegalStateException("Creating UIMA EventMention annotation for BioSem event \""
- + eventArg.toString() + "\" failed, the UIMA EventMention is null.");
+ + eventArg + "\" failed, the UIMA EventMention is null.");
}
uimaArg = new ArgumentMention(aJCas, uimaEventArg.getBegin(), uimaEventArg.getEnd());
+ uimaArg.setComponentId(componentId);
uimaArg.setRef(uimaEventArg);
uimaArg.setRole(determineArgumentRole(uimaEvent, uimaArg, argPos));
} else {
@@ -330,7 +342,7 @@ private void addUimaEventArgument(EventMention uimaEvent, Object bioSemArg, int
}
/**
- *
+ *
* @param uimaEvent
* @param uimaArg
* @param argPos
@@ -359,6 +371,7 @@ private EventTrigger addTriggerToIndexes(Word trg, JCas aJCas) {
int end = trg.locs[1];
String type = trg.type;
EventTrigger uimaTrigger = new EventTrigger(aJCas, begin, end);
+ uimaTrigger.setComponentId(componentId);
uimaTrigger.setId(id);
uimaTrigger.setSpecificType(type);
return uimaTrigger;
@@ -370,7 +383,7 @@ private EventTrigger addTriggerToIndexes(Word trg, JCas aJCas) {
* ID<tab>Entity-Type[Protein]<tab>start<tab>end<tab>Mention name
*
* Example: T3 Protein 166 174 TGF-beta
- *
+ *
* @return
*/
private List getProteinLines(Map proteins, String docId) throws AnnotatorProcessException {
@@ -392,7 +405,7 @@ private List getProteinLines(Map proteins, String docId) t
/**
* Assigns an ID of the form Ti to each gene in the CAS, i
* being an enumeration number beginning at 0.
- *
+ *
* @param aJCas
* @return
*/
@@ -408,9 +421,7 @@ private Map enumerateProteins(JCas aJCas) {
Gene gene = (Gene) geneIt.next();
if (gene.getBegin() < lastEnd)
continue;
- String id = gene.getId();
- // if (StringUtils.isBlank(id))
- id = "T" + i++;
+ String id = "T" + i++;
gene.setId(id);
proteins.put(id, gene);
lastEnd = gene.getEnd();
diff --git a/jcore-biosem-ae/src/test/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotatorTest.java b/jcore-biosem-ae/src/test/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotatorTest.java
index ae49970cd..da7a683de 100644
--- a/jcore-biosem-ae/src/test/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotatorTest.java
+++ b/jcore-biosem-ae/src/test/java/de/julielab/jcore/ae/biosem/BioSemEventAnnotatorTest.java
@@ -22,15 +22,15 @@
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ExternalResourceDescription;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import java.io.File;
import java.io.FileInputStream;
import java.util.Collections;
import java.util.List;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
public class BioSemEventAnnotatorTest {
@Test
@@ -53,7 +53,7 @@ public void testProcess() throws Exception {
if (testOutputFile.exists())
testOutputFile.delete();
- assertTrue("Test document was not found by the BioNLP ST reader.", bioNlpSTReader.hasNext());
+ assertTrue(bioNlpSTReader.hasNext(), "Test document was not found by the BioNLP ST reader.");
bioNlpSTReader.getNext(jCas.getCas());
engine.process(jCas);
bioNlpSTWriter.process(jCas);
diff --git a/jcore-conll-consumer/component.meta b/jcore-conll-consumer/component.meta
index e754ff444..2e94ca29d 100644
--- a/jcore-conll-consumer/component.meta
+++ b/jcore-conll-consumer/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-conll-consumer",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe CONLL Consumer"
}
diff --git a/jcore-conll-consumer/pom.xml b/jcore-conll-consumer/pom.xml
index fef60e5bf..cff35237d 100644
--- a/jcore-conll-consumer/pom.xml
+++ b/jcore-conll-consumer/pom.xml
@@ -4,7 +4,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0jcore-conll-consumer
@@ -24,8 +24,8 @@
logback-classic
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-engine
diff --git a/jcore-conll-consumer/src/main/resources/de/julielab/jcore/consumer/conll/desc/jcore-conll-consumer.xml b/jcore-conll-consumer/src/main/resources/de/julielab/jcore/consumer/conll/desc/jcore-conll-consumer.xml
index 30f0366eb..288790254 100644
--- a/jcore-conll-consumer/src/main/resources/de/julielab/jcore/consumer/conll/desc/jcore-conll-consumer.xml
+++ b/jcore-conll-consumer/src/main/resources/de/julielab/jcore/consumer/conll/desc/jcore-conll-consumer.xml
@@ -6,7 +6,7 @@
JCoRe Conll Consumer
- 2.5.1-SNAPSHOT
+ 2.6.0
diff --git a/jcore-conll-consumer/src/test/java/de/julielab/jcore/consumer/cas2conll/test/ConllConsumerTest.java b/jcore-conll-consumer/src/test/java/de/julielab/jcore/consumer/cas2conll/test/ConllConsumerTest.java
index cb66ca825..ad46ef663 100644
--- a/jcore-conll-consumer/src/test/java/de/julielab/jcore/consumer/cas2conll/test/ConllConsumerTest.java
+++ b/jcore-conll-consumer/src/test/java/de/julielab/jcore/consumer/cas2conll/test/ConllConsumerTest.java
@@ -21,7 +21,7 @@
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import java.io.BufferedReader;
import java.io.File;
@@ -30,7 +30,7 @@
import java.util.ArrayList;
import java.util.List;
-import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertTrue;
public class ConllConsumerTest {
diff --git a/jcore-coordination-baseline-ae/component.meta b/jcore-coordination-baseline-ae/component.meta
index c79a816e4..622f6ef43 100644
--- a/jcore-coordination-baseline-ae/component.meta
+++ b/jcore-coordination-baseline-ae/component.meta
@@ -26,7 +26,7 @@
"maven-artifact": {
"artifactId": "jcore-coordination-baseline-ae",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe Coordination Tagger Baseline"
}
diff --git a/jcore-coordination-baseline-ae/pom.xml b/jcore-coordination-baseline-ae/pom.xml
index eaff316fa..64cc11f48 100644
--- a/jcore-coordination-baseline-ae/pom.xml
+++ b/jcore-coordination-baseline-ae/pom.xml
@@ -13,7 +13,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0
@@ -37,8 +37,8 @@
test
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-engine
diff --git a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-conjunct.xml b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-conjunct.xml
index 1e5a6c860..4da7a5bbe 100644
--- a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-conjunct.xml
+++ b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-conjunct.xml
@@ -6,7 +6,7 @@
JCoRe ConjunctAnnotator
-2.5.1-SNAPSHOT
+2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-coordination.xml b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-coordination.xml
index b5db7b69b..706c3df7e 100644
--- a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-coordination.xml
+++ b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-coordination.xml
@@ -6,7 +6,7 @@
JCoRe CoordinationAnnotator
-2.5.1-SNAPSHOT
+2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-eee.xml b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-eee.xml
index 50c01690b..41bb97345 100644
--- a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-eee.xml
+++ b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-eee.xml
@@ -6,7 +6,7 @@
JCoRe EEEAnnotator
-2.5.1-SNAPSHOT
+2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-ellipsis.xml b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-ellipsis.xml
index 8e73905d3..bb4bfb5c1 100644
--- a/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-ellipsis.xml
+++ b/jcore-coordination-baseline-ae/src/main/resources/de/julielab/jcore/ae/coordbaseline/desc/jcore-coordination-baseline-ae-ellipsis.xml
@@ -6,7 +6,7 @@
JCoRe EllipsisAnnotator
-2.5.1-SNAPSHOT
+2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/ConjunctAnnotatorTest.java b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/ConjunctAnnotatorTest.java
index fdca4b78e..6eb0c2ee6 100644
--- a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/ConjunctAnnotatorTest.java
+++ b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/ConjunctAnnotatorTest.java
@@ -7,7 +7,6 @@
package de.julielab.jcore.ae.coordbaseline.main;
import de.julielab.jcore.types.*;
-import junit.framework.TestCase;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.cas.FSIterator;
@@ -17,13 +16,16 @@
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;
import org.apache.uima.util.XMLInputSource;
+import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
+import static org.junit.jupiter.api.Assertions.assertTrue;
-public class ConjunctAnnotatorTest extends TestCase
+
+public class ConjunctAnnotatorTest
{
private static final Logger LOGGER = LoggerFactory.getLogger(ConjunctAnnotatorTest.class);
private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties";
@@ -32,19 +34,8 @@ public class ConjunctAnnotatorTest extends TestCase
private static final String coordinationLabels2 = "antecedent,conjunct,conjunction,conjunct,antecedent,antecedent";
private static final String TEST_DESC = "src/test/resources/desc/ConjunctAnnotatorTest.xml";
-
-
-
-
-
-
-/*--------------------------------------------------------------------------------*/
- protected void setUp() throws Exception
- {
- super.setUp();
- } // of setUp
-/*--------------------------------------------------------------------------------*/
- public void initCas(JCas jcas)
+
+ public void initCas(JCas jcas)
{
jcas.reset();
@@ -558,6 +549,7 @@ public void initCas(JCas jcas)
} // of initCas
/*--------------------------------------------------------------------------------*/
+ @Test
public void testProcess()
{
XMLInputSource descriptor = null;
@@ -595,7 +587,7 @@ public void testProcess()
try
{
ae.process(jcas, null);
- assertTrue("Invalid JCas!", checkJCas(jcas));
+ assertTrue(checkJCas(jcas), "Invalid JCas!");
} // of try
catch (Exception e)
{
diff --git a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/CoordinationAnnotatorTest.java b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/CoordinationAnnotatorTest.java
index dd5416a7a..32662f928 100644
--- a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/CoordinationAnnotatorTest.java
+++ b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/CoordinationAnnotatorTest.java
@@ -22,7 +22,6 @@
package de.julielab.jcore.ae.coordbaseline.main;
import de.julielab.jcore.types.*;
-import junit.framework.TestCase;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.cas.FSIterator;
@@ -32,13 +31,16 @@
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;
import org.apache.uima.util.XMLInputSource;
+import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
+import static org.junit.jupiter.api.Assertions.assertTrue;
-public class CoordinationAnnotatorTest extends TestCase
+
+public class CoordinationAnnotatorTest
{
private static final Logger LOGGER = LoggerFactory.getLogger(CoordinationAnnotatorTest.class);
@@ -55,11 +57,7 @@ public class CoordinationAnnotatorTest extends TestCase
private static final String TEST_DESC = "src/test/resources/desc/CoordinationAnnotatorTest.xml";
- protected void setUp() throws Exception
- {
- super.setUp();
- } // of setUp
-
+
public void initCas(JCas jcas)
{
jcas.reset();
@@ -562,7 +560,7 @@ public void initCas(JCas jcas)
} // of initCas
-
+ @Test
public void testProcess()
{
XMLInputSource descriptor = null;
@@ -598,7 +596,7 @@ public void testProcess()
try
{
ae.process(jcas, null);
- assertTrue("Invalid JCas!", checkJCas(jcas));
+ assertTrue(checkJCas(jcas), "Invalid JCas!");
} // of try
catch (Exception e)
diff --git a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EEEAnnotatorTest.java b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EEEAnnotatorTest.java
index a010c3178..4203cdc16 100644
--- a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EEEAnnotatorTest.java
+++ b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EEEAnnotatorTest.java
@@ -7,7 +7,6 @@
package de.julielab.jcore.ae.coordbaseline.main;
import de.julielab.jcore.types.*;
-import junit.framework.TestCase;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.cas.FSIterator;
@@ -17,13 +16,16 @@
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;
import org.apache.uima.util.XMLInputSource;
+import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
+import static org.junit.jupiter.api.Assertions.assertTrue;
-public class EEEAnnotatorTest extends TestCase
+
+public class EEEAnnotatorTest
{
private static final Logger LOGGER = LoggerFactory.getLogger(EEEAnnotatorTest.class);
private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties";
@@ -33,12 +35,7 @@ public class EEEAnnotatorTest extends TestCase
private static final String EEE2 = "simple upstream and downstream sequence elements";
private static final String TEST_DESC = "src/test/resources/desc/EEEAnnotatorTest.xml";
-/*--------------------------------------------------------------------------------*/
- protected void setUp() throws Exception
- {
- super.setUp();
- } // of setUp
-/*--------------------------------------------------------------------------------*/
+
public void initCas(JCas jcas)
{
jcas.reset();
@@ -538,6 +535,8 @@ public void initCas(JCas jcas)
entity3.addToIndexes();
} // of initCas
/*--------------------------------------------------------------------------------*/
+
+ @Test
public void testProcess()
{
XMLInputSource descriptor = null;
@@ -575,7 +574,7 @@ public void testProcess()
try
{
ae.process(jcas, null);
- assertTrue("Invalid JCas!", checkJCas(jcas));
+ assertTrue(checkJCas(jcas), "Invalid JCas!");
} // of try
diff --git a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EllipsisAnnotatorTest.java b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EllipsisAnnotatorTest.java
index 749371a51..94d697619 100644
--- a/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EllipsisAnnotatorTest.java
+++ b/jcore-coordination-baseline-ae/src/test/java/de/julielab/jcore/ae/coordbaseline/main/EllipsisAnnotatorTest.java
@@ -7,7 +7,6 @@
package de.julielab.jcore.ae.coordbaseline.main;
import de.julielab.jcore.types.*;
-import junit.framework.TestCase;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.cas.FSIterator;
@@ -17,12 +16,15 @@
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;
import org.apache.uima.util.XMLInputSource;
+import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
-public class EllipsisAnnotatorTest extends TestCase
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class EllipsisAnnotatorTest
{
private static final Logger LOGGER = LoggerFactory.getLogger(EllipsisAnnotatorTest.class);
private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties";
@@ -30,12 +32,7 @@ public class EllipsisAnnotatorTest extends TestCase
private static final String ellipsis1 = "X cells, Y cells, and Z cells";
private static final String ellipsis2 = "simple upstream sequence elements and simple downstream sequence elements";
private static final String TEST_DESC = "src/test/resources/desc/EllipsisAnnotatorTest.xml";
-/*--------------------------------------------------------------------------------*/
- protected void setUp() throws Exception
- {
- super.setUp();
- } // of setUp
-/*--------------------------------------------------------------------------------*/
+
public void initCas(JCas jcas)
{
jcas.reset();
@@ -697,6 +694,8 @@ public void initCas(JCas jcas)
c26.addToIndexes();
} // of initCas
/*---------------------------------------------------------------------------*/
+
+ @Test
public void testProcess()
{
XMLInputSource descriptor = null;
@@ -734,7 +733,7 @@ public void testProcess()
try
{
ae.process(jcas, null);
- assertTrue("Invalid JCas!", checkJCas(jcas));
+ assertTrue(checkJCas(jcas), "Invalid JCas!");
} // of try
catch (Exception e)
{
diff --git a/jcore-coordination-baseline-ae/src/test/resources/desc/ConjunctAnnotatorTest.xml b/jcore-coordination-baseline-ae/src/test/resources/desc/ConjunctAnnotatorTest.xml
index 50c97ebbc..29f9e5d35 100644
--- a/jcore-coordination-baseline-ae/src/test/resources/desc/ConjunctAnnotatorTest.xml
+++ b/jcore-coordination-baseline-ae/src/test/resources/desc/ConjunctAnnotatorTest.xml
@@ -6,7 +6,7 @@
ConjunctAnnotator
-2.5.1-SNAPSHOT
+2.6.0
diff --git a/jcore-coordination-baseline-ae/src/test/resources/desc/CoordinationAnnotatorTest.xml b/jcore-coordination-baseline-ae/src/test/resources/desc/CoordinationAnnotatorTest.xml
index ca9a48170..c3245f36b 100644
--- a/jcore-coordination-baseline-ae/src/test/resources/desc/CoordinationAnnotatorTest.xml
+++ b/jcore-coordination-baseline-ae/src/test/resources/desc/CoordinationAnnotatorTest.xml
@@ -6,7 +6,7 @@
CoordinationAnnotator
-2.5.1-SNAPSHOT
+2.6.0
diff --git a/jcore-coordination-baseline-ae/src/test/resources/desc/EEEAnnotatorTest.xml b/jcore-coordination-baseline-ae/src/test/resources/desc/EEEAnnotatorTest.xml
index 3683f5210..4fa87c0a9 100644
--- a/jcore-coordination-baseline-ae/src/test/resources/desc/EEEAnnotatorTest.xml
+++ b/jcore-coordination-baseline-ae/src/test/resources/desc/EEEAnnotatorTest.xml
@@ -6,7 +6,7 @@
EEEAnnotator
-2.5.1-SNAPSHOT
+2.6.0
diff --git a/jcore-coordination-baseline-ae/src/test/resources/desc/EllipsisAnnotatorTest.xml b/jcore-coordination-baseline-ae/src/test/resources/desc/EllipsisAnnotatorTest.xml
index beea12e3e..85ce7558b 100644
--- a/jcore-coordination-baseline-ae/src/test/resources/desc/EllipsisAnnotatorTest.xml
+++ b/jcore-coordination-baseline-ae/src/test/resources/desc/EllipsisAnnotatorTest.xml
@@ -6,7 +6,7 @@
EllipsisAnnotator
-2.5.1-SNAPSHOT
+2.6.0
diff --git a/jcore-cord19-reader/component.meta b/jcore-cord19-reader/component.meta
index 3fd15f733..fd42cd349 100644
--- a/jcore-cord19-reader/component.meta
+++ b/jcore-cord19-reader/component.meta
@@ -19,7 +19,7 @@
"maven-artifact": {
"artifactId": "jcore-cord19-reader",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe CORD-19 Reader"
}
diff --git a/jcore-cord19-reader/pom.xml b/jcore-cord19-reader/pom.xml
index a5a7d9d00..833b22db6 100644
--- a/jcore-cord19-reader/pom.xml
+++ b/jcore-cord19-reader/pom.xml
@@ -10,7 +10,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0
@@ -48,8 +48,8 @@
${jcore-utilities-version}
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-engineorg.assertj
diff --git a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReader.java b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReader.java
index 5789d935b..3b8b9ff35 100644
--- a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReader.java
+++ b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReader.java
@@ -73,7 +73,11 @@ public void getNext(JCas jCas) throws CollectionException {
Path p = currentFileBatch.get(currentBatchIndex);
if (p != Cord19FileVisitor.END) {
JCoReURI uri = new JCoReURI(jCas);
- uri.setUri(p.toUri().toString());
+ try {
+ uri.setUri(p.toUri().toString());
+ } catch (NullPointerException e) {
+ log.error("Could not retrieve URI string for path {}, resolved URI {}", p, p!= null ? p.toUri() : "");
+ }
uri.addToIndexes();
++completed;
}
diff --git a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19Reader.java b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19Reader.java
index 54a9f1d5c..60939db2b 100644
--- a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19Reader.java
+++ b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/Cord19Reader.java
@@ -121,31 +121,40 @@ private void addBody(JCas jCas, StringBuilder doctext, Cord19Document document)
}
private void addAbstract(JCas jCas, StringBuilder doctext, Cord19Document document) {
- List sections = new ArrayList<>(document.getAbstr().size());
- int abstractBegin = doctext.length();
- // Stores the end of the last paragraph before the newline
- int lastEnd = 0;
- for (Paragraph p : document.getAbstr()) {
- int paragraphBegin = doctext.length();
- AbstractSection as = new AbstractSection(jCas, paragraphBegin, doctext.length() + p.getText().length());
- doctext.append(p.getText());
- lastEnd = doctext.length();
- doctext.append(linesep);
- AbstractSectionHeading asHeading = new AbstractSectionHeading(jCas);
- asHeading.setTitleType("abstract");
- asHeading.setLabel(p.getSection());
- as.setAbstractSectionHeading(asHeading);
- sections.add(as);
- addReferences(p, Paragraph::getRefSpans, paragraphBegin, jCas);
- addReferences(p, Paragraph::getEqSpans, paragraphBegin, jCas);
- addReferences(p, Paragraph::getCiteSpans, paragraphBegin, jCas);
- }
- if (lastEnd - abstractBegin > 0) {
- AbstractText abstractText = new AbstractText(jCas, abstractBegin, lastEnd);
- abstractText.setAbstractType("main");
- abstractText.setStructuredAbstractParts(JCoReTools.addToFSArray(null, sections));
- abstractText.addToIndexes();
- doctext.append(linesep);
+ MetadataRecord metadataRecord = metadataIdMap.get(document.getPaperId());
+ if (metadataRecord != null && metadataRecord.getAbstractText() != null && !metadataRecord.getAbstractText().isBlank()) {
+ String abstractText = metadataRecord.getAbstractText();
+ AbstractText abstractAnnotation = new AbstractText(jCas, doctext.length(), doctext.length() + abstractText.length());
+ abstractAnnotation.setAbstractType("main");
+ abstractAnnotation.addToIndexes();
+ doctext.append(abstractText);
+ } else {
+ List sections = new ArrayList<>(document.getAbstr().size());
+ int abstractBegin = doctext.length();
+ // Stores the end of the last paragraph before the newline
+ int lastEnd = 0;
+ for (Paragraph p : document.getAbstr()) {
+ int paragraphBegin = doctext.length();
+ AbstractSection as = new AbstractSection(jCas, paragraphBegin, doctext.length() + p.getText().length());
+ doctext.append(p.getText());
+ lastEnd = doctext.length();
+ doctext.append(linesep);
+ AbstractSectionHeading asHeading = new AbstractSectionHeading(jCas);
+ asHeading.setTitleType("abstract");
+ asHeading.setLabel(p.getSection());
+ as.setAbstractSectionHeading(asHeading);
+ sections.add(as);
+ addReferences(p, Paragraph::getRefSpans, paragraphBegin, jCas);
+ addReferences(p, Paragraph::getEqSpans, paragraphBegin, jCas);
+ addReferences(p, Paragraph::getCiteSpans, paragraphBegin, jCas);
+ }
+ if (lastEnd - abstractBegin > 0) {
+ AbstractText abstractText = new AbstractText(jCas, abstractBegin, lastEnd);
+ abstractText.setAbstractType("main");
+ abstractText.setStructuredAbstractParts(JCoReTools.addToFSArray(null, sections));
+ abstractText.addToIndexes();
+ doctext.append(linesep);
+ }
}
}
@@ -164,7 +173,7 @@ private void addReferences(Paragraph p, Function>
private void addTitle(JCas jCas, Cord19Document document, MetadataRecord metadataRecord, StringBuilder doctext) {
if (metadataRecord != null) {
String title = metadataRecord.getTitle();
- if (title != null) {
+ if (title != null && !title.isBlank()) {
addTitle(jCas, title, doctext);
}
} else {
@@ -221,9 +230,10 @@ private void readMetaData(String metadataFile) {
String cordUid = record.get("cord_uid");
String sha = record.get("sha");
String title = record.get("title");
+ String abstractText = record.get("abstract");
String pmcid = record.get("pmcid");
String pmid = record.get("pubmed_id");
- MetadataRecord metadataRecord = new MetadataRecord(cordUid, sha, pmcid, pmid, title);
+ MetadataRecord metadataRecord = new MetadataRecord(cordUid, sha, pmcid, pmid, title, abstractText);
for (String hash : metadataRecord.hashes)
metadataIdMap.put(hash, metadataRecord);
if (pmcid != null)
@@ -244,13 +254,19 @@ private static class MetadataRecord {
private final String pmid;
private final String[] hashes;
private final String title;
+ private String abstractText;
- public MetadataRecord(String cordUid, String sha, String pmcid, String pmid, String title) {
+ public MetadataRecord(String cordUid, String sha, String pmcid, String pmid, String title, String abstractText) {
this.cordUid = cordUid;
this.pmcid = pmcid;
this.pmid = pmid;
this.title = title;
this.hashes = Arrays.stream(sha.split(";")).map(String::trim).toArray(String[]::new);
+ this.abstractText = abstractText;
+ }
+
+ public String getAbstractText() {
+ return abstractText;
}
public String getCordUid() {
diff --git a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/jsonformat/TabFigRef.java b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/jsonformat/TabFigRef.java
index bfe873c48..d35bc534e 100644
--- a/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/jsonformat/TabFigRef.java
+++ b/jcore-cord19-reader/src/main/java/de/julielab/jcore/reader/cord19/jsonformat/TabFigRef.java
@@ -19,6 +19,16 @@ public class TabFigRef {
private String text;
private String type;
private String latex;
+ private String html;
+
+ public String getHtml() {
+
+ return html;
+ }
+
+ public void setHtml(String html) {
+ this.html = html;
+ }
public String getLatex() {
return latex;
diff --git a/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier-reader.xml b/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier-reader.xml
index 90f5da426..4cdd4203f 100644
--- a/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier-reader.xml
+++ b/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier-reader.xml
@@ -5,7 +5,7 @@
JCoRe CORD-19 Multiplier ReaderThis component reads file paths to JSON files and the CORD-19 (https://pages.semanticscholar.org/coronavirus-research) meta data file to send them to CAS multipliers.
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier.xml b/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier.xml
index b539b1511..c3da5e650 100644
--- a/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier.xml
+++ b/jcore-cord19-reader/src/main/resources/de/julielab/jcore/reader/cord19/desc/jcore-cord19-multiplier.xml
@@ -6,7 +6,7 @@
JCoRe CORD-19 CAS MultiplierThis component reads the CORD-19 (https://pages.semanticscholar.org/coronavirus-research) JSON format into UIMA CAS instances.
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReaderTest.java b/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReaderTest.java
index b5922a816..0453a1cde 100644
--- a/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReaderTest.java
+++ b/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/Cord19MultiplierReaderTest.java
@@ -15,7 +15,7 @@
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import java.nio.file.Path;
import java.util.Collection;
@@ -63,9 +63,8 @@ private void checkSecondDocument(JCas cas) {
assertThat(documentTitles.get(0)).extracting(Annotation::getCoveredText).isEqualTo("Recombinant M protein-based ELISA test for detection of antibodies to canine coronavirus");
AbstractText abstractText = JCasUtil.selectSingle(cas, AbstractText.class);
- assertThat(abstractText.getCoveredText()).startsWith("The membrane (M) protein of canine");
- assertThat(abstractText.getCoveredText()).endsWith("antibodies to CCoV in dog sera. #");
- assertThat(abstractText.getStructuredAbstractParts()).hasSize(1);
+ assertThat(abstractText.getCoveredText()).startsWith("Abstract The membrane (M) protein of canine");
+ assertThat(abstractText.getCoveredText()).endsWith("antibodies to CCoV in dog sera.");
Collection paragraphs = JCasUtil.select(cas, Paragraph.class);
assertThat(paragraphs).hasSize(19);
diff --git a/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/JsonFormatTest.java b/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/JsonFormatTest.java
index dba932cac..5e39b79d0 100644
--- a/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/JsonFormatTest.java
+++ b/jcore-cord19-reader/src/test/java/de/julielab/jcore/reader/cord19/JsonFormatTest.java
@@ -4,7 +4,7 @@
import de.julielab.jcore.reader.cord19.jsonformat.Affiliation;
import de.julielab.jcore.reader.cord19.jsonformat.Author;
import de.julielab.jcore.reader.cord19.jsonformat.Cord19Document;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Path;
diff --git a/jcore-coreference-writer/LICENSE b/jcore-coreference-writer/LICENSE
new file mode 100644
index 000000000..7190118b3
--- /dev/null
+++ b/jcore-coreference-writer/LICENSE
@@ -0,0 +1,26 @@
+BSD 2-Clause License
+
+Copyright (c) 2021, JULIE Lab
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/jcore-coreference-writer/README.md b/jcore-coreference-writer/README.md
new file mode 100644
index 000000000..da767a4d1
--- /dev/null
+++ b/jcore-coreference-writer/README.md
@@ -0,0 +1,26 @@
+# JCoRe Coreference Writer
+
+**Descriptor Path**:
+```
+de.julielab.jcore.consumer.coreference.desc.jcore-coreference-writer
+```
+
+Writes coreference annotations from the CAS to a text file format.
+
+
+
+**1. Parameters**
+
+| Parameter Name | Parameter Type | Mandatory | Multivalued | Description |
+|----------------|----------------|-----------|-------------|-------------|
+| OutputFile | string | true | false | Path to the output file. |
+
+
+**2. Capabilities**
+
+| Type | Input | Output |
+|------|:-----:|:------:|
+| de.julielab.jcore.types.CorefRelation | `+` | |
+
+
+
diff --git a/jcore-coreference-writer/component.meta b/jcore-coreference-writer/component.meta
new file mode 100644
index 000000000..bbfba5b64
--- /dev/null
+++ b/jcore-coreference-writer/component.meta
@@ -0,0 +1,20 @@
+{
+ "categories": [
+ "consumer"
+ ],
+ "description": "Writes coreference annotations from the CAS to a text file format.",
+ "descriptors": [
+ {
+ "category": "consumer",
+ "location": "de.julielab.jcore.consumer.coreference.desc.jcore-coreference-writer"
+ }
+ ],
+ "exposable": true,
+ "group": "general",
+ "maven-artifact": {
+ "artifactId": "jcore-coreference-writer",
+ "groupId": "de.julielab",
+ "version": "2.6.0"
+ },
+ "name": "JCoRe Coreference Writer"
+}
diff --git a/jcore-coreference-writer/pom.xml b/jcore-coreference-writer/pom.xml
new file mode 100644
index 000000000..ad4aac828
--- /dev/null
+++ b/jcore-coreference-writer/pom.xml
@@ -0,0 +1,61 @@
+
+
+
+ 4.0.0
+ jcore-coreference-writer
+ jar
+
+
+ de.julielab
+ jcore-base
+ 2.6.0
+
+
+
+
+ de.julielab
+ jcore-descriptor-creator
+
+
+ de.julielab
+ jcore-utilities
+ ${jcore-utilities-version}
+
+
+ de.julielab
+ julielab-java-utilities
+
+
+ ch.qos.logback
+ logback-classic
+ test
+
+
+ org.slf4j
+ slf4j-api
+
+
+ de.julielab
+ jcore-types
+ ${jcore-types-version}
+
+
+ org.junit.jupiter
+ junit-jupiter-engine
+
+
+ JCoRe Coreference Writer
+
+ JULIE Lab Jena, Germany
+ http://www.julielab.de
+
+
+
+ BSD-2-Clause
+ https://opensource.org/licenses/BSD-2-Clause
+
+
+ https://github.com/JULIELab/jcore-base/tree/master/jcore-coreference-writer
+ Writes coreference annotations from the CAS to a text file format.
+
diff --git a/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java b/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java
new file mode 100644
index 000000000..c85dcfa82
--- /dev/null
+++ b/jcore-coreference-writer/src/main/java/de/julielab/jcore/consumer/coreference/CoreferenceWriter.java
@@ -0,0 +1,87 @@
+package de.julielab.jcore.consumer.coreference;
+
+import de.julielab.java.utilities.FileUtilities;
+import de.julielab.jcore.types.CorefExpression;
+import de.julielab.jcore.types.CorefRelation;
+import de.julielab.jcore.utility.JCoReTools;
+import org.apache.commons.io.IOUtils;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASRuntimeException;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.descriptor.ResourceMetaData;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.Iterator;
+
+@ResourceMetaData(name = "JCoRe Coreference Writer", description = "Writes co-reference annotation to a text file.")
+public class CoreferenceWriter extends JCasAnnotator_ImplBase {
+
+ public static final String PARAM_OUTPUTFILE = "OutputFile";
+
+ @ConfigurationParameter(name = PARAM_OUTPUTFILE)
+ private String outputFile;
+ private OutputStream os;
+
+ @Override
+ public void initialize(UimaContext aContext) throws ResourceInitializationException {
+ super.initialize(aContext);
+ outputFile = (String) aContext.getConfigParameterValue(PARAM_OUTPUTFILE);
+ try {
+ os = FileUtilities.getOutputStreamToFile(new File(outputFile));
+ } catch (IOException e) {
+ throw new ResourceInitializationException(e);
+ }
+ }
+
+ @Override
+ public void process(JCas jcas) throws AnalysisEngineProcessException {
+ try {
+ String pubmedId = JCoReTools.getDocId(jcas);
+ FSIterator it = jcas.getAnnotationIndex(CorefRelation.type).iterator();
+
+ int relcount = 0;
+ while (it.hasNext()) {
+ CorefRelation rel = it.next();
+ de.julielab.jcore.types.Annotation anaphora = rel.getAnaphora();
+
+ String abbrId = "Ana" + relcount;
+
+ IOUtils.write(String.join("\t", pubmedId, abbrId, String.valueOf(anaphora.getBegin()),
+ String.valueOf(anaphora.getEnd())) + "\n", os, "UTF-8");
+
+ Iterator antecedentsIt = rel.getAntecedents() != null ? rel.getAntecedents().iterator() : null;
+ while (antecedentsIt != null && antecedentsIt.hasNext()) {
+ CorefExpression antecedent = (CorefExpression) antecedentsIt.next();
+ if (antecedent != null) {
+ String antecedentGroup = "Ant" + relcount;
+ IOUtils.write(String.join("\t", pubmedId, antecedentGroup, String.valueOf(antecedent.getBegin()),
+ String.valueOf(antecedent.getEnd())) + "\n", os, "UTF-8");
+ }
+ }
+
+
+ ++relcount;
+ }
+ } catch (CASRuntimeException | IOException e) {
+ throw new AnalysisEngineProcessException(e);
+ }
+ }
+
+ @Override
+ public void collectionProcessComplete() throws AnalysisEngineProcessException {
+ try {
+ os.close();
+ } catch (IOException e) {
+ throw new AnalysisEngineProcessException(e);
+ }
+ }
+
+}
diff --git a/jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/jcore-coreference-writer.xml b/jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/jcore-coreference-writer.xml
new file mode 100644
index 000000000..b31bb30bb
--- /dev/null
+++ b/jcore-coreference-writer/src/main/resources/de/julielab/jcore/consumer/coreference/desc/jcore-coreference-writer.xml
@@ -0,0 +1,33 @@
+
+
+ org.apache.uima.java
+ true
+ de.julielab.jcore.consumer.coreference.CoreferenceWriter
+
+ JCoRe Coreference Writer
+ Writes coreference annotation to a text file.
+ 2.6.0
+
+
+ OutputFile
+
+ String
+ false
+ true
+
+
+
+
+
+
+
+
+
+
+
+ true
+ true
+ false
+
+
+
\ No newline at end of file
diff --git a/jcore-coreference-writer/src/test/java/de/julielab/jcore/consumer/coreference/CoreferenceWriterTest.java b/jcore-coreference-writer/src/test/java/de/julielab/jcore/consumer/coreference/CoreferenceWriterTest.java
new file mode 100644
index 000000000..7b7bf0429
--- /dev/null
+++ b/jcore-coreference-writer/src/test/java/de/julielab/jcore/consumer/coreference/CoreferenceWriterTest.java
@@ -0,0 +1,10 @@
+
+package de.julielab.jcore.consumer.coreference;
+
+/**
+ * Unit tests for jcore-coreference-writer.
+ *
+ */
+public class CoreferenceWriterTest {
+// TODO
+}
diff --git a/jcore-cpe-db-runner/pom.xml b/jcore-cpe-db-runner/pom.xml
index d84ab5a84..62e879169 100644
--- a/jcore-cpe-db-runner/pom.xml
+++ b/jcore-cpe-db-runner/pom.xml
@@ -71,8 +71,8 @@
${project.parent.version}
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-enginehttps://github.com/JULIELab/jcore-base/tree/master/jcore-cpe-db-runner
diff --git a/jcore-ct-reader/component.meta b/jcore-ct-reader/component.meta
index a131ea835..6e0600b4f 100644
--- a/jcore-ct-reader/component.meta
+++ b/jcore-ct-reader/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-ct-reader",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe Clinical Trials Reader"
}
diff --git a/jcore-ct-reader/pom.xml b/jcore-ct-reader/pom.xml
index bfc239518..4ea1f5969 100644
--- a/jcore-ct-reader/pom.xml
+++ b/jcore-ct-reader/pom.xml
@@ -9,7 +9,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0
@@ -41,8 +41,8 @@
${jcore-utilities-version}
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-engineJCoRe Clinical Trials Reader
diff --git a/jcore-ct-reader/src/main/resources/de/julielab/jcore/reader/ct/desc/jcore-clinicaltrials-reader.xml b/jcore-ct-reader/src/main/resources/de/julielab/jcore/reader/ct/desc/jcore-clinicaltrials-reader.xml
index 100df0acd..b0eaa2ae4 100644
--- a/jcore-ct-reader/src/main/resources/de/julielab/jcore/reader/ct/desc/jcore-clinicaltrials-reader.xml
+++ b/jcore-ct-reader/src/main/resources/de/julielab/jcore/reader/ct/desc/jcore-clinicaltrials-reader.xml
@@ -5,7 +5,7 @@
JCoRe Clinical Trials ReaderThis component reads the XML format provided by ClinicalTrials.gov. To this end, the JCoRe type system contains a number of types specifically created for this kind of document. Note that the CAS text created by this reader might be confusing without checking the corresponding annotations. This is due to the fact that the CT XML contains multiple enumerations which are not very well reflected in plain text. Also, enumerations with subitems, such as the outcomes, are not displayed in the expected groups of items. Instead, each item type is displayed separately. This could be changed, if necessary. Since all items are correctly annotated by their category, this might not even be an issue, depending on the downstream tasks.
- 2.5.1-SNAPSHOT
+ 2.6.0InputDirectory
diff --git a/jcore-ct-reader/src/test/java/de/julielab/jcore/reader/ct/ClinicalTrialsReaderTest.java b/jcore-ct-reader/src/test/java/de/julielab/jcore/reader/ct/ClinicalTrialsReaderTest.java
index b1aa75967..140b19874 100644
--- a/jcore-ct-reader/src/test/java/de/julielab/jcore/reader/ct/ClinicalTrialsReaderTest.java
+++ b/jcore-ct-reader/src/test/java/de/julielab/jcore/reader/ct/ClinicalTrialsReaderTest.java
@@ -11,13 +11,13 @@
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.Collection;
import static org.assertj.core.api.Assertions.assertThat;
-import static org.junit.Assert.*;
+import static org.junit.jupiter.api.Assertions.*;
/**
* Unit tests for jcore-ct-reader.
diff --git a/jcore-db-checkpoint-ae/README.md b/jcore-db-checkpoint-ae/README.md
index 6a4ed4f4b..a74f91d53 100644
--- a/jcore-db-checkpoint-ae/README.md
+++ b/jcore-db-checkpoint-ae/README.md
@@ -2,7 +2,8 @@
**Descriptor Path**:
```
-de.julielab.desc.jcore-db-checkpoint-ae
+de.julielab.jcore.ae.checkpoint.desc.jcore-db-checkpoint-ae
+de.julielab.jcore.ae.checkpoint.desc.jcore-db-checkpoint-consumer
```
This is a JeDiS[1] component. It can be used to set the 'last component' column in a subset table. This help to keep track of the pipeline status.
diff --git a/jcore-db-checkpoint-ae/component.meta b/jcore-db-checkpoint-ae/component.meta
index b703ae5c4..db83ca2a7 100644
--- a/jcore-db-checkpoint-ae/component.meta
+++ b/jcore-db-checkpoint-ae/component.meta
@@ -19,7 +19,7 @@
"maven-artifact": {
"artifactId": "jcore-db-checkpoint-ae",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe Database Checkpoint AE"
}
diff --git a/jcore-db-checkpoint-ae/pom.xml b/jcore-db-checkpoint-ae/pom.xml
index 3cac45687..13fede4b9 100644
--- a/jcore-db-checkpoint-ae/pom.xml
+++ b/jcore-db-checkpoint-ae/pom.xml
@@ -9,7 +9,7 @@
de.julielabjedis-parent
- 2.5.1-SNAPSHOT
+ 2.6.0../jedis-parent
diff --git a/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DBCheckpointAE.java b/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DBCheckpointAE.java
index 1a70c23cd..cf6f77e9e 100644
--- a/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DBCheckpointAE.java
+++ b/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DBCheckpointAE.java
@@ -69,6 +69,7 @@ public class DBCheckpointAE extends JCasAnnotator_ImplBase {
*/
@Override
public void initialize(final UimaContext aContext) throws ResourceInitializationException {
+ super.initialize(aContext);
componentDbName = (String) aContext.getConfigParameterValue(PARAM_CHECKPOINT_NAME);
dbcConfigPath = (String) aContext.getConfigParameterValue(PARAM_COSTOSYS_CONFIG);
indicateFinished = Optional.ofNullable((Boolean) aContext.getConfigParameterValue(PARAM_INDICATE_FINISHED)).orElse(false);
@@ -108,7 +109,7 @@ public void batchProcessComplete() throws AnalysisEngineProcessException {
@Override
public void collectionProcessComplete() throws AnalysisEngineProcessException {
super.collectionProcessComplete();
- log.debug("BatchProcessComplete called, stashing {} documents to be ready for marked as being finished", docIds.size());
+ log.debug("CollectionProcessComplete called, stashing {} documents to be ready for marked as being finished", docIds.size());
if (indicateFinished)
docReleaseCheckpoint.release(jedisSyncKey, docIds.stream());
try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) {
@@ -120,6 +121,7 @@ public void collectionProcessComplete() throws AnalysisEngineProcessException {
}
private void customBatchProcessingComplete() throws AnalysisEngineProcessException {
+ log.debug("CustomBatchProcessComplete called, stashing {} documents to be ready for marked as being finished", docIds.size());
if (indicateFinished)
docReleaseCheckpoint.release(jedisSyncKey, docIds.stream());
try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) {
@@ -134,6 +136,7 @@ private void customBatchProcessingComplete() throws AnalysisEngineProcessExcepti
*/
@Override
public void process(final JCas aJCas) throws AnalysisEngineProcessException {
+ log.trace("Processing jCas instance " + aJCas);
DocumentId documentId;
try {
final DBProcessingMetaData dbProcessingMetaData = JCasUtil.selectSingle(aJCas, DBProcessingMetaData.class);
@@ -198,13 +201,25 @@ private void setLastComponent(CoStoSysConnection conn, String
sqlMarkIsProcessed = String.format("UPDATE %s SET %s='%s', %s=TRUE, %s=FALSE WHERE %s", subsetTableName, Constants.LAST_COMPONENT, componentDbName, Constants.IS_PROCESSED, Constants.IN_PROCESS, primaryKeyPsString);
if (!documentIdsToSetLastComponent.isEmpty()) {
- log.debug("Setting the last component to {} for {} documents", componentDbName, documentIdsToSetLastComponent.size());
+ log.debug("Setting the last component to '{}' for {} documents", componentDbName, documentIdsToSetLastComponent.size());
updateSubsetTable(conn, documentIdsToSetLastComponent, sqlSetLastComponent);
}
if (markIsProcessed) {
- log.debug("Marking {} documents to having been processed by component \"{}\".", documentIdsToSetLastComponent.size(), componentDbName);
+ log.debug("Marking {} documents to having been processed by component \"{}\".", processedDocumentIds.size(), componentDbName);
+ log.debug("SQL: {}", sqlMarkIsProcessed);
+ log.trace("Marking the following document IDS as having been processed: {}", processedDocumentIds);
updateSubsetTable(conn, processedDocumentIds, sqlMarkIsProcessed);
}
+ try {
+ log.debug("Connection is auto commit: {}", conn.getAutoCommit());
+ if (!conn.getAutoCommit()) {
+ log.debug("Committing changes");
+ conn.commit();
+ }
+ } catch (SQLException e) {
+ log.error("Could not commit the document processing status changes.", e);
+ throw new AnalysisEngineProcessException(e);
+ }
}
private void updateSubsetTable(CoStoSysConnection conn, Collection documentIdsToMark, String sql) throws AnalysisEngineProcessException {
@@ -221,6 +236,7 @@ private void updateSubsetTable(CoStoSysConnection conn, Collection d
ps.addBatch();
}
try {
+ log.debug("Executing SQL command batch for being processed.");
ps.executeBatch();
} catch (BatchUpdateException e) {
if (e.getMessage().contains("deadlock detected")) {
diff --git a/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DocumentReleaseCheckpoint.java b/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DocumentReleaseCheckpoint.java
index e67750ed5..994063406 100644
--- a/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DocumentReleaseCheckpoint.java
+++ b/jcore-db-checkpoint-ae/src/main/java/de/julielab/jcore/ae/checkpoint/DocumentReleaseCheckpoint.java
@@ -1,11 +1,11 @@
package de.julielab.jcore.ae.checkpoint;
-import com.google.common.collect.HashMultiset;
-import com.google.common.collect.Multiset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.util.HashMap;
import java.util.HashSet;
+import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -14,7 +14,7 @@
*
This is class is a synchronization point for JeDIS components to report documents as being completely finished
* with processing.
*
Problem explanation: This synchronization is necessary because most database operating components work in batch mode for
- * performance reasons. However, if multiple components use batching with might be out of sync due to different
+ * performance reasons. However, if multiple components use batching which might be out of sync due to different
* batch sizes and possibly other factors, one component may have sent a batch of document data to the database
* while other components have not at a particular point in time. If at such a time point the pipeline crashes
* or is manually interrupted, the actually written data is incoherent in the sense that some components have sent
@@ -33,17 +33,18 @@ public class DocumentReleaseCheckpoint {
"This is useful when document data is sent batchwise to the database by multiple components: In the case of a crash or manual cancellation of a pipeline run without synchronization is might happen " +
"that some components have sent their data and others haven't at the time of termination. To avoid an inconsistent database state," +
"a document will only be marked as finished " +
- "processed in the JeDIS subset table if all synchronied components in the pipeline have released the document. " +
+ "processed in the JeDIS subset table if all synchronized components in the pipeline have released the document. " +
"This is done by the DBCheckpointAE which must be at the end of the pipeline and have the 'IndicateFinished' parameter set to 'true'. " +
"Synchronized components are those that disclose this parameter and have a value set to it.";
public static final String PARAM_JEDIS_SYNCHRONIZATION_KEY = "JedisSynchronizationKey";
private final static Logger log = LoggerFactory.getLogger(DocumentReleaseCheckpoint.class);
private static DocumentReleaseCheckpoint checkpoint;
- private Multiset releasedDocuments;
+ private Map> releasedDocuments;
private Set registeredComponents;
+ private long lastwarning = 1000;
private DocumentReleaseCheckpoint() {
- releasedDocuments = HashMultiset.create();
+ releasedDocuments = new HashMap<>();
registeredComponents = new HashSet<>();
}
@@ -82,7 +83,15 @@ public void release(String componentKey, Stream releasedDocumentIds)
if (!registeredComponents.contains(componentKey))
throw new IllegalArgumentException("No component is registered for key " + componentKey);
synchronized (releasedDocuments) {
- releasedDocumentIds.forEach(d -> releasedDocuments.add(d));
+ releasedDocumentIds.forEach(d -> releasedDocuments.compute(d, (k, v) -> {
+ if (v == null) {
+ Set ret = new HashSet<>();
+ ret.add(componentKey);
+ return ret;
+ }
+ v.add(componentKey);
+ return v;
+ }));
}
}
@@ -99,13 +108,20 @@ public Set getReleasedDocumentIds() {
// Get all documents released by all components
Set returnedIds;
synchronized (releasedDocuments) {
- returnedIds = this.releasedDocuments.entrySet().stream().filter(e -> e.getCount() == getNumberOfRegisteredComponents()).map(Multiset.Entry::getElement).collect(Collectors.toSet());
+ log.trace("The following {} components are registered for document release: {}", getNumberOfRegisteredComponents(), registeredComponents);
+ log.trace("Released document counts: {}", this.releasedDocuments);
+ returnedIds = this.releasedDocuments.keySet().stream().filter(k -> this.releasedDocuments.get(k).containsAll(this.registeredComponents)).collect(Collectors.toSet());
+ log.trace("Final Document IDs to release: {}", returnedIds);
// Remove the completely released documents from the pool of potentially not yet completely released documents.
- returnedIds.forEach(id -> this.releasedDocuments.remove(id, Integer.MAX_VALUE));
+ returnedIds.forEach(id -> this.releasedDocuments.remove(id));
}
log.debug("Returning {} documents released by all registered components. {} document IDs remain that have not yet been released by all registered components.", returnedIds.size(), this.releasedDocuments.size());
- if (this.releasedDocuments.size() > 1000)
- log.warn("The number of document IDs that have not been released by all registered components has grown to {}. If it does not increase again, there is likely an errorneous component which does not release its documents.", releasedDocuments.size());
+ if (this.releasedDocuments.size() > lastwarning) {
+ log.warn("The number of document IDs that have not been released by all registered components has grown to {}. If it does not decrease again, there is likely an errorneous component which does not release its documents. Currently registered components: {}", releasedDocuments.size(), registeredComponents);
+ lastwarning *= 2;
+ } else if (this.releasedDocuments.size() < 50) {
+ lastwarning = 1000;
+ }
return returnedIds;
}
diff --git a/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-ae.xml b/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-ae.xml
index 31e3605e8..6340c7355 100644
--- a/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-ae.xml
+++ b/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-ae.xml
@@ -6,7 +6,7 @@
JCoRe Database Checkpoint AEThis component can be used when using a JCoRe database reader that reads from a CoStoSys/JeDIS subset. Enters the configured component name in the 'last component' column. Can also mark documents as being completely processed.
- 2.5.1-SNAPSHOT
+ 2.6.0CheckpointName
diff --git a/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-consumer.xml b/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-consumer.xml
index 5ac25514c..be7df82ea 100644
--- a/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-consumer.xml
+++ b/jcore-db-checkpoint-ae/src/main/resources/de/julielab/jcore/ae/checkpoint/desc/jcore-db-checkpoint-consumer.xml
@@ -6,7 +6,7 @@
JCoRe Database Checkpoint WriterThis component can be used when using a JCoRe database reader that reads from a CoStoSys/JeDIS subset. Enters the configured component name in the 'last component' column. Can also mark documents as being completely processed.
- 2.5.1-SNAPSHOT
+ 2.6.0CheckpointName
diff --git a/jcore-db-reader/component.meta b/jcore-db-reader/component.meta
index a6793b944..1272e620f 100644
--- a/jcore-db-reader/component.meta
+++ b/jcore-db-reader/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-db-reader",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe Database Reader"
}
diff --git a/jcore-db-reader/pom.xml b/jcore-db-reader/pom.xml
index 2129cc7e0..577dca679 100644
--- a/jcore-db-reader/pom.xml
+++ b/jcore-db-reader/pom.xml
@@ -1,16 +1,17 @@
-
+jedis-parentde.julielab
- 2.5.1-SNAPSHOT
+ 2.6.0../jedis-parent4.0.0jcore-db-readerJCoRe Database ReaderAbstract database reader for database driven processing
-
+
de.julielab
@@ -44,7 +45,7 @@
de.julielabjcore-xml-mapper
- 2.5.1-SNAPSHOT
+ 2.6.0test
@@ -57,38 +58,114 @@
jcore-db-test-utilitiestest
+
+ org.junit.jupiter
+ junit-jupiter-engine
+ org.apache.uimauima-ducc-user${uima-ducc-version}
- org.apache.uimauimaj-as-activemq
- org.apache.activemqactivemq-camel
- org.apache.camelcamel-core
- org.apache.camelcamel-xstream
- org.apache.commonscommons-pool2
- org.eclipse.jettyjetty-server
- xpp3xpp3
- org.apache.httpcomponentshttpclient
- xmlpullxmlpull
- org.apache.httpcomponentshttpclient-cache
- org.apache.httpcomponentshttpcore
- org.slf4jjcl-over-slf4j
- org.apache.camelcamel-context
- org.apache.camelcamel-http4
- org.apache.camelcamel-http
- org.apache.camelcamel-http-common
- org.apache.camelcamel-jetty-common
- org.apache.camelcamel-mina
- org.apache.camelcamel-xmlbeans
- org.apache.minamina-core
- org.apache.camelcamel-servlet
- org.apache.camelcamel-test-spring
- org.apache.camelcamel-test
- org.apache.camelcamel-stream
+
+ org.apache.uima
+ uimaj-as-activemq
+
+
+ org.apache.activemq
+ activemq-camel
+
+
+ org.apache.camel
+ camel-core
+
+
+ org.apache.camel
+ camel-xstream
+
+
+ org.apache.commons
+ commons-pool2
+
+
+ org.eclipse.jetty
+ jetty-server
+
+
+ xpp3
+ xpp3
+
+
+ org.apache.httpcomponents
+ httpclient
+
+
+ xmlpull
+ xmlpull
+
+
+ org.apache.httpcomponents
+ httpclient-cache
+
+
+ org.apache.httpcomponents
+ httpcore
+
+
+ org.slf4j
+ jcl-over-slf4j
+
+
+ org.apache.camel
+ camel-context
+
+
+ org.apache.camel
+ camel-http4
+
+
+ org.apache.camel
+ camel-http
+
+
+ org.apache.camel
+ camel-http-common
+
+
+ org.apache.camel
+ camel-jetty-common
+
+
+ org.apache.camel
+ camel-mina
+
+
+ org.apache.camel
+ camel-xmlbeans
+
+
+ org.apache.mina
+ mina-core
+
+
+ org.apache.camel
+ camel-servlet
+
+
+ org.apache.camel
+ camel-test-spring
+
+
+ org.apache.camel
+ camel-test
+
+
+ org.apache.camel
+ camel-stream
+
-
+
BSD-2-Clause
diff --git a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java
index 195e30de7..2dcc1e0d9 100644
--- a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java
+++ b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplier.java
@@ -9,15 +9,20 @@
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.OperationalProperties;
import org.apache.uima.fit.descriptor.ResourceMetaData;
+import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.cas.StringArray;
import org.apache.uima.resource.ResourceInitializationException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.io.FileNotFoundException;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
+import java.util.stream.Collectors;
/**
* A multiplier retrieving feature structures of type of {@link RowBatch} in its {@link #process(JCas)} method.
@@ -34,8 +39,9 @@
"populate CASes with them. This component is a part of the Jena Document Information System, JeDIS.",
vendor = "JULIE Lab Jena, Germany", copyright = "JULIE Lab Jena, Germany")
@OperationalProperties(outputsNewCases = true)
+@TypeCapability(inputs = {"de.julielab.jcore.types.casmultiplier.RowBatch"})
public abstract class DBMultiplier extends JCasMultiplier_ImplBase {
-
+private final static Logger log = LoggerFactory.getLogger(DBMultiplier.class);
protected DataBaseConnector dbc;
protected DBCIterator documentDataIterator;
protected String[] tables;
@@ -56,7 +62,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept
initialized = false;
}
- private DataBaseConnector getDataBaseConnector(String costosysConfig) throws AnalysisEngineProcessException {
+ protected DataBaseConnector getDataBaseConnector(String costosysConfig) throws AnalysisEngineProcessException {
DataBaseConnector dbc;
try {
dbc = new DataBaseConnector(costosysConfig);
@@ -70,6 +76,10 @@ private DataBaseConnector getDataBaseConnector(String costosysConfig) throws Ana
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
RowBatch rowbatch = JCasUtil.selectSingle(aJCas, RowBatch.class);
+ if (rowbatch.getIdentifiers() == null)
+ throw new AnalysisEngineProcessException(new IllegalArgumentException("The identifiers of the passed row batch are null."));
+ if (rowbatch.getIdentifiers().size() == 0)
+ throw new AnalysisEngineProcessException(new IllegalArgumentException("The identifiers of the passed row batch are empty."));
tables = rowbatch.getTables().toStringArray();
schemaNames = rowbatch.getTableSchemas().toStringArray();
tableName = rowbatch.getTableName();
@@ -98,6 +108,9 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException {
StringArray primaryKey = (StringArray) identifiers.get(i);
documentIdsForQuery.add(primaryKey.toArray());
}
+ if (log.isTraceEnabled()) {
+ log.trace("Received document IDs: {}", documentIdsForQuery.stream().map(o -> Arrays.stream(o).map(Object::toString).collect(Collectors.joining(","))).collect(Collectors.joining(" ; ")));
+ }
documentDataIterator = dbc.retrieveColumnsByTableSchema(documentIdsForQuery,
tables,
schemaNames);
diff --git a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplierReader.java b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplierReader.java
index 83370feae..37922d46d 100644
--- a/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplierReader.java
+++ b/jcore-db-reader/src/main/java/de/julielab/jcore/reader/db/DBMultiplierReader.java
@@ -19,6 +19,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -49,6 +50,9 @@ public class DBMultiplierReader extends DBSubsetReader {
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
super.initialize(context);
+ // reset the state in case of reconfigure()
+ retriever = null;
+ dataTableDocumentIds = null;
// Check whether a subset table name or a data table name was given.
if (readDataTable) {
@@ -62,9 +66,12 @@ public void initialize(UimaContext context) throws ResourceInitializationExcepti
}
@Override
- public void getNext(JCas jCas) throws CollectionException {
+ public void getNext(JCas jCas) throws CollectionException, IOException {
+ log.trace("jCas instance: " + jCas);
log.trace("Requesting next batch of document IDs from the database.");
List
+ */
+ private static void createTestPipelineComponents() throws Exception {
+ TypeSystemDescription tsDesc = TypeSystemDescriptionFactory.createTypeSystemDescription("de.julielab.jcore.types.jcore-document-meta-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types", "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-casflow-types", "de.julielab.jcore.types.jcore-xmi-splitter-types");
+
+ testCr = CollectionReaderFactory.createReader(DBMultiplierReader.class,
+ tsDesc,
+ DBMultiplierReader.PARAM_TABLE, XML_SUBSET_TABLE,
+ DBMultiplierReader.PARAM_RESET_TABLE, false,
+ DBMultiplierReader.PARAM_COSTOSYS_CONFIG_NAME, costosysConfigSourceTable,
+ // We set a batch size of 1 to have more refined testing.
+ // Otherwise, the multiplier would receive all 3 test documents at once and
+ // would process them all in one batch
+ DBMultiplierReader.PARAM_BATCH_SIZE, 1
+ );
+
+ AnalysisEngineDescription testAe1 = AnalysisEngineFactory.createEngineDescription(TestAnnotator.class, tsDesc, "name", "TestAE 1");
+ AnalysisEngineDescription testAe2 = AnalysisEngineFactory.createEngineDescription(TestAnnotator.class, tsDesc, "name", "TestAE 2");
+ AnalysisEngineDescription testCc1 = AnalysisEngineFactory.createEngineDescription(TestAnnotator.class, tsDesc, "name", "TestCC 1");
+ AnalysisEngineDescription testCc2 = AnalysisEngineFactory.createEngineDescription(TestAnnotator.class, tsDesc, "name", "TestCC 2");
+ AnalysisEngineDescription xmiDbWriter = AnalysisEngineFactory.createEngineDescription(XMIDBWriter.class,
+ XMIDBWriter.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE,
+ XMIDBWriter.PARAM_ANNOS_TO_STORE, new String[]{"de.julielab.jcore.types.Annotation"},
+ XMIDBWriter.PARAM_STORE_ALL, false,
+ XMIDBWriter.PARAM_STORE_BASE_DOCUMENT, true,
+ XMIDBWriter.PARAM_STORE_RECURSIVELY, false,
+ XMIDBWriter.PARAM_ADD_SHA_HASH, "document_text",
+ XMIDBWriter.PARAM_COSTOSYS_CONFIG, costosysConfigTargetTable,
+ XMIDBWriter.PARAM_UPDATE_MODE, true,
+ XMIDBWriter.PARAM_DO_GZIP, false
+ );
+ AnalysisEngineDescription dbCheckpointAe = AnalysisEngineFactory.createEngineDescription(DBCheckpointAE.class,
+ DBCheckpointAE.PARAM_CHECKPOINT_NAME, "end",
+ DBCheckpointAE.PARAM_COSTOSYS_CONFIG, costosysConfigSourceTable,
+ DBCheckpointAE.PARAM_INDICATE_FINISHED, true
+ );
+
+ FlowControllerDescription flowControllerDescription = FlowControllerFactory.createFlowControllerDescription(AnnotationDefinedFlowController.class);
+ AnalysisEngineDescription aeAaeDesc = AnalysisEngineFactory.createEngineDescription(List.of(testAe1, testAe2), List.of("TestAE 1", "TestAE 2"), null, null, flowControllerDescription);
+ AnalysisEngineDescription ccAaeDesc = AnalysisEngineFactory.createEngineDescription(List.of(testCc1, testCc2, xmiDbWriter, dbCheckpointAe), List.of("TestCC 1", "TestCC 2", "XMI Writer", "Checkpoint Writer"), null, null, flowControllerDescription);
+
+ AnalysisEngineDescription multiplierDescription = AnalysisEngineFactory.createEngineDescription(XMLDBMultiplier.class,
+ tsDesc,
+ XMLDBMultiplier.PARAM_MAPPING_FILE, Path.of("src", "test", "resources", "medlineMappingFile.xml").toString(),
+ // The core of this whole test: The components to be visited in case of matching hash codes.
+ // We want to skip all components except the checkpoint writer that marks the document as
+ // "processed" in the XML subset table
+ XMLDBMultiplier.PARAM_TO_VISIT_KEYS, new String[]{"Checkpoint Writer"},
+ // The next three parameters are required for the hash comparison
+ XMLDBMultiplier.PARAM_ADD_SHA_HASH, "document_text",
+ XMLDBMultiplier.PARAM_TABLE_DOCUMENT, TARGET_XMI_TABLE,
+ XMLDBMultiplier.PARAM_TABLE_DOCUMENT_SCHEMA, "xmi_text");
+
+ testAggregate = AnalysisEngineFactory.createEngine(List.of(multiplierDescription, aeAaeDesc, ccAaeDesc), List.of("multiplier", "AeAAE", "CcAAE"), null, null);
+
+ cas = JCasFactory.createJCas(tsDesc);
+ }
+
+ @Test
+ public void testInitialProcessingProcessing() throws Exception {
+ assertThat(testCr.hasNext());
+ while (testCr.hasNext()) {
+ testCr.getNext(cas.getCas());
+ testAggregate.process(cas);
+ // Check that all components have been visited as expected from a normal, fixed flow
+ assertThat(namesOfRunComponents).containsExactly("TestAE 1", "TestAE 2", "TestCC 1", "TestCC 2");
+ namesOfRunComponents.clear();
+ cas.reset();
+ }
+ testAggregate.collectionProcessComplete();
+ assertThat(dbc.tableExists(TARGET_XMI_TABLE));
+ // After this first processing, the XMI document table exists. We can now create a mirror on it. This is important
+ // because we want to see that the mirror is only reset for rows that have actually changed in subsequent tests.
+ dbc.defineMirrorSubset(XMI_MIRROR_TABLE, TARGET_XMI_TABLE, true, "The XMI test mirror table.", "xmi_text");
+ // We mark the XMI mirror subset as completely processed. This simulates a state where the initial batch of
+ // documents has been completely processed, before the update comes in.
+ dbc.markAsProcessed(XMI_MIRROR_TABLE);
+ SubsetStatus status = dbc.status(XML_SUBSET_TABLE, EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED, DataBaseConnector.StatusElement.IN_PROCESS));
+ // Check that all rows have been processed in the XML source subset table.
+ assertThat(status.isProcessed).isEqualTo(3);
+ assertThat(status.inProcess).isEqualTo(0);
+
+ assertThat(idsOfProcessedDocuments).hasSize(3);
+ // Check that there are actual IDs, not null string or something like that
+ for (String id : idsOfProcessedDocuments)
+ assertThat(id).matches("[0-9]+");
+ }
+
+ /**
+ * Adds its name to {@link #namesOfRunComponents}.
+ */
+ public static class TestAnnotator extends JCasAnnotator_ImplBase {
+ @ConfigurationParameter(name = "name")
+ private String name;
+
+ @Override
+ public void initialize(UimaContext aContext) throws ResourceInitializationException {
+ super.initialize(aContext);
+ this.name = (String) aContext.getConfigParameterValue("name");
+ }
+
+ @Override
+ public void process(JCas jCas) {
+ assertThat(jCas.getDocumentText()).isNotBlank();
+ namesOfRunComponents.add(name);
+ idsOfProcessedDocuments.add(JCoReTools.getDocId(jCas));
+ new Annotation(jCas).addToIndexes();
+ }
+ }
+
+ @Nested
+ class AfterInitialProcessing {
+ @Test
+ public void updateXML() throws Exception {
+ dbc.updateFromXML(Path.of("src", "test", "resources", "pubmed21n1016_excerpt_partially_changed.xml.gz").toString(), SOURCE_XML_TABLE, true);
+ // The update contains all three originally imported XML documents. Only that the second has not been changed.
+ // But the XML mirror should have been reset completely.
+ SubsetStatus status = dbc.status(XML_SUBSET_TABLE, EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED, DataBaseConnector.StatusElement.IN_PROCESS));
+ // Check that the XML mirror subset has been reset due to the update
+ assertThat(status.isProcessed).isEqualTo(0);
+ assertThat(status.inProcess).isEqualTo(0);
+ }
+
+ @Nested
+ class AfterUpdatingXML {
+ @Test
+ public void testOnlyNewDocumentsProcessed() throws Exception {
+
+ testCr.reconfigure();
+ testAggregate.reconfigure();
+ assertThat(testCr.hasNext()).withFailMessage("The XML DB Collection reader does not report any non-processed rows.").isTrue();
+ // Run the whole pipeline again. Only this time we only expect all the components run in a single case.
+ List allNamesOfRunComponents = new ArrayList<>();
+ while (testCr.hasNext()) {
+ cas.reset();
+ testCr.getNext(cas.getCas());
+ testAggregate.process(cas);
+ // Check that all components have been visited as expected from a normal, fixed flow
+ allNamesOfRunComponents.addAll(namesOfRunComponents);
+ namesOfRunComponents.clear();
+ cas.reset();
+ }
+ testAggregate.collectionProcessComplete();
+ // There should be only two components documents now that have visited all components
+ assertThat(allNamesOfRunComponents).containsExactly("TestAE 1", "TestAE 2", "TestCC 1", "TestCC 2", "TestAE 1", "TestAE 2", "TestCC 1", "TestCC 2");
+ testAggregate.collectionProcessComplete();
+ // Check again that all the XML documents have been processed.
+ SubsetStatus status = dbc.status(XML_SUBSET_TABLE, EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED));
+ // Check that all rows have been processed in the XML source subset table.
+ assertThat(status.isProcessed).isEqualTo(3);
+
+ // Now the more interesting part: In the XMI mirror there should now be two unprocessed tables, namely
+ // the two documents with a changed document text. The unchanged document should still be marked as
+ // processed.
+ SubsetStatus xmiMirrorStatus = dbc.status(XMI_MIRROR_TABLE, EnumSet.of(DataBaseConnector.StatusElement.IS_PROCESSED));
+ // Check that all rows have been processed in the XML source subset table.
+ assertThat(xmiMirrorStatus.isProcessed).isEqualTo(1);
+ }
+ }
+ }
+}
diff --git a/jcore-jedis-integration-tests/src/test/resources/logback-test.xml b/jcore-jedis-integration-tests/src/test/resources/logback-test.xml
new file mode 100644
index 000000000..e2ec34c57
--- /dev/null
+++ b/jcore-jedis-integration-tests/src/test/resources/logback-test.xml
@@ -0,0 +1,19 @@
+
+
+
+
+
+ %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/jcore-jedis-integration-tests/src/test/resources/medlineMappingFile.xml b/jcore-jedis-integration-tests/src/test/resources/medlineMappingFile.xml
new file mode 100644
index 000000000..cd9892953
--- /dev/null
+++ b/jcore-jedis-integration-tests/src/test/resources/medlineMappingFile.xml
@@ -0,0 +1,457 @@
+
+
+
+ /MedlineCitation/Article/ArticleTitle
+
+
+ /MedlineCitation/Article/Abstract
+ de.julielab.jcore.reader.xmlmapper.mapper.StructuredAbstractParser
+
+
+
+ /MedlineCitation/OtherAbstract
+
+
+ /MedlineCitation/Article/VernacularTitle
+
+
+
+
+ de.julielab.jcore.types.Title
+
+
+ 0
+
+
+
+ titleType
+ java.lang.String
+
+ document
+
+
+
+
+
+
+ de.julielab.jcore.types.pubmed.AbstractText
+
+
+
+ 2
+
+
+
+ abstractType
+ java.lang.String
+
+ other
+
+
+
+
+
+ de.julielab.jcore.types.Title
+
+
+ 3
+
+
+
+ titleType
+ java.lang.String
+
+ document_vernacular
+
+
+
+
+
+
+ de.julielab.jcore.types.pubmed.Header
+
+
+
+ /MedlineCitation/ArticleIdList/ArticleId[@IdType="doi"]
+
+ doi
+ java.lang.String
+
+
+ /MedlineCitation/PMID
+ docId
+ java.lang.String
+
+
+ /MedlineCitation/@Status
+ citationStatus
+ java.lang.String
+
+
+
+ /MedlineCitation/Article/Language
+
+ language
+ java.lang.String
+
+ de
+
+
+ en
+
+
+ es
+
+
+ fr
+
+
+ it
+
+
+ pt
+
+
+ eng
+
+
+ ger
+
+
+ fre
+
+
+ ita
+
+
+ other
+
+
+
+ source
+ java.lang.String
+
+ de.julielab.jcore.reader.xmlmapper.typeParser.SourceParser
+
+
+
+ authors
+
+ org.apache.uima.jcas.cas.FSArray
+
+ de.julielab.jcore.reader.xmlmapper.typeParser.FSArrayParser
+
+ true
+
+ authorInfo
+
+ de.julielab.jcore.types.AuthorInfo
+
+ true
+
+
+ /MedlineCitation/Article/AuthorList/Author[LastName]
+
+
+ foreName
+ java.lang.String
+ ForeName
+
+
+ foreName
+ java.lang.String
+ FirstName
+
+
+ lastName
+ java.lang.String
+ LastName
+
+
+ initials
+ java.lang.String
+ Initials
+
+
+ affiliation
+ java.lang.String
+
+ AffiliationInfo/Affiliation
+
+
+
+
+
+
+ org.apache.uima.jcas.cas.FSArray
+
+ pubTypeList
+ true
+
+
+ de.julielab.jcore.types.Journal
+
+
+ /MedlineCitation/Article/PublicationTypeList/PublicationType
+
+ Journal
+ true
+
+ java.lang.String
+ name
+ .
+
+
+ java.lang.String
+ ISSN
+
+ /MedlineCitation/Article/Journal/ISSN
+
+
+
+ java.lang.String
+ Volume
+
+ /MedlineCitation/Article/Journal/JournalIssue/Volume
+
+
+
+ java.lang.String
+ Issue
+
+ /MedlineCitation/Article/Journal/JournalIssue/Issue
+
+
+
+ java.lang.String
+ Title
+
+ /MedlineCitation/Article/Journal/Title
+
+
+
+ java.lang.String
+ ShortTitle
+
+ /MedlineCitation/MedlineJournalInfo/MedlineTA
+
+
+
+ java.lang.String
+ nlmId
+
+ /MedlineCitation/MedlineJournalInfo/NlmUniqueID
+
+
+
+ java.lang.String
+ Pages
+
+ /MedlineCitation/Article/Pagination/MedlinePgn
+
+
+
+ true
+
+ de.julielab.jcore.types.Date
+
+ PubDate
+
+ de.julielab.jcore.reader.xmlmapper.typeParser.PubDateParser
+
+
+ /MedlineCitation/Article/Journal/JournalIssue/PubDate
+
+
+ int
+ month
+
+
+ int
+ year
+
+
+ int
+ day
+
+
+
+
+
+ org.apache.uima.jcas.cas.FSArray
+ otherIDs
+ true
+
+ de.julielab.jcore.types.pubmed.OtherID
+
+ /MedlineCitation/OtherID
+ true
+
+ id
+ java.lang.String
+ .
+
+
+ source
+ java.lang.String
+ @Source
+
+
+
+
+
+
+ de.julielab.jcore.types.pubmed.ManualDescriptor
+
+
+ /MedlineCitation/GeneSymbolList
+ GeneSymbolList
+ true
+
+ org.apache.uima.jcas.cas.StringArray
+
+
+
+ KeywordList
+ true
+
+ org.apache.uima.jcas.cas.FSArray
+
+
+ Keyword
+ true
+
+ /MedlineCitation/KeywordList/Keyword
+
+
+ de.julielab.jcore.types.Keyword
+
+
+ Name
+ .
+ java.lang.String
+
+
+
+
+ ChemicalList
+ true
+
+ org.apache.uima.jcas.cas.FSArray
+
+
+ Chemical
+ true
+
+ /MedlineCitation/ChemicalList/Chemical
+
+
+ de.julielab.jcore.types.Chemical
+
+
+ RegistryNumber
+ RegistryNumber
+ java.lang.String
+
+
+ NameOfSubstance
+ NameOfSubstance
+ java.lang.String
+
+
+
+
+ DBInfoList
+ true
+
+ org.apache.uima.jcas.cas.FSArray
+
+
+ DBInfo
+ true
+
+ /MedlineCitation/DataBankList/DataBank
+
+
+ de.julielab.jcore.types.DBInfo
+
+
+ Name
+ DataBankName
+ java.lang.String
+
+
+ AcList
+
+ AccessionNumberList
+
+
+ true
+
+ org.apache.uima.jcas.cas.StringArray
+
+
+
+
+
+ MeSHList
+ true
+
+ org.apache.uima.jcas.cas.FSArray
+
+
+ meshHeading
+ true
+
+ /MedlineCitation/MeshHeadingList/MeshHeading
+
+
+ de.julielab.jcore.types.MeshHeading
+
+
+ DescriptorName
+ java.lang.String
+ DescriptorName
+
+
+
+ DescriptorNameMajorTopic
+
+ DescriptorName/@MajorTopicYN
+ boolean
+
+ Y
+ true
+
+
+ N
+ false
+
+
+
+ QualifierName
+ java.lang.String
+ QualifierName
+
+
+
+ QualifierNameMajorTopic
+
+ QualifierName/@MajorTopicYN
+ boolean
+
+ Y
+ true
+
+
+ N
+ false
+
+
+
+
+
+
\ No newline at end of file
diff --git a/jcore-jedis-integration-tests/src/test/resources/pubmed21n1016_excerpt_original.xml.gz b/jcore-jedis-integration-tests/src/test/resources/pubmed21n1016_excerpt_original.xml.gz
new file mode 100644
index 000000000..365b8d3e0
Binary files /dev/null and b/jcore-jedis-integration-tests/src/test/resources/pubmed21n1016_excerpt_original.xml.gz differ
diff --git a/jcore-jedis-integration-tests/src/test/resources/pubmed21n1016_excerpt_partially_changed.xml.gz b/jcore-jedis-integration-tests/src/test/resources/pubmed21n1016_excerpt_partially_changed.xml.gz
new file mode 100644
index 000000000..ee6542535
Binary files /dev/null and b/jcore-jedis-integration-tests/src/test/resources/pubmed21n1016_excerpt_partially_changed.xml.gz differ
diff --git a/jcore-jedis-integration-tests/src/test/resources/pubmedMappingFile.xml b/jcore-jedis-integration-tests/src/test/resources/pubmedMappingFile.xml
new file mode 100644
index 000000000..9a76854ae
--- /dev/null
+++ b/jcore-jedis-integration-tests/src/test/resources/pubmedMappingFile.xml
@@ -0,0 +1,436 @@
+
+
+
+ /PubmedArticle/MedlineCitation/Article/ArticleTitle
+
+
+ /PubmedArticle/MedlineCitation/Article/Abstract
+ de.julielab.jcore.reader.xmlmapper.mapper.StructuredAbstractParser
+
+
+
+ /PubmedArticle/MedlineCitation/OtherAbstract
+
+
+ /PubmedArticle/MedlineCitation/Article/VernacularTitle
+
+
+
+
+ de.julielab.jcore.types.Title
+
+
+ 0
+
+
+
+ titleType
+ java.lang.String
+
+ document
+
+
+
+
+
+
+ de.julielab.jcore.types.pubmed.AbstractText
+
+
+
+ 2
+
+
+
+ abstractType
+ java.lang.String
+
+ other
+
+
+
+
+
+ de.julielab.jcore.types.Title
+
+
+ 3
+
+
+
+ titleType
+ java.lang.String
+
+ document_vernacular
+
+
+
+
+
+
+ de.julielab.jcore.types.pubmed.Header
+
+
+
+ /PubmedArticle/MedlineCitation/ArticleIdList/ArticleId[@IdType="doi"]
+
+ doi
+ java.lang.String
+
+
+ /PubmedArticle/MedlineCitation/PMID
+ docId
+ java.lang.String
+
+
+ /PubmedArticle/MedlineCitation/@Status
+ citationStatus
+ java.lang.String
+
+
+
+ /PubmedArticle/MedlineCitation/Article/Language
+
+ language
+ java.lang.String
+
+ de
+
+
+ en
+
+
+ es
+
+
+ fr
+
+
+ it
+
+
+ pt
+
+
+ eng
+
+
+ ger
+
+
+ fre
+
+
+ ita
+
+
+ other
+
+
+
+ source
+ java.lang.String
+
+ de.julielab.jcore.reader.xmlmapper.typeParser.SourceParser
+
+
+
+ authors
+
+ org.apache.uima.jcas.cas.FSArray
+
+ de.julielab.jcore.reader.xmlmapper.typeParser.FSArrayParser
+
+ true
+
+ authorInfo
+
+ de.julielab.jcore.types.AuthorInfo
+
+ true
+
+
+ /PubmedArticle/MedlineCitation/Article/AuthorList/Author[LastName]
+
+
+ foreName
+ java.lang.String
+ ForeName
+
+
+ foreName
+ java.lang.String
+ FirstName
+
+
+ lastName
+ java.lang.String
+ LastName
+
+
+ initials
+ java.lang.String
+ Initials
+
+
+ affiliation
+ java.lang.String
+
+ AffiliationInfo/Affiliation
+
+
+
+
+
+
+ org.apache.uima.jcas.cas.FSArray
+
+ pubTypeList
+ true
+
+
+ de.julielab.jcore.types.Journal
+
+
+ /PubmedArticle/MedlineCitation/Article/PublicationTypeList/PublicationType
+
+ Journal
+ true
+
+ java.lang.String
+ name
+ .
+
+
+ java.lang.String
+ ISSN
+
+ /PubmedArticle/MedlineCitation/Article/Journal/ISSN
+
+
+
+ java.lang.String
+ Volume
+
+ /PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/Volume
+
+
+
+ java.lang.String
+ Issue
+
+ /PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/Issue
+
+
+
+ java.lang.String
+ Title
+
+ /PubmedArticle/MedlineCitation/Article/Journal/Title
+
+
+
+ java.lang.String
+ ShortTitle
+
+ /PubmedArticle/MedlineCitation/MedlineJournalInfo/MedlineTA
+
+
+
+ java.lang.String
+ nlmId
+
+ /PubmedArticle/MedlineCitation/MedlineJournalInfo/NlmUniqueID
+
+
+
+ java.lang.String
+ Pages
+
+ /PubmedArticle/MedlineCitation/Article/Pagination/MedlinePgn
+
+
+
+ true
+
+ de.julielab.jcore.types.Date
+
+ PubDate
+
+ de.julielab.jcore.reader.xmlmapper.typeParser.PubDateParser
+
+
+ /PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/PubDate
+
+
+ int
+ month
+
+
+ int
+ year
+
+
+ int
+ day
+
+
+
+
+
+
+
+ de.julielab.jcore.types.pubmed.ManualDescriptor
+
+
+ /PubmedArticle/MedlineCitation/GeneSymbolList
+ GeneSymbolList
+ true
+
+ org.apache.uima.jcas.cas.StringArray
+
+
+
+ KeywordList
+ true
+
+ org.apache.uima.jcas.cas.FSArray
+
+
+ Keyword
+ true
+
+ /PubmedArticle/MedlineCitation/KeywordList/Keyword
+
+
+ de.julielab.jcore.types.Keyword
+
+
+ Name
+ .
+ java.lang.String
+
+
+
+
+ ChemicalList
+ true
+
+ org.apache.uima.jcas.cas.FSArray
+
+
+ Chemical
+ true
+
+ /PubmedArticle/MedlineCitation/ChemicalList/Chemical
+
+
+ de.julielab.jcore.types.Chemical
+
+
+ RegistryNumber
+ RegistryNumber
+ java.lang.String
+
+
+ NameOfSubstance
+ NameOfSubstance
+ java.lang.String
+
+
+
+
+ DBInfoList
+ true
+
+ org.apache.uima.jcas.cas.FSArray
+
+
+ DBInfo
+ true
+
+ /PubmedArticle/MedlineCitation/DataBankList/DataBank
+
+
+ de.julielab.jcore.types.DBInfo
+
+
+ Name
+ DataBankName
+ java.lang.String
+
+
+ AcList
+
+ AccessionNumberList
+
+
+ true
+
+ org.apache.uima.jcas.cas.StringArray
+
+
+
+
+
+ MeSHList
+ true
+
+ org.apache.uima.jcas.cas.FSArray
+
+
+ meshHeading
+ true
+
+ /PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading
+
+
+ de.julielab.jcore.types.MeshHeading
+
+
+ DescriptorName
+ java.lang.String
+ DescriptorName
+
+
+
+ DescriptorNameMajorTopic
+
+ DescriptorName/@MajorTopicYN
+ boolean
+
+ Y
+ true
+
+
+ N
+ false
+
+
+
+ QualifierName
+ java.lang.String
+ QualifierName
+
+
+
+ QualifierNameMajorTopic
+
+ QualifierName/@MajorTopicYN
+ boolean
+
+ Y
+ true
+
+
+ N
+ false
+
+
+
+
+
+
\ No newline at end of file
diff --git a/jcore-jemas-ae/src/main/resources/de/julielab/jcore/ae/jemas/desc/jcore-jemas-ae.xml b/jcore-jemas-ae/src/main/resources/de/julielab/jcore/ae/jemas/desc/jcore-jemas-ae.xml
index 436c249b2..3d2f6c9fd 100644
--- a/jcore-jemas-ae/src/main/resources/de/julielab/jcore/ae/jemas/desc/jcore-jemas-ae.xml
+++ b/jcore-jemas-ae/src/main/resources/de/julielab/jcore/ae/jemas/desc/jcore-jemas-ae.xml
@@ -5,7 +5,7 @@
JCoRe JEmASA UIMA-based implementation of the core functionality of JEmAS, the Jena Emotion Analysis System.
- 2.5.1-SNAPSHOT
+ 2.6.0
diff --git a/jcore-jnet-ae/component.meta b/jcore-jnet-ae/component.meta
index dbdfe4186..b39b004c5 100644
--- a/jcore-jnet-ae/component.meta
+++ b/jcore-jnet-ae/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-jnet-ae",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe JNET AE"
}
diff --git a/jcore-jnet-ae/pom.xml b/jcore-jnet-ae/pom.xml
index cfd5c716c..d67df3ba0 100644
--- a/jcore-jnet-ae/pom.xml
+++ b/jcore-jnet-ae/pom.xml
@@ -11,14 +11,15 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0
+ org.apache.maven.pluginsmaven-assembly-plugin
- 2.4
+ 3.3.0jar-with-dependencies
@@ -106,6 +107,12 @@
de.julielabuea-stemmer0.1
+
+
+ junit
+ junit
+
+ de.julielab
@@ -117,8 +124,8 @@
julielab-java-utilities
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-engine
diff --git a/jcore-jnet-ae/src/main/resources/de/julielab/jcore/ae/jnet/desc/jcore-jnet-ae.xml b/jcore-jnet-ae/src/main/resources/de/julielab/jcore/ae/jnet/desc/jcore-jnet-ae.xml
index db23c98b2..f4b666e6d 100644
--- a/jcore-jnet-ae/src/main/resources/de/julielab/jcore/ae/jnet/desc/jcore-jnet-ae.xml
+++ b/jcore-jnet-ae/src/main/resources/de/julielab/jcore/ae/jnet/desc/jcore-jnet-ae.xml
@@ -6,7 +6,7 @@
JCoRe JNET AE
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/cli/JNETApplicationTest.java b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/cli/JNETApplicationTest.java
index cdfe60693..153d2714c 100644
--- a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/cli/JNETApplicationTest.java
+++ b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/cli/JNETApplicationTest.java
@@ -6,12 +6,12 @@
package de.julielab.jcore.ae.jnet.cli;
-import org.junit.After;
-import org.junit.Test;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Test;
import java.io.File;
-import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertTrue;
public class JNETApplicationTest {
private static final String PREFIX = "src/test/resources/de/julielab/jcore/ae/jnet/cli/";
@@ -32,7 +32,7 @@ public class JNETApplicationTest {
- @After
+ @AfterEach
public void deleteModel() {
File modelFile = new File(UNITTEST_MODEL_GZ);
if (modelFile.exists())
diff --git a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/tagger/NETaggerTest.java b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/tagger/NETaggerTest.java
index f21a11d09..e05e6a6c1 100644
--- a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/tagger/NETaggerTest.java
+++ b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/tagger/NETaggerTest.java
@@ -2,7 +2,7 @@
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import java.io.InputStream;
import java.util.ArrayList;
diff --git a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/ConsistencyPreservationTest.java b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/ConsistencyPreservationTest.java
index 3031116d3..f551411fd 100644
--- a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/ConsistencyPreservationTest.java
+++ b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/ConsistencyPreservationTest.java
@@ -18,7 +18,6 @@
package de.julielab.jcore.ae.jnet.uima;
import de.julielab.jcore.types.*;
-import junit.framework.TestCase;
import org.apache.uima.UIMAFramework;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
@@ -28,6 +27,7 @@
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.util.XMLInputSource;
+import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -35,6 +35,8 @@
import java.util.Iterator;
import java.util.TreeSet;
+import static org.junit.jupiter.api.Assertions.*;
+
/**
* Please note that in the original test there were "GoodEntityMentions" and
* "BadEntityMentions". Both types were only used for this test which caused
@@ -45,7 +47,7 @@
* @author faessler
*
*/
-public class ConsistencyPreservationTest extends TestCase {
+public class ConsistencyPreservationTest {
private static final Logger LOGGER = LoggerFactory.getLogger(ConsistencyPreservationTest.class);
@@ -133,12 +135,14 @@ private void initJCas4DoAbbreviationBased(final JCas jcas) throws Exception {
e5.addToIndexes();
}
+ @Test
public void testConsistencyPreservation() throws Exception {
final String modeString = ConsistencyPreservation.MODE_STRING + "," + ConsistencyPreservation.MODE_ACRO2FULL
+ "," + ConsistencyPreservation.MODE_FULL2ACRO;
new ConsistencyPreservation(modeString);
}
+ @Test
public void testAcroMatch() throws Exception {
final String modeString = ConsistencyPreservation.MODE_FULL2ACRO + "," + ConsistencyPreservation.MODE_ACRO2FULL;
@@ -186,6 +190,7 @@ public void testAcroMatch() throws Exception {
}
+ @Test
public void testStringMatch() throws Exception {
LOGGER.info("testStringMatch() - starting...");
final CAS cas = CasCreationUtils.createCas(
@@ -229,6 +234,7 @@ public void testStringMatch() throws Exception {
assertTrue(allOK);
}
+ @Test
public void testStringMatch2() throws Exception {
// This test checks whether the consistence preservation algorithm
// correctly detects already existing annotations even when there are
@@ -269,6 +275,7 @@ public void testStringMatch2() throws Exception {
assertEquals(3, count);
}
+ @Test
public void testStringMatch3() throws Exception {
// This test checks whether the consistence preservation algorithm
// correctly detects already existing annotations even when there are
@@ -309,6 +316,7 @@ public void testStringMatch3() throws Exception {
assertEquals(5, count);
}
+ @Test
public void testStringMatchTokenBoundaries() throws Exception {
// This test checks whether the consistency preservation algorithm
// sticks to token boundaries if the respective mode is on
@@ -350,6 +358,7 @@ public void testStringMatchTokenBoundaries() throws Exception {
assertEquals(1, count);
}
+ @Test
public void testStringMatchTokenBoundaries2() throws Exception {
// Test for multi token entities
String text = "This is BCA alpha. But we haven't annotated BCA alpha in all cases. Also not some other BCA.";
@@ -430,7 +439,8 @@ else if (g.getSpecificType().equals("type2"))
}
assertEquals(2, oCount);
}
-
+
+ @Test
public void testStringMatchTokenBoundaries3() throws Exception {
// Test for multi token entities with correct prefix but wrong ending
String text = "Group 1. And Group B.";
diff --git a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.java b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.java
index 44dd4e90d..e2143f3e9 100644
--- a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.java
+++ b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.java
@@ -20,7 +20,6 @@
import de.julielab.jcore.types.*;
import de.julielab.jcore.utility.index.JCoReCoverIndex;
import de.julielab.jnet.tagger.Unit;
-import junit.framework.TestCase;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
@@ -37,6 +36,7 @@
import org.apache.uima.util.InvalidXMLException;
import org.apache.uima.util.XMLInputSource;
import org.apache.uima.util.XMLParser;
+import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
@@ -52,7 +52,9 @@
import java.util.regex.Pattern;
import java.util.stream.Collectors;
-public class EntityAnnotatorTest extends TestCase {
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class EntityAnnotatorTest {
/**
* Logger for this class
@@ -66,12 +68,8 @@ public class EntityAnnotatorTest extends TestCase {
private static final String ENTITY_ANNOTATOR_DESC = PREFIX+"EntityAnnotatorTest.xml";
private static final String NEGATIVE_LIST = PREFIX+"negativeList";
- @Override
- protected void setUp() throws Exception {
- super.setUp();
- // PropertyConfigurator.configure("src/test/java/log4j.properties");
- }
+ @Test
public void testIgnoreLabel() throws ResourceInitializationException {
// load AE
@@ -124,6 +122,7 @@ public void testIgnoreLabel() throws ResourceInitializationException {
/**
* test whether Annotator can be initialized properly from given descriptor
*/
+ @Test
public void testInitialize() {
LOGGER.debug("testInitialize()");
AnalysisEngine entityAnnotator = null;
@@ -150,6 +149,7 @@ public void testInitialize() {
* test whether process method runs successfully. Output must be checked by
* a human manually
*/
+ @Test
public void testProcess() throws InvalidXMLException, ResourceInitializationException, IOException, SAXException,
CASException, AnalysisEngineProcessException {
LOGGER.debug("testProcess()");
@@ -176,6 +176,7 @@ public void testProcess() throws InvalidXMLException, ResourceInitializationExce
* unit sentence and removing duplicates. Prediction is "simulated" (labels
* are set).
*/
+ @Test
public void testSimulatedProcess() throws IllegalAccessException, NoSuchFieldException,
ResourceInitializationException, InvalidXMLException, IOException, CASException, SAXException {
LOGGER.debug("testCreateUnitSentence() - starting");
@@ -280,6 +281,7 @@ else if (unit.getRep().equals("ceta"))
* @throws IllegalAccessException
* @throws IllegalArgumentException
*/
+ @Test
public void testWriteToCAS() throws SecurityException, NoSuchFieldException, ResourceInitializationException,
InvalidXMLException, IOException, CASException, IllegalArgumentException, IllegalAccessException {
LOGGER.debug("testWriteToCAS()");
diff --git a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/MiniTestapp.java b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/MiniTestapp.java
index 1b1ed323f..006328391 100644
--- a/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/MiniTestapp.java
+++ b/jcore-jnet-ae/src/test/java/de/julielab/jcore/ae/jnet/uima/MiniTestapp.java
@@ -35,8 +35,8 @@
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.util.XMLInputSource;
import org.apache.uima.util.XMLSerializer;
-import org.junit.After;
-import org.junit.Test;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
@@ -47,7 +47,7 @@
import java.io.IOException;
import java.nio.charset.Charset;
-import static org.junit.Assert.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
public class MiniTestapp {
@@ -61,7 +61,7 @@ public class MiniTestapp {
private static final String ANNOTATOR_DESC = PREFIX + "EntityAnnotatorTest.xml";
- @After
+ @AfterEach
public void clean() {
if (new File(TEST_XMI_OUT).isFile()) {
new File(TEST_XMI_OUT).delete();
diff --git a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.xml b/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.xml
index 12859863d..b07631439 100644
--- a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.xml
+++ b/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/EntityAnnotatorTest.xml
@@ -6,7 +6,7 @@
EntityTaggerAnnotator
- 2.5.1-SNAPSHOT
+ 2.6.0julielab
diff --git a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/miniapp_out_template.xmi b/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/miniapp_out_template.xmi
deleted file mode 100644
index 029dc8db3..000000000
--- a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/miniapp_out_template.xmi
+++ /dev/null
@@ -1 +0,0 @@
-
\ No newline at end of file
diff --git a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/tsDescriptor.xml b/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/tsDescriptor.xml
index b26a4688d..6bfe94e8e 100644
--- a/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/tsDescriptor.xml
+++ b/jcore-jnet-ae/src/test/resources/de/julielab/jcore/ae/jnet/uima/tsDescriptor.xml
@@ -2,7 +2,7 @@
aceComplete
-2.5.1-SNAPSHOT
+2.6.0
diff --git a/jcore-jpos-ae/component.meta b/jcore-jpos-ae/component.meta
index 86f05e5d5..6cacfad71 100644
--- a/jcore-jpos-ae/component.meta
+++ b/jcore-jpos-ae/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-jpos-ae",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe JPOS AE"
}
diff --git a/jcore-jpos-ae/pom.xml b/jcore-jpos-ae/pom.xml
index 480afdf16..04e41a7e3 100644
--- a/jcore-jpos-ae/pom.xml
+++ b/jcore-jpos-ae/pom.xml
@@ -11,7 +11,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0
@@ -114,8 +114,8 @@
2.1.2
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-engine
diff --git a/jcore-jpos-ae/src/main/resources/de/julielab/jcore/ae/jpos/desc/jcore-jpos.xml b/jcore-jpos-ae/src/main/resources/de/julielab/jcore/ae/jpos/desc/jcore-jpos.xml
index be5593812..e8777ae38 100644
--- a/jcore-jpos-ae/src/main/resources/de/julielab/jcore/ae/jpos/desc/jcore-jpos.xml
+++ b/jcore-jpos-ae/src/main/resources/de/julielab/jcore/ae/jpos/desc/jcore-jpos.xml
@@ -6,7 +6,7 @@
JCoRe JPOS AE
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-jpos-ae/src/test/java/de/julielab/jcore/ae/jpos/postagger/POSAnnotatorTest.java b/jcore-jpos-ae/src/test/java/de/julielab/jcore/ae/jpos/postagger/POSAnnotatorTest.java
index c7a03c06d..50c639d51 100644
--- a/jcore-jpos-ae/src/test/java/de/julielab/jcore/ae/jpos/postagger/POSAnnotatorTest.java
+++ b/jcore-jpos-ae/src/test/java/de/julielab/jcore/ae/jpos/postagger/POSAnnotatorTest.java
@@ -17,9 +17,9 @@
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceSpecifier;
import org.apache.uima.util.XMLInputSource;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
-import static org.junit.Assert.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
public class POSAnnotatorTest {
diff --git a/jcore-jpos-ae/src/test/resources/POSTagAnnotatorTest.xml b/jcore-jpos-ae/src/test/resources/POSTagAnnotatorTest.xml
index 384265369..3c7f1b099 100644
--- a/jcore-jpos-ae/src/test/resources/POSTagAnnotatorTest.xml
+++ b/jcore-jpos-ae/src/test/resources/POSTagAnnotatorTest.xml
@@ -6,7 +6,7 @@
JPOSAnnotator
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab
diff --git a/jcore-jsbd-ae/component.meta b/jcore-jsbd-ae/component.meta
index 025d9b87f..5ab9a4df2 100644
--- a/jcore-jsbd-ae/component.meta
+++ b/jcore-jsbd-ae/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-jsbd-ae",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe Sentence Annotator"
}
diff --git a/jcore-jsbd-ae/pom.xml b/jcore-jsbd-ae/pom.xml
index d5622f97b..b0b6524c2 100644
--- a/jcore-jsbd-ae/pom.xml
+++ b/jcore-jsbd-ae/pom.xml
@@ -11,14 +11,15 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0
+ org.apache.maven.pluginsmaven-assembly-plugin
- 2.4
+ 3.3.0jar-with-dependencies
@@ -76,7 +77,6 @@
org.assertjassertj-core
- 3.9.1de.julielab
@@ -102,6 +102,16 @@
cc.malletmallet2.0.8
+
+
+ junit
+ junit
+
+
+
+
+ org.apache.commons
+ commons-lang3de.julielab
@@ -112,6 +122,10 @@
de.julielabjcore-descriptor-creator
+
+ org.junit.jupiter
+ junit-jupiter
+ JULIE Lab Jena, Germany
diff --git a/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java b/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java
index a27107477..583db41a1 100644
--- a/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java
+++ b/jcore-jsbd-ae/src/main/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.java
@@ -29,6 +29,7 @@
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
@@ -146,77 +147,89 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept
* @throws AnalysisEngineProcessException
*/
public void process(JCas aJCas) throws AnalysisEngineProcessException {
- if (StringUtils.isBlank(aJCas.getDocumentText())) {
- final String docId = JCoReTools.getDocId(aJCas);
- LOGGER.warn("The document text of document {} is empty.", docId);
- return;
- }
- JCoReCondensedDocumentText documentText;
try {
- // If there are no cut-away types, the document text will remain unchanged.
- documentText = new JCoReCondensedDocumentText(aJCas, cutAwayTypes);
- } catch (ClassNotFoundException e1) {
- throw new AnalysisEngineProcessException(e1);
- }
-
- if (sentenceDelimiterTypes != null) {
+ if (StringUtils.isBlank(aJCas.getDocumentText())) {
+ final String docId = JCoReTools.getDocId(aJCas);
+ LOGGER.warn("The document text of document {} is empty.", docId);
+ final AnnotationIndex annotationIndex = aJCas.getAnnotationIndex();
+ LOGGER.warn("All annotations in CAS:");
+ for (Annotation a : annotationIndex) {
+ System.out.println(a);
+ }
+ return;
+ }
+ JCoReCondensedDocumentText documentText;
try {
- // the index merger gives us access to all delimiter type
- // indexes in one
- JCoReAnnotationIndexMerger indexMerger = new JCoReAnnotationIndexMerger(sentenceDelimiterTypes, false,
- null, aJCas);
+ // If there are no cut-away types, the document text will remain unchanged.
+ documentText = new JCoReCondensedDocumentText(aJCas, cutAwayTypes, Set.of(','), true);
+ } catch (ClassNotFoundException e1) {
+ LOGGER.error("Could not create the text without annotations to be cut away in document {}", JCoReTools.getDocId(aJCas), e1);
+ throw new AnalysisEngineProcessException(e1);
+ }
- // the idea: collect all start and end offsets of sentence
- // delimiter annotations (sections, titles, captions, ...) in a
- // list and sort ascending; then, perform sentence segmentation
- // between every two adjacent offsets. This way, no sentence can
- // cross any delimiter annotation border
- List borders = new ArrayList<>();
- borders.add(0);
- borders.add(aJCas.getDocumentText().length());
- while (indexMerger.incrementAnnotation()) {
- Annotation a = (Annotation) indexMerger.getAnnotation();
- // Here we convert the original offsets to the condensed offsets. If there are
- // no cut-away types, the offsets will just remain unchanged. Otherwise we now
- // have the borders of the condensed text passages associated with the sentence
- // delimiter annotation.
- borders.add(documentText.getCondensedOffsetForOriginalOffset(a.getBegin()));
- borders.add(documentText.getCondensedOffsetForOriginalOffset(a.getEnd()));
- }
- borders.sort(null);
+ if (sentenceDelimiterTypes != null) {
+ try {
+ // the index merger gives us access to all delimiter type
+ // indexes in one
+ JCoReAnnotationIndexMerger indexMerger = new JCoReAnnotationIndexMerger(sentenceDelimiterTypes, false,
+ null, aJCas);
- // now do sentence segmentation between annotation borders
- for (int i = 1; i < borders.size(); ++i) {
- int start = borders.get(i - 1);
- int end = borders.get(i);
+ // the idea: collect all start and end offsets of sentence
+ // delimiter annotations (sections, titles, captions, ...) in a
+ // list and sort ascending; then, perform sentence segmentation
+ // between every two adjacent offsets. This way, no sentence can
+ // cross any delimiter annotation border
+ List borders = new ArrayList<>();
+ borders.add(0);
+ borders.add(documentText.getCondensedOffsetForOriginalOffset(aJCas.getDocumentText().length()));
+ while (indexMerger.incrementAnnotation()) {
+ Annotation a = (Annotation) indexMerger.getAnnotation();
+ // Here we convert the original offsets to the condensed offsets. If there are
+ // no cut-away types, the offsets will just remain unchanged. Otherwise we now
+ // have the borders of the condensed text passages associated with the sentence
+ // delimiter annotation.
+ borders.add(documentText.getCondensedOffsetForOriginalOffset(a.getBegin()));
+ borders.add(documentText.getCondensedOffsetForOriginalOffset(a.getEnd()));
+ }
+ borders.sort(null);
- // skip leading whites spaces
- while (start < end && Character.isWhitespace(aJCas.getDocumentText().charAt(start)))
- ++start;
+ // now do sentence segmentation between annotation borders
+ for (int i = 1; i < borders.size(); ++i) {
+ int start = borders.get(i - 1);
+ int end = borders.get(i);
- // get the string between the current annotation borders and recognize sentences
- String textSpan = documentText.getCodensedText().substring(start, end);
- if (!StringUtils.isBlank(textSpan))
- doSegmentation(documentText, textSpan, start);
- }
+ // skip leading white spaces
+ while (start < end && (Character.isWhitespace(documentText.getCodensedText().charAt(start))))
+ ++start;
- } catch (ClassNotFoundException e) {
- throw new AnalysisEngineProcessException(e);
- }
- } else {
- // if no processingScope set -> use documentText
- if (aJCas.getDocumentText() != null && aJCas.getDocumentText().length() > 0) {
- doSegmentation(documentText, documentText.getCodensedText(), 0);
- } else {
- if (numEmptyCases.get() < 10) {
- LOGGER.debug("document text empty. Skipping this document.");
- numEmptyCases.incrementAndGet();
- } else if (numEmptyCases.get() == 10) {
- LOGGER.warn("Encountered 10 documents with an empty text body. This message will not appear again " +
- "to avoid scrolling in cases where this is expected.");
+ // get the string between the current annotation borders and recognize sentences
+ String textSpan = documentText.getCodensedText().substring(start, end);
+ if (!StringUtils.isBlank(textSpan))
+ doSegmentation(documentText, textSpan, start);
+ }
+
+ } catch (ClassNotFoundException e) {
+ throw new AnalysisEngineProcessException(e);
}
+ } else {
+ // sentence delimiter types are not given
+ // if no processingScope set -> use documentText
+ if (aJCas.getDocumentText() != null && aJCas.getDocumentText().length() > 0) {
+ doSegmentation(documentText, documentText.getCodensedText(), 0);
+ } else {
+ if (numEmptyCases.get() < 10) {
+ LOGGER.debug("document text empty. Skipping this document.");
+ numEmptyCases.incrementAndGet();
+ } else if (numEmptyCases.get() == 10) {
+ LOGGER.warn("Encountered 10 documents with an empty text body. This message will not appear again " +
+ "to avoid scrolling in cases where this is expected.");
+ }
+ }
}
+ } catch (Throwable t) {
+ LOGGER.error("Could not perform sentence splitting of document {}", JCoReTools.getDocId(aJCas), t);
+ throw t;
}
}
@@ -359,7 +372,7 @@ private void splitAtWhitespaces(JCoReCondensedDocumentText documentText, Sentenc
lastEnd = s.getEnd();
currentSentenceLength = 0;
} else {
- LOGGER.warn("Not creating whitespace-segmented sub-sentence because its offsets would be invalid: {}-{}", subBegin, subEnd);
+ LOGGER.debug("Not creating whitespace-segmented sub-sentence because its offsets would be invalid: {}-{}", subBegin, subEnd);
}
}
currentSentenceLength += wsMatcher.end();
@@ -372,7 +385,7 @@ private void splitAtWhitespaces(JCoReCondensedDocumentText documentText, Sentenc
s.setComponentId(this.getClass().getName());
subSentences.add(s);
} else {
- LOGGER.warn("Not creating whitespace-segmented sub-sentence because its offsets would be invalid: {}-{}", subBegin, subEnd);
+ LOGGER.debug("Not creating whitespace-segmented sub-sentence because its offsets would be invalid: {}-{}", subBegin, subEnd);
}
}
diff --git a/jcore-jsbd-ae/src/main/resources/de/julielab/jcore/ae/jsbd/desc/jcore-jsbd-ae.xml b/jcore-jsbd-ae/src/main/resources/de/julielab/jcore/ae/jsbd/desc/jcore-jsbd-ae.xml
index 8bb60791a..b1293df62 100644
--- a/jcore-jsbd-ae/src/main/resources/de/julielab/jcore/ae/jsbd/desc/jcore-jsbd-ae.xml
+++ b/jcore-jsbd-ae/src/main/resources/de/julielab/jcore/ae/jsbd/desc/jcore-jsbd-ae.xml
@@ -6,7 +6,7 @@
de.julielab.jcore.ae.jsbd.main.SentenceAnnotatorDescriptor automatically generated by uimaFIT
- 2.5.1-SNAPSHOT
+ 2.6.0de.julielab.jcore.ae.jsbd.main
diff --git a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/Abstract2UnitPipeTest.java b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/Abstract2UnitPipeTest.java
index 3d7f63cc7..91ffa9f45 100644
--- a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/Abstract2UnitPipeTest.java
+++ b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/Abstract2UnitPipeTest.java
@@ -15,8 +15,8 @@
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import org.assertj.core.data.Offset;
-import org.junit.Before;
-import org.junit.Test;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
import java.util.ArrayList;
import java.util.List;
@@ -27,7 +27,7 @@ public class Abstract2UnitPipeTest {
protected static Pipe pipe;
- @Before
+ @BeforeEach
public void init() {
pipe = new Abstract2UnitPipe(false);
}
diff --git a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/SentenceSplitterTest.java b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/SentenceSplitterTest.java
index 8715c714b..a3ce21a17 100644
--- a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/SentenceSplitterTest.java
+++ b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/SentenceSplitterTest.java
@@ -18,7 +18,7 @@
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -26,8 +26,8 @@
import java.util.ArrayList;
import java.util.List;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
/**
* Test for the class {@link SentenceSplitter}
diff --git a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java
index 5506d38b8..1e820d945 100644
--- a/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java
+++ b/jcore-jsbd-ae/src/test/java/de/julielab/jcore/ae/jsbd/main/SentenceAnnotatorTest.java
@@ -1,17 +1,17 @@
-/**
+/**
* SentenceAnnotatorTest.java
- *
+ *
* Copyright (c) 2015, JULIE Lab.
- * All rights reserved. This program and the accompanying materials
+ * All rights reserved. This program and the accompanying materials
* are made available under the terms of the BSD-2-Clause License
- *
+ *
* Author: tomanek
- *
+ *
* Current version: 2.2
* Since version: 1.0
- *
- * Creation date: Nov 29, 2006
- *
+ *
+ * Creation date: Nov 29, 2006
+ *
* This is a JUnit test for the SentenceAnnotator.
**/
@@ -25,6 +25,7 @@
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.util.JCasUtil;
@@ -34,276 +35,330 @@
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;
import org.apache.uima.util.XMLInputSource;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
+import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;
import static org.assertj.core.api.Assertions.assertThat;
-import static org.junit.Assert.*;
+import static org.assertj.core.api.Assertions.assertThatCode;
+import static org.junit.jupiter.api.Assertions.*;
+
public class SentenceAnnotatorTest {
- /**
- * Logger for this class
- */
- private static final Logger LOGGER = LoggerFactory.getLogger(SentenceAnnotatorTest.class);
-
- private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties";
-
- // uncomment to test with/without scope
- // private static final String DESCRIPTOR =
- // "src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml";
- private static final String DESCRIPTOR = "src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml";
-
- // last sentence has no EOS symbol to test that also this is handled
- // correctly
- private static final String[] TEST_TEXT = { "First sentence. Second \t sentence! \n Last sentence?",
- "Hallo, jemand da? Nein, niemand.", "A test. It can't be just one sentence. Testing the test.", "" };
-
- private static final String[] TEST_TEXT_OFFSETS = { "0-15;16-34;40-54", "0-17;18-32", "0-7;8-38;39-56", "" };
-
- private static final int[] endOffsets = { 54, 32, 27, 0 };
-
- /**
- * Use the model in resources, split the text in TEST_TEXT and compare the
- * split result against TEST_TEXT_OFFSETS
- */
- @Test
- public void testProcess() {
-
- boolean annotationsOK = true;
-
- XMLInputSource sentenceXML = null;
- ResourceSpecifier sentenceSpec = null;
- AnalysisEngine sentenceAnnotator = null;
-
- try {
- sentenceXML = new XMLInputSource(DESCRIPTOR);
- sentenceSpec = UIMAFramework.getXMLParser().parseResourceSpecifier(sentenceXML);
- sentenceAnnotator = UIMAFramework.produceAnalysisEngine(sentenceSpec);
- } catch (Exception e) {
- LOGGER.error("testProcess()", e);
- }
-
- for (int i = 0; i < TEST_TEXT.length; i++) {
-
- JCas jcas = null;
- try {
- jcas = sentenceAnnotator.newJCas();
- } catch (ResourceInitializationException e) {
- LOGGER.error("testProcess()", e);
- }
-
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug("testProcess() - testing text: " + TEST_TEXT[i]);
- }
- jcas.setDocumentText(TEST_TEXT[i]);
-
- // make one test scope ranging over complete document text
- // annotations for the processing scope
- TestScope scope1 = new TestScope(jcas, 0, endOffsets[i]);
- scope1.addToIndexes();
- // TestScope scope2 = new TestScope(jcas,37,54);
-
-
- try {
- sentenceAnnotator.process(jcas, null);
- } catch (Exception e) {
- LOGGER.error("testProcess()", e);
- }
-
- // get the offsets of the sentences
- JFSIndexRepository indexes = jcas.getJFSIndexRepository();
- Iterator sentIter = indexes.getAnnotationIndex(Sentence.type).iterator();
-
- String predictedOffsets = getPredictedOffsets(i, sentIter);
-
- // compare offsets
- if (!predictedOffsets.equals(TEST_TEXT_OFFSETS[i])) {
- annotationsOK = false;
- continue;
- }
- }
- assertTrue(annotationsOK);
- }
-
-
- private String getPredictedOffsets(int i, Iterator sentIter) {
- String predictedOffsets = "";
- while (sentIter.hasNext()) {
- Sentence s = (Sentence) sentIter.next();
- LOGGER.debug("sentence: " + s.getCoveredText() + ": " + s.getBegin() + " - " + s.getEnd());
- predictedOffsets += (predictedOffsets.length() > 0) ? ";" : "";
- predictedOffsets += s.getBegin() + "-" + s.getEnd();
- }
-
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug("testProcess() - predicted: " + predictedOffsets);
- }
- if (LOGGER.isDebugEnabled()) {
- LOGGER.debug("testProcess() - wanted: " + TEST_TEXT_OFFSETS[i]);
- }
- return predictedOffsets;
- }
-
- @Test
- public void testUimaFitIntegration() throws UIMAException, IOException {
- AnalysisEngine sentenceAE = AnalysisEngineFactory.createEngine(SentenceAnnotator.class,
- SentenceAnnotator.PARAM_MODEL_FILE, "de/julielab/jcore/ae/jsbd/model/test-model.gz",
- SentenceAnnotator.PARAM_POSTPROCESSING, "biomed");
- JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types");
- String abstractText = FileUtils.readFileToString(new File("src/test/resources/test-abstract.txt"), "UTF-8");
- cas.setDocumentText(abstractText);
- sentenceAE.process(cas);
- Collection sentences = JCasUtil.select(cas, Sentence.class);
- for (Sentence sentence : sentences) {
- System.out.println(sentence.getCoveredText());
- }
- assertEquals(14, sentences.size());
- }
-
- @Test
- public void testModelClassPathResource() throws Exception {
- AnalysisEngine sentenceAE = AnalysisEngineFactory.createEngine(SentenceAnnotator.class,
- SentenceAnnotator.PARAM_MODEL_FILE, "de/julielab/jcore/ae/jsbd/model/test-model.gz",
- SentenceAnnotator.PARAM_POSTPROCESSING, "biomed");
- JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types");
- String abstractText = FileUtils.readFileToString(new File("src/test/resources/test-abstract.txt"), "UTF-8");
- cas.setDocumentText(abstractText);
- sentenceAE.process(cas);
- Collection sentences = JCasUtil.select(cas, Sentence.class);
- System.out.println(sentences.size());
- for (Sentence sentence : sentences) {
- System.out.println(sentence.getCoveredText());
- }
- assertEquals(14, sentences.size());
- }
-
- @Test
- public void testSentenceDelimiterTypes() throws Exception {
- JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types",
- "de.julielab.jcore.types.jcore-document-structure-types");
-
- jCas.setDocumentText("Introduction " + "We here show good results. This is a figure caption "
- + "And this is a paragraph without a fullstop for some reason " + "Conclusion "
- + "We are the greatest.");
- Title t1 = new Title(jCas, 0, 12);
- Caption c = new Caption(jCas, 40, 64);
- Paragraph p = new Paragraph(jCas, 65, 123);
- Title t2 = new Title(jCas, 124, 134);
- t1.addToIndexes();
- c.addToIndexes();
- p.addToIndexes();
- t2.addToIndexes();
- assertEquals("Introduction", t1.getCoveredText());
- assertEquals("This is a figure caption", c.getCoveredText());
- assertEquals("And this is a paragraph without a fullstop for some reason", p.getCoveredText());
- assertEquals("Conclusion", t2.getCoveredText());
-
- AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE,
- "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES,
- new LinkedHashSet(
- Arrays.asList(Title.class.getName(), Caption.class.getName(), Paragraph.class.getName())));
-
- jsbd.process(jCas.getCas());
-
- Set> expectedSpans = new HashSet<>();
- expectedSpans.add(Range.between(0, 12));
- expectedSpans.add(Range.between(13, 39));
- expectedSpans.add(Range.between(40, 64));
- expectedSpans.add(Range.between(65, 123));
- expectedSpans.add(Range.between(124, 134));
- expectedSpans.add(Range.between(135, 155));
-
- FSIterator it = jCas.getAnnotationIndex(Sentence.type).iterator();
- assertTrue(it.hasNext());
- while (it.hasNext()) {
- Annotation sentence = it.next();
- Range sentenceRange = Range.between(sentence.getBegin(), sentence.getEnd());
- assertTrue("Range " + sentenceRange + " was not expected", expectedSpans.remove(sentenceRange));
- }
- assertTrue(expectedSpans.isEmpty());
- }
-
- @Test
- public void testSentenceWhitespaces() throws Exception {
- JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types",
- "de.julielab.jcore.types.jcore-document-structure-types");
-
- // This text is taken from pmid 23092121
- jCas.setDocumentText(" : We present a theoretical study of the electronic subband structure and collective electronic excitation associated with plasmon and surface plasmon modes in metal-based hollow nanosphere. The dependence of the electronic subband energy on the sample parameters of the hollow nanosphere is examined.");
-
- AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE,
- "de/julielab/jcore/ae/jsbd/model/test-model.gz");
-
- jsbd.process(jCas.getCas());
+ /**
+ * Logger for this class
+ */
+ private static final Logger LOGGER = LoggerFactory.getLogger(SentenceAnnotatorTest.class);
+
+ private static final String LOGGER_PROPERTIES = "src/test/java/log4j.properties";
+
+ // uncomment to test with/without scope
+ // private static final String DESCRIPTOR =
+ // "src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml";
+ private static final String DESCRIPTOR = "src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml";
+
+ // last sentence has no EOS symbol to test that also this is handled
+ // correctly
+ private static final String[] TEST_TEXT = {"First sentence. Second \t sentence! \n Last sentence?",
+ "Hallo, jemand da? Nein, niemand.", "A test. It can't be just one sentence. Testing the test.", ""};
+
+ private static final String[] TEST_TEXT_OFFSETS = {"0-15;16-34;40-54", "0-17;18-32", "0-7;8-38;39-56", ""};
+
+ private static final int[] endOffsets = {54, 32, 27, 0};
+
+ /**
+ * Use the model in resources, split the text in TEST_TEXT and compare the
+ * split result against TEST_TEXT_OFFSETS
+ */
+ @Test
+ public void testProcess() {
+
+ boolean annotationsOK = true;
+
+ XMLInputSource sentenceXML = null;
+ ResourceSpecifier sentenceSpec = null;
+ AnalysisEngine sentenceAnnotator = null;
+
+ try {
+ sentenceXML = new XMLInputSource(DESCRIPTOR);
+ sentenceSpec = UIMAFramework.getXMLParser().parseResourceSpecifier(sentenceXML);
+ sentenceAnnotator = UIMAFramework.produceAnalysisEngine(sentenceSpec);
+ } catch (Exception e) {
+ LOGGER.error("testProcess()", e);
+ }
+
+ for (int i = 0; i < TEST_TEXT.length; i++) {
+
+ JCas jcas = null;
+ try {
+ jcas = sentenceAnnotator.newJCas();
+ } catch (ResourceInitializationException e) {
+ LOGGER.error("testProcess()", e);
+ }
+
+ if (LOGGER.isDebugEnabled()) {
+ LOGGER.debug("testProcess() - testing text: " + TEST_TEXT[i]);
+ }
+ jcas.setDocumentText(TEST_TEXT[i]);
+
+ // make one test scope ranging over complete document text
+ // annotations for the processing scope
+ TestScope scope1 = new TestScope(jcas, 0, endOffsets[i]);
+ scope1.addToIndexes();
+ // TestScope scope2 = new TestScope(jcas,37,54);
+
+
+ try {
+ sentenceAnnotator.process(jcas, null);
+ } catch (Exception e) {
+ LOGGER.error("testProcess()", e);
+ }
+
+ // get the offsets of the sentences
+ JFSIndexRepository indexes = jcas.getJFSIndexRepository();
+ Iterator sentIter = indexes.getAnnotationIndex(Sentence.type).iterator();
+
+ String predictedOffsets = getPredictedOffsets(i, sentIter);
+
+ // compare offsets
+ if (!predictedOffsets.equals(TEST_TEXT_OFFSETS[i])) {
+ annotationsOK = false;
+ continue;
+ }
+ }
+ assertTrue(annotationsOK);
+ }
+
+
+ private String getPredictedOffsets(int i, Iterator sentIter) {
+ String predictedOffsets = "";
+ while (sentIter.hasNext()) {
+ Sentence s = (Sentence) sentIter.next();
+ LOGGER.debug("sentence: " + s.getCoveredText() + ": " + s.getBegin() + " - " + s.getEnd());
+ predictedOffsets += (predictedOffsets.length() > 0) ? ";" : "";
+ predictedOffsets += s.getBegin() + "-" + s.getEnd();
+ }
+
+ if (LOGGER.isDebugEnabled()) {
+ LOGGER.debug("testProcess() - predicted: " + predictedOffsets);
+ }
+ if (LOGGER.isDebugEnabled()) {
+ LOGGER.debug("testProcess() - wanted: " + TEST_TEXT_OFFSETS[i]);
+ }
+ return predictedOffsets;
+ }
+
+ @Test
+ public void testUimaFitIntegration() throws UIMAException, IOException {
+ AnalysisEngine sentenceAE = AnalysisEngineFactory.createEngine(SentenceAnnotator.class,
+ SentenceAnnotator.PARAM_MODEL_FILE, "de/julielab/jcore/ae/jsbd/model/test-model.gz",
+ SentenceAnnotator.PARAM_POSTPROCESSING, "biomed");
+ JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types");
+ String abstractText = FileUtils.readFileToString(new File("src/test/resources/test-abstract.txt"), "UTF-8");
+ cas.setDocumentText(abstractText);
+ sentenceAE.process(cas);
+ Collection sentences = JCasUtil.select(cas, Sentence.class);
+ for (Sentence sentence : sentences) {
+ System.out.println(sentence.getCoveredText());
+ }
+ assertEquals(14, sentences.size());
+ }
+
+ @Test
+ public void testModelClassPathResource() throws Exception {
+ AnalysisEngine sentenceAE = AnalysisEngineFactory.createEngine(SentenceAnnotator.class,
+ SentenceAnnotator.PARAM_MODEL_FILE, "de/julielab/jcore/ae/jsbd/model/test-model.gz",
+ SentenceAnnotator.PARAM_POSTPROCESSING, "biomed");
+ JCas cas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types");
+ String abstractText = FileUtils.readFileToString(new File("src/test/resources/test-abstract.txt"), "UTF-8");
+ cas.setDocumentText(abstractText);
+ sentenceAE.process(cas);
+ Collection sentences = JCasUtil.select(cas, Sentence.class);
+ System.out.println(sentences.size());
+ for (Sentence sentence : sentences) {
+ System.out.println(sentence.getCoveredText());
+ }
+ assertEquals(14, sentences.size());
+ }
+
+ @Test
+ public void testSentenceDelimiterTypes() throws Exception {
+ JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types",
+ "de.julielab.jcore.types.jcore-document-structure-types");
+
+ jCas.setDocumentText("Introduction " + "We here show good results. This is a figure caption "
+ + "And this is a paragraph without a fullstop for some reason " + "Conclusion "
+ + "We are the greatest.");
+ Title t1 = new Title(jCas, 0, 12);
+ Caption c = new Caption(jCas, 40, 64);
+ Paragraph p = new Paragraph(jCas, 65, 123);
+ Title t2 = new Title(jCas, 124, 134);
+ t1.addToIndexes();
+ c.addToIndexes();
+ p.addToIndexes();
+ t2.addToIndexes();
+ assertEquals("Introduction", t1.getCoveredText());
+ assertEquals("This is a figure caption", c.getCoveredText());
+ assertEquals("And this is a paragraph without a fullstop for some reason", p.getCoveredText());
+ assertEquals("Conclusion", t2.getCoveredText());
+
+ AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE,
+ "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES,
+ new LinkedHashSet(
+ Arrays.asList(Title.class.getName(), Caption.class.getName(), Paragraph.class.getName())));
+
+ jsbd.process(jCas.getCas());
+
+ Set> expectedSpans = new HashSet<>();
+ expectedSpans.add(Range.between(0, 12));
+ expectedSpans.add(Range.between(13, 39));
+ expectedSpans.add(Range.between(40, 64));
+ expectedSpans.add(Range.between(65, 123));
+ expectedSpans.add(Range.between(124, 134));
+ expectedSpans.add(Range.between(135, 155));
+
+ FSIterator it = jCas.getAnnotationIndex(Sentence.type).iterator();
+ assertTrue(it.hasNext());
+ while (it.hasNext()) {
+ Annotation sentence = it.next();
+ Range sentenceRange = Range.between(sentence.getBegin(), sentence.getEnd());
+ assertTrue(expectedSpans.remove(sentenceRange), "Range " + sentenceRange + " was not expected");
+ }
+ assertTrue(expectedSpans.isEmpty());
+ }
+
+ @Test
+ public void testSentenceWhitespaces() throws Exception {
+ JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types",
+ "de.julielab.jcore.types.jcore-document-structure-types");
+
+ // This text is taken from pmid 23092121
+ jCas.setDocumentText(" : We present a theoretical study of the electronic subband structure and collective electronic excitation associated with plasmon and surface plasmon modes in metal-based hollow nanosphere. The dependence of the electronic subband energy on the sample parameters of the hollow nanosphere is examined.");
+
+ AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE,
+ "de/julielab/jcore/ae/jsbd/model/test-model.gz");
+
+ jsbd.process(jCas.getCas());
Sentence sentence = JCasUtil.select(jCas, Sentence.class).iterator().next();
assertFalse(sentence.getCoveredText().startsWith(" "));
}
- @Test
- public void testTrailingNewline() throws Exception {
- JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types",
- "de.julielab.jcore.types.jcore-document-structure-types");
+ @Test
+ public void testTrailingNewline() throws Exception {
+ JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types",
+ "de.julielab.jcore.types.jcore-document-structure-types");
- // This text is taken from PMC3408706. Note the "paragraph separator" at the end
- jCas.setDocumentText("In1 the next step, we plan to use higher level QM/MM methods to calculate the energy barrier of the reaction catalyzed by endonuclease APE1, in compliance with the mechanism proposed, and to screen for effective inhibitors with the use of the constructed mechanistic full-atomic model of the enzyme. \u2029");
+ // This text is taken from PMC3408706. Note the "paragraph separator" at the end
+ jCas.setDocumentText("In1 the next step, we plan to use higher level QM/MM methods to calculate the energy barrier of the reaction catalyzed by endonuclease APE1, in compliance with the mechanism proposed, and to screen for effective inhibitors with the use of the constructed mechanistic full-atomic model of the enzyme. \u2029");
new InternalReference(jCas, 2, 3).addToIndexes();
- AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE,
- "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{InternalReference.class.getCanonicalName()});
-
- jsbd.process(jCas.getCas());
-
-
- Sentence sentence = JCasUtil.select(jCas, Sentence.class).iterator().next();
- assertFalse(sentence.getCoveredText().endsWith("\u2029"));
- }
-
- @Test
- public void testSplitAtNewlines() throws Exception {
- JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types",
- "de.julielab.jcore.types.jcore-document-structure-types");
-
- String ls = System.getProperty("line.separator");
- jCas.setDocumentText("line1"+ls+"line2"+ls+"line3");
-
- AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE,
- "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_ALWAYS_SPLIT_NEWLINE, true);
-
- jsbd.process(jCas.getCas());
-
-
- Collection sentences = JCasUtil.select(jCas, Sentence.class).stream().map(Annotation::getCoveredText).collect(Collectors.toList());
- assertThat(sentences).containsExactly("line1", "line2", "line3");
- }
-
-//
-// @Test
-// public void testmuh() throws Exception {
-// JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types",
-// "de.julielab.jcore.types.jcore-document-structure-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types",
-// "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types");
-//
-// XmiCasDeserializer.deserialize(new FileInputStream("/Users/faessler/uima-pipelines/jedis-doc-to-xmi/data/output-xmi/4768370.xmi"), jCas.getCas());
-// JCasUtil.select(jCas, Sentence.class).forEach(Annotation::removeFromIndexes);
-// AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE,
-// "/Users/faessler/Coding/git/jcore-projects/jcore-jsbd-ae-biomedical-english/src/main/resources/de/julielab/jcore/ae/jsbd/model/jsbd-biomed-oversampled-abstracts-split-at-punctuation.mod.gz", SentenceAnnotator.PARAM_MAX_SENTENCE_LENGTH, 1000);
-//
-// jsbd.process(jCas.getCas());
-//
-// Set set = new TreeSet<>();
-// for (Sentence s : JCasUtil.select(jCas, Sentence.class)) {
-// set.add(s.getEnd() - s.getBegin());
-// }
-// XmiCasSerializer.serialize(jCas.getCas(), new FileOutputStream("smallSentences.xmi"));
-// }
+ AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE,
+ "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{InternalReference.class.getCanonicalName()});
+
+ jsbd.process(jCas.getCas());
+
+
+ Sentence sentence = JCasUtil.select(jCas, Sentence.class).iterator().next();
+ assertFalse(sentence.getCoveredText().endsWith("\u2029"));
+ }
+
+ @Test
+ public void testSplitAtNewlines() throws Exception {
+ JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types",
+ "de.julielab.jcore.types.jcore-document-structure-types");
+
+ String ls = System.getProperty("line.separator");
+ jCas.setDocumentText("line1" + ls + "line2" + ls + "line3");
+
+ AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE,
+ "de/julielab/jcore/ae/jsbd/model/test-model.gz", SentenceAnnotator.PARAM_ALWAYS_SPLIT_NEWLINE, true);
+
+ jsbd.process(jCas.getCas());
+
+
+ Collection sentences = JCasUtil.select(jCas, Sentence.class).stream().map(Annotation::getCoveredText).collect(Collectors.toList());
+ assertThat(sentences).containsExactly("line1", "line2", "line3");
+ }
+
+
+ @Test
+ public void testErrordoc() throws Exception {
+ // The XMI document used here is from PMC and is an example of a source of error that previously occurred.
+ JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types",
+ "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types",
+ "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types");
+
+ XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "errordocs", "PMC5478802.xmi").toFile()), jCas.getCas());
+ JCasUtil.select(jCas, Sentence.class).forEach(Annotation::removeFromIndexes);
+ AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE,
+ "de/julielab/jcore/ae/jsbd/model/test-model.gz",
+ SentenceAnnotator.PARAM_MAX_SENTENCE_LENGTH, 1000,
+ SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, new String[]{
+ "de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"},
+ SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{de.julielab.jcore.types.pubmed.InternalReference.class.getCanonicalName()}
+ );
+
+ assertThatCode(() -> jsbd.process(jCas.getCas())).doesNotThrowAnyException();
+ }
+
+ @Test
+ public void testErrordoc2() throws Exception {
+ // This XMI file has larger cut away types where an original offset request actually lies inside of a
+ // cut away annotation. This case led to errors prior to a respective bug fix in the
+ // JCoReCondensedDocumentText
+ JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types",
+ "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types",
+ "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types");
+
+ XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "errordocs", "PMC8205280.xmi").toFile()), jCas.getCas());
+ JCasUtil.select(jCas, Sentence.class).forEach(Annotation::removeFromIndexes);
+ AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE,
+ "de/julielab/jcore/ae/jsbd/model/test-model.gz",
+ SentenceAnnotator.PARAM_MAX_SENTENCE_LENGTH, 1000,
+ SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, new String[]{
+ "de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"},
+ SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{de.julielab.jcore.types.pubmed.InternalReference.class.getCanonicalName()}
+ );
+
+ assertThatCode(() -> jsbd.process(jCas.getCas())).doesNotThrowAnyException();
+ }
+
+ @Test
+ public void testErrordoc3() throws Exception {
+ // This document has multiple sentences that begin with a Figure reference mention ("Figure 2 shows...").
+ // By cutting away all the internal reference annotation spans for sentence tagging, the "Figure 2" was
+ // ultimately appended to the previous sentence, causing errors. Thus, the option to omit internal references
+ // with letters was added to the condensed document text. This is a test that everything is working as intended.
+ JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types",
+ "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types",
+ "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.jcore-semantics-biology-types", "de.julielab.jcore.types.extensions.jcore-semantics-mention-extension-types");
+
+ XmiCasDeserializer.deserialize(new FileInputStream(Path.of("src", "test", "resources", "errordocs", "PMC5070457.xmi").toFile()), jCas.getCas());
+ JCasUtil.select(jCas, Sentence.class).forEach(Annotation::removeFromIndexes);
+ AnalysisEngine jsbd = AnalysisEngineFactory.createEngine(SentenceAnnotator.class, SentenceAnnotator.PARAM_MODEL_FILE,
+ "de/julielab/jcore/ae/jsbd/model/test-model.gz",
+ SentenceAnnotator.PARAM_MAX_SENTENCE_LENGTH, 1000,
+ SentenceAnnotator.PARAM_SENTENCE_DELIMITER_TYPES, new String[]{
+ "de.julielab.jcore.types.Title", "de.julielab.jcore.types.AbstractText", "de.julielab.jcore.types.AbstractSectionHeading", "de.julielab.jcore.types.AbstractSection", "de.julielab.jcore.types.Section", "de.julielab.jcore.types.Paragraph", "de.julielab.jcore.types.Zone", "de.julielab.jcore.types.Caption", "de.julielab.jcore.types.Figure", "de.julielab.jcore.types.Table"},
+ SentenceAnnotator.PARAM_CUT_AWAY_TYPES, new String[]{de.julielab.jcore.types.pubmed.InternalReference.class.getCanonicalName()}
+ );
+ assertThatCode(() -> jsbd.process(jCas.getCas())).doesNotThrowAnyException();
+ Collection sentences = JCasUtil.select(jCas, Sentence.class);
+ for (var s : sentences) {
+ String coveredText = s.getCoveredText();
+ if (coveredText.contains("They concluded"))
+ assertThat(coveredText).endsWith("filament19.");
+ }
+ }
}
diff --git a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml
index 66314d4bf..0bcda6a91 100644
--- a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml
+++ b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotatorTest.xml
@@ -6,7 +6,7 @@
JCoRe Sentence AnnotatorThis is the UIMA Wrapper for the JULIE Sentence Boundary Detector.
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml
index 63b003324..41089e381 100644
--- a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml
+++ b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/SentenceAnnotator_with-scope_Test.xml
@@ -6,7 +6,7 @@
JCoRe Sentence AnnotatorThis is the UIMA Wrapper for the JULIE Sentence Boundary Detector.
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
@@ -54,6 +54,7 @@
+
diff --git a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/paragraph-scope-type.xml b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/paragraph-scope-type.xml
index 282896d88..8b95a7994 100644
--- a/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/paragraph-scope-type.xml
+++ b/jcore-jsbd-ae/src/test/resources/de/julielab/jcore/ae/jsbd/desc/paragraph-scope-type.xml
@@ -2,7 +2,7 @@
test-entity-type.xmlA mini type system with one type only, used for testing consistency preservation
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-jsbd-ae/src/test/resources/errordocs/PMC5070457.xmi b/jcore-jsbd-ae/src/test/resources/errordocs/PMC5070457.xmi
new file mode 100644
index 000000000..dd0c227ca
--- /dev/null
+++ b/jcore-jsbd-ae/src/test/resources/errordocs/PMC5070457.xmi
@@ -0,0 +1,5 @@
+
+PMC5070457
\ No newline at end of file
diff --git a/jcore-jsbd-ae/src/test/resources/errordocs/PMC5478802.xmi b/jcore-jsbd-ae/src/test/resources/errordocs/PMC5478802.xmi
new file mode 100644
index 000000000..c4d8ca95a
--- /dev/null
+++ b/jcore-jsbd-ae/src/test/resources/errordocs/PMC5478802.xmi
@@ -0,0 +1,5 @@
+
+PMC5478802
\ No newline at end of file
diff --git a/jcore-jsbd-ae/src/test/resources/errordocs/PMC8205280.xmi b/jcore-jsbd-ae/src/test/resources/errordocs/PMC8205280.xmi
new file mode 100644
index 000000000..b2063eca5
--- /dev/null
+++ b/jcore-jsbd-ae/src/test/resources/errordocs/PMC8205280.xmi
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/jcore-jsbd-ae/src/test/resources/errordocs/README.md b/jcore-jsbd-ae/src/test/resources/errordocs/README.md
new file mode 100644
index 000000000..d2278611f
--- /dev/null
+++ b/jcore-jsbd-ae/src/test/resources/errordocs/README.md
@@ -0,0 +1,4 @@
+# Errored Documents for Tests
+
+Documents in this directory were subject to sentence splitting errors. The errors have been
+fixed and the documents are used in tests to guard against regressions.
\ No newline at end of file
diff --git a/jcore-jtbd-ae/component.meta b/jcore-jtbd-ae/component.meta
index 377c042d7..0cd1c8929 100644
--- a/jcore-jtbd-ae/component.meta
+++ b/jcore-jtbd-ae/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-jtbd-ae",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe Token Annotator"
}
diff --git a/jcore-jtbd-ae/pom.xml b/jcore-jtbd-ae/pom.xml
index 03523ba12..e811fa22f 100644
--- a/jcore-jtbd-ae/pom.xml
+++ b/jcore-jtbd-ae/pom.xml
@@ -10,14 +10,15 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0
+ org.apache.maven.pluginsmaven-assembly-plugin
- 2.4
+ 3.3.0jar-with-dependencies
@@ -85,14 +86,25 @@
jcore-types${jcore-types-version}
+
+ de.julielab
+ jcore-utilities
+ ${jcore-utilities-version}
+ cc.malletmallet2.0.8
+
+
+ junit
+ junit
+
+
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-engineJCoRe Token Annotator
diff --git a/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipe.java b/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipe.java
index c52e1ad12..833f97e8f 100755
--- a/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipe.java
+++ b/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipe.java
@@ -524,7 +524,7 @@ else if (superUnitRep.length() <= 8)
// check whether superunit might be a chemical
// therefor we check the number typical special characters contained
- if ((superUnitRep.length() > 6)
+ if ((superUnitRep.length() > 6 && superUnitRep.length() < 200)
&& superUnitRep.matches("(.*[\\W].*){5,}")
&& !superUnitRep.contains("-->"))
token.setFeatureValue("SU_isChemical", 1);
diff --git a/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotator.java b/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotator.java
index 1ddd664f7..c073983a2 100644
--- a/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotator.java
+++ b/jcore-jtbd-ae/src/main/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotator.java
@@ -26,6 +26,7 @@
import de.julielab.jcore.ae.jtbd.Unit;
import de.julielab.jcore.types.Sentence;
import de.julielab.jcore.types.Token;
+import de.julielab.jcore.utility.JCoReTools;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
@@ -153,8 +154,12 @@ public void process(final JCas aJCas) throws AnalysisEngineProcessException {
int length = sentence.getEnd() - sentence
.getBegin();
LOGGER.debug("going to next sentence having length: " + length);
- if (length > 1000)
- LOGGER.warn("Current sentence has length {}.", length);
+ if (length > 1000) {
+ if (LOGGER.isWarnEnabled()) {
+ String docId = JCoReTools.getDocId(aJCas);
+ LOGGER.warn("Current sentence has length {} (document ID {}).", length, docId);
+ }
+ }
final String text = sentence.getCoveredText();
writeTokensToCAS(text, sentence.getBegin(), aJCas);
}
diff --git a/jcore-jtbd-ae/src/main/resources/de/julielab/jcore/ae/jtbd/desc/jcore-jtbd.xml b/jcore-jtbd-ae/src/main/resources/de/julielab/jcore/ae/jtbd/desc/jcore-jtbd.xml
index 337463371..3e8e5a5e0 100644
--- a/jcore-jtbd-ae/src/main/resources/de/julielab/jcore/ae/jtbd/desc/jcore-jtbd.xml
+++ b/jcore-jtbd-ae/src/main/resources/de/julielab/jcore/ae/jtbd/desc/jcore-jtbd.xml
@@ -6,7 +6,7 @@
JCoRe Token Annotator
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipeTest.java b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipeTest.java
index 46d4826c1..140945584 100644
--- a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipeTest.java
+++ b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/Sentence2TokenPipeTest.java
@@ -17,19 +17,22 @@
package de.julielab.jcore.ae.jtbd;
-import junit.framework.TestCase;
+import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
-public class Sentence2TokenPipeTest extends TestCase {
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class Sentence2TokenPipeTest {
private static final Logger LOGGER = LoggerFactory
.getLogger(Sentence2TokenPipeTest.class);
private static final String TEST_SENTENCE = "this is a \t junit -test";
+ @Test
public void testMakeLabel() {
final ArrayList expectedLabels = new ArrayList();
expectedLabels.add("P");
@@ -55,6 +58,7 @@ public void testMakeLabel() {
assertTrue(allOK);
}
+ @Test
public void testMakeUnits() {
final ArrayList expectedUnits = new ArrayList();
expectedUnits.add("this");
diff --git a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/TokenizerTest.java b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/TokenizerTest.java
index c953307c1..e99c1f2f2 100644
--- a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/TokenizerTest.java
+++ b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/TokenizerTest.java
@@ -24,7 +24,7 @@
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -39,7 +39,7 @@
import java.util.List;
import java.util.stream.Collectors;
-import static org.junit.Assert.*;
+import static org.junit.jupiter.api.Assertions.*;
/**
* Test for the class {@link Tokenizer}
diff --git a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotatorTest.java b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotatorTest.java
index 4e3dfe9b3..543abf443 100644
--- a/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotatorTest.java
+++ b/jcore-jtbd-ae/src/test/java/de/julielab/jcore/ae/jtbd/main/TokenAnnotatorTest.java
@@ -18,7 +18,6 @@
import de.julielab.jcore.types.Sentence;
import de.julielab.jcore.types.Token;
-import junit.framework.TestCase;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.jcas.JCas;
@@ -26,13 +25,15 @@
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;
import org.apache.uima.util.XMLInputSource;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Iterator;
-public class TokenAnnotatorTest extends TestCase {
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class TokenAnnotatorTest {
/**
* Logger for this class
diff --git a/jcore-jtbd-ae/src/test/resources/de/julielab/jcore/ae/jtbd/desc/TokenAnnotatorTest.xml b/jcore-jtbd-ae/src/test/resources/de/julielab/jcore/ae/jtbd/desc/TokenAnnotatorTest.xml
index 6a670af49..415da5d4c 100644
--- a/jcore-jtbd-ae/src/test/resources/de/julielab/jcore/ae/jtbd/desc/TokenAnnotatorTest.xml
+++ b/jcore-jtbd-ae/src/test/resources/de/julielab/jcore/ae/jtbd/desc/TokenAnnotatorTest.xml
@@ -6,7 +6,7 @@
JCoRe Token Annotator
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
diff --git a/jcore-julielab-entity-evaluator-consumer/component.meta b/jcore-julielab-entity-evaluator-consumer/component.meta
index 9ffe2edc3..78d9a4f68 100644
--- a/jcore-julielab-entity-evaluator-consumer/component.meta
+++ b/jcore-julielab-entity-evaluator-consumer/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-julielab-entity-evaluator-consumer",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe JULIE Lab Entity Evaluator Converter"
}
diff --git a/jcore-julielab-entity-evaluator-consumer/pom.xml b/jcore-julielab-entity-evaluator-consumer/pom.xml
index 7ad4d9597..4b5547be5 100644
--- a/jcore-julielab-entity-evaluator-consumer/pom.xml
+++ b/jcore-julielab-entity-evaluator-consumer/pom.xml
@@ -4,7 +4,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0jcore-julielab-entity-evaluator-consumerJCoRe JULIE Lab Entity Evaluator Converter
@@ -18,7 +18,7 @@
de.julielabjulielab-entity-evaluator
- 1.2.0
+ 1.3.0de.julielab
@@ -45,8 +45,8 @@
julielab-java-utilities
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-engineorg.apache.commons
diff --git a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java
index b92b32ad1..5dadad803 100644
--- a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java
+++ b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumer.java
@@ -56,6 +56,7 @@ public class EntityEvaluatorConsumer extends JCasAnnotator_ImplBase {
public static final String PARAM_TYPE_PREFIX = "TypePrefix";
public final static String PARAM_ENTITY_TYPES = "EntityTypes";
public static final String PARAM_FEATURE_FILTERS = "FeatureFilters";
+ public static final String PARAM_ALLOW_REGEX_FOR_FILTERS = "AllowRegexForFilters";
public final static String PARAM_OFFSET_MODE = "OffsetMode";
public final static String PARAM_OFFSET_SCOPE = "OffsetScope";
public final static String PARAM_OUTPUT_FILE = "OutputFile";
@@ -77,6 +78,8 @@ public class EntityEvaluatorConsumer extends JCasAnnotator_ImplBase {
private String typePrefix;
@ConfigurationParameter(name = PARAM_FEATURE_FILTERS, mandatory = false, description = "Optional. Only lets those entities contribute to the output file that fulfill the given feature value(s). The syntax is :=. The ':' prefix is optional. If omitted, the filters will be applied to all entities given in the " + PARAM_ENTITY_TYPES + " parameter. An arbitrary number of filter expressions may be specified. In such cases, it is important to understand the boolean structure after which the expressions are evaluated in order to omit an annotation or take it into account for the output. The filter expressions are first grouped by feature path. Within such a group, the filter values form a disjunction. Thus, if any filter in a group is satisfied, the whole group is satisfied. The different groups form a conjunction. Thus, if any group is not satisfied, the whole conjunction is unsatisfied and the respective annotation will be omitted from output.")
private String[] featureFilterDefinitions;
+ @ConfigurationParameter(name = PARAM_ALLOW_REGEX_FOR_FILTERS, mandatory = false, description = "Optional. If set to true, the filter values specified with the " + PARAM_FEATURE_FILTERS + " parameter are interpreted as regular expressions. The actual feature values are then matched by regular expression resolution instead of testing string equality.")
+ boolean allowRegexForFilters;
@ConfigurationParameter(name = PARAM_OUTPUT_FILE, description = "Output file to which all entity information is written in the format\n"
+ "docId EGID begin end confidence\n"
+ "Where the fields are separated by tab stops. If the file name ends with .gz, the output file will automatically be gzipped.")
@@ -157,13 +160,11 @@ private void addOffsetsColumn(JCas aJCas) {
}
private void addDocumentIdColumn(JCas aJCas) throws CASException {
- if (outputColumnNames.contains(DOCUMENT_ID_COLUMN)) {
- Column c = columns.get(DOCUMENT_ID_COLUMN);
- if (c == null)
- c = new Column(DOCUMENT_ID_COLUMN + ":" + Header.class.getCanonicalName() + "=/docId", null, aJCas.getTypeSystem());
- c = new DocumentIdColumn(c);
- columns.put(DOCUMENT_ID_COLUMN, c);
- }
+ Column c = columns.get(DOCUMENT_ID_COLUMN);
+ if (c == null)
+ c = new Column(DOCUMENT_ID_COLUMN + ":" + Header.class.getCanonicalName() + "=/docId", null, aJCas.getTypeSystem());
+ c = new DocumentIdColumn(c);
+ columns.put(DOCUMENT_ID_COLUMN, c);
}
private void addDocumentTextSha256Column() {
@@ -183,7 +184,7 @@ private void addSentenceIdColumn(JCas aJCas) throws CASException {
Column docIdColumn = columns.get(DOCUMENT_ID_COLUMN);
String documentId = null;
if (docIdColumn != null)
- documentId = docIdColumn.getValue(aJCas.getDocumentAnnotationFs(), aJCas).getFirst();
+ documentId = docIdColumn.getValue(null, aJCas).getFirst();
Type sentenceType = c.getSingleType();
// put all sentences into an index with an
// overlap-comparator - this way the index can be
@@ -249,10 +250,11 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept
super.initialize(aContext);
outputColumnNamesArray = (String[]) aContext.getConfigParameterValue(PARAM_OUTPUT_COLUMNS);
- columnDefinitionDescriptions = (String[]) aContext.getConfigParameterValue(PARAM_COLUMN_DEFINITIONS);
+ columnDefinitionDescriptions = Optional.ofNullable((String[]) aContext.getConfigParameterValue(PARAM_COLUMN_DEFINITIONS)).orElse(new String[0]);
typePrefix = (String) aContext.getConfigParameterValue(PARAM_TYPE_PREFIX);
featureFilterDefinitions = (String[]) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_FEATURE_FILTERS)).orElse(new String[0]);
+ allowRegexForFilters = (Boolean) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_ALLOW_REGEX_FOR_FILTERS)).orElse(false);
outputFilePath = (String) aContext.getConfigParameterValue(PARAM_OUTPUT_FILE);
appendThreadNameToOutputFile = Optional.ofNullable((Boolean) aContext.getConfigParameterValue(PARAM_APPEND_THREAD_NAME_TO_OUTPUT_FILE)).orElse(false);
entityTypeStrings = (String[]) aContext.getConfigParameterValue(PARAM_ENTITY_TYPES);
@@ -265,7 +267,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept
offsetMode = null == offsetModeStr ? OffsetMode.CharacterSpan : OffsetMode.valueOf(offsetModeStr);
if (null == offsetScopeStr) {
- offsetScope = outputColumnNames.contains(SENTENCE_ID_COLUMN) ? OffsetScope.Sentence : OffsetScope.Document;
+ offsetScope = OffsetScope.Document;
} else {
offsetScope = OffsetScope.valueOf(offsetScopeStr);
}
@@ -281,6 +283,7 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept
log.info("{}: {}", PARAM_OUTPUT_COLUMNS, outputColumnNames);
log.info("{}: {}", PARAM_COLUMN_DEFINITIONS, columnDefinitionDescriptions);
log.info("{}: {}", PARAM_FEATURE_FILTERS, featureFilterDefinitions);
+ log.info("{}: {}", PARAM_ALLOW_REGEX_FOR_FILTERS, allowRegexForFilters);
log.info("{}: {}", PARAM_ENTITY_TYPES, entityTypeStrings);
log.info("{}: {}", PARAM_TYPE_PREFIX, typePrefix);
log.info("{}: {}", PARAM_OUTPUT_FILE, outputFilePath);
@@ -329,7 +332,7 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException {
throw new IllegalArgumentException("No entity names are given, neither by the " + PARAM_ENTITY_TYPES + " parameter nor in the " + PARAM_COLUMN_DEFINITIONS + " parameter.");
removeSubsumedTypes(entityTypes, ts);
- featureFilters = Stream.of(featureFilterDefinitions).map(d -> new FeatureValueFilter(d, typePrefix, ts)).collect(Collectors.groupingBy(filter -> filter.getPathValuePair().fp.getFeaturePath()));
+ featureFilters = Stream.of(featureFilterDefinitions).map(d -> new FeatureValueFilter(d, typePrefix, ts, allowRegexForFilters)).collect(Collectors.groupingBy(filter -> filter.getPathValuePair().fp.getFeaturePath()));
addDocumentIdColumn(aJCas);
addDocumentTextSha256Column();
diff --git a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/FeatureValueFilter.java b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/FeatureValueFilter.java
index c84ba2ade..25a1a25d2 100644
--- a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/FeatureValueFilter.java
+++ b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/FeatureValueFilter.java
@@ -17,6 +17,7 @@
import java.util.Collections;
import java.util.Set;
+import java.util.function.BiFunction;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@@ -27,10 +28,12 @@ public class FeatureValueFilter {
protected Set types;
protected PathValuePair pathValuePair;
private Matcher mfull;
+ private BiFunction featureValueMatchTest;
- public FeatureValueFilter(String columnDefinition, String typePrefix, TypeSystem ts) {
+ public FeatureValueFilter(String columnDefinition, String typePrefix, TypeSystem ts, boolean allowRegexForFilters) {
this();
parseAndAddDefinition(columnDefinition, typePrefix, ts);
+ featureValueMatchTest = allowRegexForFilters ? String::matches : String::equals;
}
public FeatureValueFilter() {
@@ -60,7 +63,7 @@ public boolean contradictsFeatureFilter(TOP a) {
return false;
String fpValue = pathValuePair.fp.getValueAsString(a);
if (fpValue != null)
- return pathValuePair.targetValue == null || !fpValue.equals(pathValuePair.targetValue);
+ return pathValuePair.targetValue == null || !featureValueMatchTest.apply(fpValue, pathValuePair.targetValue);
return pathValuePair.targetValue != null;
}
diff --git a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/OffsetsColumn.java b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/OffsetsColumn.java
index 44d08b055..0b5c599d5 100644
--- a/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/OffsetsColumn.java
+++ b/jcore-julielab-entity-evaluator-consumer/src/main/java/de/julielab/jcore/consumer/entityevaluator/OffsetsColumn.java
@@ -15,11 +15,13 @@
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.TOP;
import org.apache.uima.jcas.tcas.Annotation;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.util.*;
public class OffsetsColumn extends Column {
-
+private final static Logger log = LoggerFactory.getLogger(OffsetsColumn.class);
private OffsetMode offsetMode;
private JCoReTreeMapAnnotationIndex sentenceIndex;
private OffsetScope offsetScope;
@@ -61,9 +63,14 @@ public Deque getValue(TOP a, JCas aJCas) {
if (offsetScope == OffsetScope.Sentence) {
Annotation s = sentenceIndex.get(an);
- if (this.offsetMode == OffsetMode.NonWsCharacters)
- numWsMap = getNumWsMapForSentence(s);
- annotationOffset = s.getBegin();
+ if (s != null) {
+ if (this.offsetMode == OffsetMode.NonWsCharacters)
+ numWsMap = getNumWsMapForSentence(s);
+ annotationOffset = s.getBegin();
+ } else {
+ log.warn("There was no sentence for annotation {}, returning begin offset as -1.", an);
+ annotationOffset = -1;
+ }
}
final String offsets = getOffsets(an, numWsMap, annotationOffset);
diff --git a/jcore-julielab-entity-evaluator-consumer/src/main/resources/de/julielab/jcore/consumer/entityevaluator/desc/jcore-julielab-entity-evaluator-consumer.xml b/jcore-julielab-entity-evaluator-consumer/src/main/resources/de/julielab/jcore/consumer/entityevaluator/desc/jcore-julielab-entity-evaluator-consumer.xml
index 4ffda6700..f46b9c244 100644
--- a/jcore-julielab-entity-evaluator-consumer/src/main/resources/de/julielab/jcore/consumer/entityevaluator/desc/jcore-julielab-entity-evaluator-consumer.xml
+++ b/jcore-julielab-entity-evaluator-consumer/src/main/resources/de/julielab/jcore/consumer/entityevaluator/desc/jcore-julielab-entity-evaluator-consumer.xml
@@ -6,7 +6,7 @@
JCoRe Entity Evaluator and TSV ConsumerThis component was originally created to output the tab separated format used the JULIE Entity Evaluator. However, this component can be used to create a TSV file from any annotation or annotation set. The component allows to define columns by specifying the annotation type to draw feature values from and a feature path that specifies the location of the desired feature. All feature paths will be applied to each configured annotation, returning null values if an annotation does not exhibit a value for a column's feature path.
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab Jena, Germany
@@ -58,6 +58,13 @@
truefalse
+
+ AllowRegexForFilters
+ Optional. If set to true, the filter values specified with the FeatureFilters parameter are interpreted as regular expressions. The actual feature values are than matched by regular expression resolution instead of testing string equality.
+ Boolean
+ false
+ false
+ OutputFileOutput file to which all entity information is written in the format
diff --git a/jcore-julielab-entity-evaluator-consumer/src/test/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumerTest.java b/jcore-julielab-entity-evaluator-consumer/src/test/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumerTest.java
index 69010da56..b50a25edd 100644
--- a/jcore-julielab-entity-evaluator-consumer/src/test/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumerTest.java
+++ b/jcore-julielab-entity-evaluator-consumer/src/test/java/de/julielab/jcore/consumer/entityevaluator/EntityEvaluatorConsumerTest.java
@@ -15,6 +15,7 @@
import de.julielab.jcore.types.pubmed.ManualDescriptor;
import de.julielab.jcore.utility.JCoReTools;
import org.apache.commons.codec.binary.Base64;
+import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.JCasFactory;
@@ -22,7 +23,7 @@
import org.apache.uima.jcas.cas.DoubleArray;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.cas.StringArray;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import java.io.ByteArrayInputStream;
import java.io.File;
@@ -34,16 +35,14 @@
import java.util.zip.GZIPInputStream;
import static de.julielab.jcore.consumer.entityevaluator.EntityEvaluatorConsumer.*;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
public class EntityEvaluatorConsumerTest {
@Test
public void testEntityEvaluatorConsumerSingleEntity() throws Exception {
- JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types",
- "de.julielab.jcore.types.jcore-semantics-biology-types",
- "de.julielab.jcore.types.jcore-document-meta-types");
+ JCas jcas = getjCas();
AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class,
PARAM_COLUMN_DEFINITIONS,
new String[] { DOCUMENT_ID_COLUMN + ": Header = /docId",
@@ -75,11 +74,49 @@ public void testEntityEvaluatorConsumerSingleEntity() throws Exception {
assertEquals("document1 document1:0 23 gene", lines.get(0));
}
+ private JCas getjCas() throws UIMAException {
+ return JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types",
+ "de.julielab.jcore.types.jcore-semantics-biology-types",
+ "de.julielab.jcore.types.jcore-document-meta-pubmed-types");
+ }
+
+ @Test
+ public void testEntityEvaluatorConsumerSingleEntity2() throws Exception {
+ // The same test as above but minus the DocumentId column
+ JCas jcas = getjCas();
+ AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class,
+ PARAM_COLUMN_DEFINITIONS,
+ new String[] { "geneid:Gene=/resourceEntryList[0]/entryId", "name:/:coveredText()" },
+ // We here use the default SentenceId column, we did not provide a definition!
+ PARAM_OUTPUT_COLUMNS, new String[] { SENTENCE_ID_COLUMN, "geneid", "name" },
+ PARAM_TYPE_PREFIX, "de.julielab.jcore.types", PARAM_OUTPUT_FILE, "src/test/resources/outfile-test.tsv");
+
+ jcas.setDocumentText("One gene one sentence.");
+ Header h = new Header(jcas);
+ h.setDocId("document1");
+ h.addToIndexes();
+ Sentence s = new Sentence(jcas, 0, jcas.getDocumentText().length());
+ s.setId("sentence1");
+ s.addToIndexes();
+ Gene g = new Gene(jcas, 4, 8);
+ GeneResourceEntry re = new GeneResourceEntry(jcas);
+ re.setEntryId("23");
+ FSArray array = new FSArray(jcas, 1);
+ array.set(0, re);
+ g.setResourceEntryList(array);
+ g.addToIndexes();
+
+ consumer.process(jcas.getCas());
+ consumer.collectionProcessComplete();
+
+ List lines = Files.readLines(new File("src/test/resources/outfile-test.tsv"), Charset.forName("UTF-8"));
+ assertEquals(1, lines.size());
+ assertEquals("document1:0 23 gene", lines.get(0));
+ }
+
@Test
public void testEntityEvaluatorConsumerNoEntities() throws Exception {
- JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types",
- "de.julielab.jcore.types.jcore-semantics-biology-types",
- "de.julielab.jcore.types.jcore-document-meta-types");
+ JCas jcas = getjCas();
AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class,
PARAM_COLUMN_DEFINITIONS,
new String[] { DOCUMENT_ID_COLUMN + ": Header = /docId",
@@ -107,9 +144,7 @@ public void testEntityEvaluatorConsumerNoEntities() throws Exception {
@Test
public void testEntityEvaluatorConsumerSingleEntityDocumentTextHash() throws Exception {
- JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types",
- "de.julielab.jcore.types.jcore-semantics-biology-types",
- "de.julielab.jcore.types.jcore-document-meta-types");
+ JCas jcas = getjCas();
AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class,
PARAM_COLUMN_DEFINITIONS,
new String[] {
@@ -143,9 +178,7 @@ public void testEntityEvaluatorConsumerSingleEntityDocumentTextHash() throws Exc
@Test
public void testEntityEvaluatorConsumerMultipleEntities() throws Exception {
- JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types",
- "de.julielab.jcore.types.jcore-semantics-biology-types",
- "de.julielab.jcore.types.jcore-document-meta-types");
+ JCas jcas = getjCas();
AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class,
PARAM_COLUMN_DEFINITIONS,
new String[] { SENTENCE_ID_COLUMN + ": Sentence=/id",
@@ -180,9 +213,7 @@ public void testEntityEvaluatorConsumerMultipleEntities() throws Exception {
@Test
public void testEntityEvaluatorConsumerSingleEntityNoWSOffsets() throws Exception {
- JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types",
- "de.julielab.jcore.types.jcore-semantics-biology-types",
- "de.julielab.jcore.types.jcore-document-meta-types");
+ JCas jcas = getjCas();
AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class,
PARAM_COLUMN_DEFINITIONS,
new String[] { DOCUMENT_ID_COLUMN + ": Header = /docId", SENTENCE_ID_COLUMN + ": Sentence=/id",
@@ -219,9 +250,7 @@ public void testEntityEvaluatorConsumerSuperType() throws Exception {
// other, e.g. EntityMention and Gene, then we don't want to traverse
// the subsumed types on their own. They are contained in the annotation
// index of their super type.
- JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types",
- "de.julielab.jcore.types.jcore-semantics-biology-types",
- "de.julielab.jcore.types.jcore-document-meta-types");
+ JCas jcas = getjCas();
AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class,
PARAM_COLUMN_DEFINITIONS,
new String[] { DOCUMENT_ID_COLUMN + ": Header = /docId", SENTENCE_ID_COLUMN + ": Sentence=/id",
@@ -261,23 +290,21 @@ public void testCreateNonWsOffsetMap() throws Exception {
TreeMap numWsMap = (TreeMap) method.invoke(null, "one two three");
// first check the actual map entries (after each white space position
// there should be an entry)
- assertEquals(new Integer(0), numWsMap.get(0));
- assertEquals(new Integer(1), numWsMap.get(4));
- assertEquals(new Integer(2), numWsMap.get(8));
+ assertEquals(Integer.valueOf(0), numWsMap.get(0));
+ assertEquals(Integer.valueOf(1), numWsMap.get(4));
+ assertEquals(Integer.valueOf(2), numWsMap.get(8));
// now check the intended use; using the floor element, we should be
// able to the correct value even for those positions we don't have an
// explicit mapping for
- assertEquals(new Integer(0), numWsMap.floorEntry(2).getValue());
- assertEquals(new Integer(1), numWsMap.floorEntry(5).getValue());
- assertEquals(new Integer(2), numWsMap.floorEntry(11).getValue());
+ assertEquals(Integer.valueOf(0), numWsMap.floorEntry(2).getValue());
+ assertEquals(Integer.valueOf(1), numWsMap.floorEntry(5).getValue());
+ assertEquals(Integer.valueOf(2), numWsMap.floorEntry(11).getValue());
}
@Test
public void testEntityEvaluatorConsumerFeatureFilter() throws Exception {
- JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types",
- "de.julielab.jcore.types.jcore-semantics-biology-types",
- "de.julielab.jcore.types.jcore-document-meta-types");
+ JCas jcas = getjCas();
AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class,
PARAM_COLUMN_DEFINITIONS,
new String[] { DOCUMENT_ID_COLUMN + ": Header = /docId", SENTENCE_ID_COLUMN + ": Sentence=/id",
@@ -320,6 +347,53 @@ public void testEntityEvaluatorConsumerFeatureFilter() throws Exception {
assertEquals("document1 document1:0 42 One", lines.get(0));
}
+ @Test
+ public void testEntityEvaluatorConsumerFeatureFilterRegEx() throws Exception {
+ JCas jcas = getjCas();
+ AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class,
+ PARAM_COLUMN_DEFINITIONS,
+ new String[] { DOCUMENT_ID_COLUMN + ": Header = /docId", SENTENCE_ID_COLUMN + ": Sentence=/id",
+ "genetype:Gene=/specificType", "name:/:coveredText()" },
+ PARAM_OUTPUT_COLUMNS, new String[] { DOCUMENT_ID_COLUMN, SENTENCE_ID_COLUMN, "genetype", "name" },
+ PARAM_TYPE_PREFIX, "de.julielab.jcore.types", PARAM_OUTPUT_FILE, "src/test/resources/outfile-test.tsv",
+ PARAM_FEATURE_FILTERS, new String[] { "Gene:/specificType=Group[3-4]{2,3}s?" },
+ PARAM_ALLOW_REGEX_FOR_FILTERS, true);
+
+ jcas.setDocumentText("One gene one sentence.");
+ Header h = new Header(jcas);
+ h.setDocId("document1");
+ h.addToIndexes();
+ Sentence s = new Sentence(jcas, 0, jcas.getDocumentText().length());
+ s.setId("sentence1");
+ s.addToIndexes();
+ {
+ Gene g = new Gene(jcas, 4, 8);
+ // should not pass filter
+ g.setSpecificType("Group123");
+ g.addToIndexes();
+ }
+ {
+ Gene g = new Gene(jcas, 0, 3);
+ // should pass filter
+ g.setSpecificType("Group33s");
+ g.addToIndexes();
+ }
+ {
+ Gene g = new Gene(jcas, 0, 3);
+ // should pass filter
+ g.setSpecificType("Group344");
+ g.addToIndexes();
+ }
+
+ consumer.process(jcas.getCas());
+ consumer.collectionProcessComplete();
+
+ List lines = Files.readLines(new File("src/test/resources/outfile-test.tsv"), Charset.forName("UTF-8"));
+ assertEquals(2, lines.size());
+ assertEquals("document1 document1:0 Group33s One", lines.get(0));
+ assertEquals("document1 document1:0 Group344 One", lines.get(1));
+ }
+
@Test
public void testParallelMultiValues() throws Exception {
JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types",
@@ -367,9 +441,7 @@ public void testParallelMultiValues() throws Exception {
@Test
public void testCartesianMultiValues() throws Exception {
- JCas jcas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-semantics-mention-types",
- "de.julielab.jcore.types.jcore-semantics-biology-types",
- "de.julielab.jcore.types.jcore-document-meta-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types");
+ JCas jcas = getjCas();
AnalysisEngine consumer = AnalysisEngineFactory.createEngine(EntityEvaluatorConsumer.class,
PARAM_COLUMN_DEFINITIONS,
new String[] {
diff --git a/jcore-likelihood-assignment-ae/component.meta b/jcore-likelihood-assignment-ae/component.meta
index 671dbf79e..f73f0297a 100644
--- a/jcore-likelihood-assignment-ae/component.meta
+++ b/jcore-likelihood-assignment-ae/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-likelihood-assignment-ae",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe Likelihood Assignment AE"
}
diff --git a/jcore-likelihood-assignment-ae/pom.xml b/jcore-likelihood-assignment-ae/pom.xml
index e49c1a243..d28f1775b 100644
--- a/jcore-likelihood-assignment-ae/pom.xml
+++ b/jcore-likelihood-assignment-ae/pom.xml
@@ -10,7 +10,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0
@@ -27,14 +27,19 @@
de.julielabjcore-descriptor-creator
+
+ de.julielab
+ jcore-utilities
+ ${jcore-utilities-version}
+ de.julielabjcore-types${jcore-types-version}
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-engineJCoRe Likelihood Assignment AE
diff --git a/jcore-likelihood-assignment-ae/src/main/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotator.java b/jcore-likelihood-assignment-ae/src/main/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotator.java
index 622c6cded..4c31a62f9 100644
--- a/jcore-likelihood-assignment-ae/src/main/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotator.java
+++ b/jcore-likelihood-assignment-ae/src/main/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotator.java
@@ -1,209 +1,298 @@
-
package de.julielab.jcore.ae.likelihoodassignment;
import de.julielab.jcore.types.ConceptMention;
import de.julielab.jcore.types.LikelihoodIndicator;
import de.julielab.jcore.types.Sentence;
+import de.julielab.jcore.utility.JCoReAnnotationIndexMerger;
+import de.julielab.jcore.utility.JCoReAnnotationTools;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.fit.descriptor.TypeCapability;
+import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.NavigableMap;
-import java.util.TreeMap;
+import java.util.*;
-@ResourceMetaData(name="JCoRe Likelihood Assignment AE", description = "Analysis Engine to assign likelihood indicators to their corresponding entities and events.")
-@TypeCapability(inputs="de.julielab.jcore.types.LikelihoodIndicator")
+@ResourceMetaData(name = "JCoRe Likelihood Assignment AE", description = "Analysis Engine to assign likelihood indicators to their corresponding entities and events.")
+@TypeCapability(inputs = "de.julielab.jcore.types.LikelihoodIndicator")
public class LikelihoodAssignmentAnnotator extends JCasAnnotator_ImplBase {
- private static final Logger LOGGER = LoggerFactory
- .getLogger(LikelihoodAssignmentAnnotator.class);
-
- /**
- * Maps sentence ends to sentence begins.
- */
- private TreeMap sentMap;
- /**
- * Maps concept mentions to their begins.
- */
- private TreeMap> conceptMap;
- /**
- * Maps likelihood indicators to their begins.
- */
- private TreeMap likelihoodMap;
-
- /**
- * Quantifies likelihood values.
- */
- private HashMap likelihoodValueMap;
-
- public void initialize(UimaContext aContext)
- throws ResourceInitializationException {
- super.initialize(aContext);
-
- // ordinal scale for likelihood indicators;
- // used when there are multiple occurrences (the lowest category is
- // chosen)
- likelihoodValueMap = new HashMap<>();
- likelihoodValueMap.put("negation", 1);
- likelihoodValueMap.put("low", 2);
- likelihoodValueMap.put("investigation", 3);
- likelihoodValueMap.put("moderate", 4);
- likelihoodValueMap.put("high", 5);
- }
-
- @Override
- public void process(JCas aJCas) throws AnalysisEngineProcessException {
- assignLikelihood(aJCas);
- }
-
- /**
- * If a sentence contains a likelihood indicator, this indicator is assigned
- * to all concept mentions occurring in the sentence. If a sentence does not
- * contain a likelihood indicator, the default likelihood category (i.e.
- * 'assertion') is assigned to all concept mentions occurring in the
- * sentence. In case of multiple likelihood indicators the lowest likelihood
- * category is chosen.
- *
- * @param aJCas
- */
- private void assignLikelihood(JCas aJCas) {
- buildTreeMaps(aJCas);
-
- // create default likelihood indicator for assertions (has begin = 0 and
- // end = 0)
- LikelihoodIndicator assertionIndicator = new LikelihoodIndicator(aJCas);
- assertionIndicator.setLikelihood("assertion");
- assertionIndicator.setComponentId(this.getClass().getName());
- assertionIndicator.addToIndexes();
-
- // iterate over sentences
- for (int sentBegin : sentMap.keySet()) {
- int sentEnd = sentMap.get(sentBegin);
- boolean sentHasLikelihood = false;
- boolean multipleLikelihood = false;
- Integer firstLikelihoodBegin = 0;
- Integer lastLikelihoodBegin = 0;
-
- // determine whether the sentence contains a likelihood indicator at
- // all and whether it even contains multiple likelihood indicators
- firstLikelihoodBegin = likelihoodMap.ceilingKey(sentBegin);
- if (firstLikelihoodBegin != null) {
- if (firstLikelihoodBegin > sentEnd) {
- sentHasLikelihood = false;
- } else {
- sentHasLikelihood = true;
- }
- }
- if (sentHasLikelihood == true) {
- lastLikelihoodBegin = likelihoodMap.floorKey(sentEnd);
- if (firstLikelihoodBegin == lastLikelihoodBegin) {
- multipleLikelihood = false;
- } else {
- multipleLikelihood = true;
- }
- }
-
- // determine which likelihood category to assign to concept mentions
- // in the sentence and create the corresponding likelihood indicator
- LikelihoodIndicator assignedLikelihood = null;
- if (sentHasLikelihood == true) {
- if (multipleLikelihood = true) {
- // determine the lowest likelihood category in the sentence
- NavigableMap likelihoodSubMap = likelihoodMap
- .subMap(firstLikelihoodBegin, true,
- lastLikelihoodBegin, true);
- int currentLikelihoodValue = 100;
- for (int i : likelihoodSubMap.keySet()) {
- LikelihoodIndicator likelihood = likelihoodSubMap
- .get(i);
- String likelihoodCat = likelihood.getLikelihood();
- int likelihoodValue = likelihoodValueMap
- .get(likelihoodCat);
- if (likelihoodValue < currentLikelihoodValue) {
- assignedLikelihood = likelihood;
- currentLikelihoodValue = likelihoodValue;
- }
- }
- } else {
- LikelihoodIndicator likelihood = likelihoodMap
- .get(firstLikelihoodBegin);
- assignedLikelihood = likelihood;
- }
- } else {
- assignedLikelihood = assertionIndicator;
- }
-
- // get all events in the sentence and assign the corresponding
- // likelihood indicator
- if (conceptMap.ceilingKey(sentBegin) != null) {
- int firstConceptBegin = conceptMap.ceilingKey(sentBegin);
- if (firstConceptBegin > sentEnd) {
- continue;
- } else {
- int lastConceptBegin = conceptMap.floorKey(sentEnd);
- NavigableMap> conceptSubMap = conceptMap
- .subMap(firstConceptBegin, true, lastConceptBegin,
- true);
- for (int i : conceptSubMap.keySet()) {
- ArrayList conceptList = conceptSubMap
- .get(i);
- for (ConceptMention concept : conceptList) {
- concept.setLikelihood(assignedLikelihood);
- }
- }
- }
- }
- }
- }
-
- @SuppressWarnings("rawtypes")
- public void buildTreeMaps(JCas aJCas) {
- FSIterator sentIt = aJCas.getAnnotationIndex(Sentence.type).iterator();
- FSIterator conceptIt = aJCas.getAnnotationIndex(ConceptMention.type)
- .iterator();
- FSIterator likelihoodIt = aJCas.getAnnotationIndex(
- LikelihoodIndicator.type).iterator();
-
- sentMap = new TreeMap();
- while (sentIt.hasNext()) {
- Sentence sent = (Sentence) sentIt.next();
- int sentBegin = sent.getBegin();
- int sentEnd = sent.getEnd();
- sentMap.put(sentBegin, sentEnd);
- }
-
- conceptMap = new TreeMap>();
- while (conceptIt.hasNext()) {
- ConceptMention concept = (ConceptMention) conceptIt.next();
- int conceptBegin = concept.getBegin();
- if (conceptMap.containsKey(conceptBegin)) {
- ArrayList conceptList = conceptMap
- .get(conceptBegin);
- conceptList.add(concept);
- conceptMap.put(conceptBegin, conceptList);
- } else {
- ArrayList conceptList = new ArrayList();
- conceptList.add(concept);
- conceptMap.put(conceptBegin, conceptList);
- }
- }
-
- likelihoodMap = new TreeMap();
- while (likelihoodIt.hasNext()) {
- LikelihoodIndicator likelihood = (LikelihoodIndicator) likelihoodIt
- .next();
- int likelihoodBegin = likelihood.getBegin();
- likelihoodMap.put(likelihoodBegin, likelihood);
- }
- }
+ public static final String PARAM_ASSIGNMENT_STRATEGY = "AssignmentStrategy";
+ public static final String PARAM_CONCEPT_TYPE_NAME = "ConceptTypeName";
+ public static final String STRATEGY_ALL = "all";
+ public static final String STRATEGY_NEXT_CONCEPT = "next-concept";
+ private static final Logger LOGGER = LoggerFactory
+ .getLogger(LikelihoodAssignmentAnnotator.class);
+ @ConfigurationParameter(name = PARAM_ASSIGNMENT_STRATEGY, mandatory = false, defaultValue = STRATEGY_NEXT_CONCEPT, description = "There are two available assignment strategies for likelihood indicators to ConceptMentions, '" + STRATEGY_ALL + "' and '" + STRATEGY_NEXT_CONCEPT + "'. The first, 'all', assigns the lowest likelihood indicator in a sentence to all ConceptMention in this sentence. The second assigns a likelihood indicator only to the directly following ConceptMention in the same sentence. The latter strategy fares a bit better in evaluations carried out for the publication of this approach. Defaults to '" + STRATEGY_NEXT_CONCEPT + "'.")
+ private String assignmentStrategy;
+ @ConfigurationParameter(name = PARAM_CONCEPT_TYPE_NAME, mandatory = false, defaultValue = "de.julielab.jcore.types.ConceptMention", description = "The qualified UIMA type name for the concept annotation for which likelihood assignment should be performed. Must be a subclass of de.julielab.jcore.types.ConceptMention. Defaults to de.julielab.jcore.types.ConceptMention.")
+ private String conceptTypeName;
+ /**
+ * Maps sentence ends to sentence begins.
+ */
+ private TreeMap sentMap;
+ /**
+ * Maps concept mentions to their begins.
+ */
+ private TreeMap> conceptMap;
+ /**
+ * Maps likelihood indicators to their begins.
+ */
+ private TreeMap likelihoodMap;
+
+ /**
+ * Quantifies likelihood values.
+ */
+ private HashMap likelihoodValueMap;
+ private ConceptMention conceptTypeTemplate;
+
+ public void initialize(UimaContext aContext)
+ throws ResourceInitializationException {
+ super.initialize(aContext);
+
+ assignmentStrategy = (String) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_ASSIGNMENT_STRATEGY)).orElse("next-concept");
+ conceptTypeName = (String) Optional.ofNullable(aContext.getConfigParameterValue(PARAM_CONCEPT_TYPE_NAME)).orElse(ConceptMention.class.getCanonicalName());
+
+ // ordinal scale for likelihood indicators;
+ // used when there are multiple occurrences (the lowest category is
+ // chosen)
+ likelihoodValueMap = new HashMap<>();
+ likelihoodValueMap.put("negation", 1);
+ likelihoodValueMap.put("low", 2);
+ likelihoodValueMap.put("investigation", 3);
+ likelihoodValueMap.put("moderate", 4);
+ likelihoodValueMap.put("high", 5);
+ }
+
+ @Override
+ public void process(JCas aJCas) throws AnalysisEngineProcessException {
+ if (conceptTypeTemplate == null) {
+ try {
+ conceptTypeTemplate = (ConceptMention) JCoReAnnotationTools.getAnnotationByClassName(aJCas, conceptTypeName);
+ } catch (Exception e) {
+ LOGGER.error("Could not obtain the specified concept UIMA type with name " + conceptTypeName + ".", e);
+ throw new AnalysisEngineProcessException(e);
+ }
+ }
+ // We have two strategies available for the assignment of likelihood indicators to ConceptMentions.
+ // Either the original one, implemented in 'assignLikelihood', where likelihood indicators in a sentence are
+ // assigned to all ConceptMentions in the same sentence or a simplified one that, according to
+ // Christine Engelmann, actually fared a bit better in evaluations, where a likelihood indicator is only
+ // assigned to the next following ConceptMention, implemented in 'assignLikelihoodToNextConceptMention'.
+ if (assignmentStrategy.equalsIgnoreCase(STRATEGY_NEXT_CONCEPT))
+ assignLikelihoodToNextConceptMention(aJCas);
+ else if (assignmentStrategy.equalsIgnoreCase(STRATEGY_ALL))
+ assignLikelihood(aJCas);
+ else
+ throw new AnalysisEngineProcessException(new IllegalArgumentException("The " + PARAM_ASSIGNMENT_STRATEGY + " parameter requires one of two values, " + STRATEGY_ALL + " or " + STRATEGY_NEXT_CONCEPT + " but was set to " + assignmentStrategy + "."));
+ }
+
+ /**
+ *
Simple assignment strategy that sets the direct nearest previous likelihood indicator to each ConceptMention.
+ *
No other ConceptMention must stand in between because then, a previous ConceptMention would be assigned the
+ * likelihood indicator.
+ *
This strategy was proposed by Christine Engelmann because it fared a bit better in her evaluations than
+ * the alternative strategy implemented in {@link #assignLikelihood(JCas)}.
+ *
+ * @param aJCas The CAS to do likelihood assignment in.
+ * @throws AnalysisEngineProcessException If the creation of the {@link JCoReAnnotationIndexMerger}, that is used internally, fails.
+ */
+ private void assignLikelihoodToNextConceptMention(JCas aJCas) throws AnalysisEngineProcessException {
+ // create default likelihood indicator for assertions (has begin = 0 and
+ // end = 0)
+ LikelihoodIndicator assertionIndicator = new LikelihoodIndicator(aJCas);
+ assertionIndicator.setLikelihood("assertion");
+ assertionIndicator.setComponentId(this.getClass().getName());
+ assertionIndicator.addToIndexes();
+
+ for (Sentence sentence : aJCas.getAnnotationIndex(Sentence.type)) {
+ // We use the annotation merger that gives us a sorted sequence of annotations of specified types.
+ // Then, we must only assign for each concept the directly preceding likelihood annotation, if there is one.
+ JCoReAnnotationIndexMerger merger;
+ try {
+ merger = new JCoReAnnotationIndexMerger(Set.of(JCasUtil.getAnnotationType(aJCas, conceptTypeTemplate.getClass()), JCasUtil.getAnnotationType(aJCas, LikelihoodIndicator.class)), true, sentence, aJCas);
+ } catch (ClassNotFoundException e) {
+ LOGGER.error("Could not create JCoReAnnotationIndexMerger", e);
+ throw new AnalysisEngineProcessException(e);
+ }
+ LikelihoodIndicator previousLikelihood = null;
+ boolean previousLikelihoodConsumed = false;
+ int lastAssignedCmBegin = 0;
+ int lastAssignedCmEnd = 0;
+ while (merger.incrementAnnotation()) {
+ final Annotation annotation = (Annotation) merger.getAnnotation();
+ ConceptMention cm = null;
+ if (conceptTypeTemplate.getClass().isAssignableFrom(annotation.getClass())) {
+ cm = (ConceptMention) annotation;
+ // default likelihood is assertion
+ cm.setLikelihood(assertionIndicator);
+ }
+ // check if there is a likelihood annotation preceding the ConceptMention in this sentence without
+ // another ConceptMention in between - except when multiple ConceptMentions exist in the same offsets
+ // which is possible for EventMentions that exist on the EventTrigger annotation. The trigger may
+ // refer to multiple events.
+ if (cm != null && (previousLikelihood != null && (!previousLikelihoodConsumed || (lastAssignedCmBegin == cm.getBegin() && lastAssignedCmEnd == cm.getEnd())))) {
+ cm.setLikelihood(previousLikelihood);
+ // this likelihood indicator has been "consumed"
+ previousLikelihoodConsumed = true;
+ lastAssignedCmBegin = cm.getBegin();
+ lastAssignedCmEnd = cm.getEnd();
+ }
+ if (annotation instanceof LikelihoodIndicator) {
+ previousLikelihood = (LikelihoodIndicator) annotation;
+ previousLikelihoodConsumed = false;
+ }
+ }
+ }
+ }
+
+ /**
+ * If a sentence contains a likelihood indicator, this indicator is assigned
+ * to all concept mentions occurring in the sentence. If a sentence does not
+ * contain a likelihood indicator, the default likelihood category (i.e.
+ * 'assertion') is assigned to all concept mentions occurring in the
+ * sentence. In case of multiple likelihood indicators the lowest likelihood
+ * category is chosen.
+ *
+ * @param aJCas
+ */
+ private void assignLikelihood(JCas aJCas) {
+ buildTreeMaps(aJCas);
+
+ // create default likelihood indicator for assertions (has begin = 0 and
+ // end = 0)
+ LikelihoodIndicator assertionIndicator = new LikelihoodIndicator(aJCas);
+ assertionIndicator.setLikelihood("assertion");
+ assertionIndicator.setComponentId(this.getClass().getName());
+ assertionIndicator.addToIndexes();
+
+ // iterate over sentences
+ for (int sentBegin : sentMap.keySet()) {
+ int sentEnd = sentMap.get(sentBegin);
+ boolean sentHasLikelihood = false;
+ boolean multipleLikelihood = false;
+ Integer firstLikelihoodBegin = 0;
+ Integer lastLikelihoodBegin = 0;
+
+ // determine whether the sentence contains a likelihood indicator at
+ // all and whether it even contains multiple likelihood indicators
+ firstLikelihoodBegin = likelihoodMap.ceilingKey(sentBegin);
+ if (firstLikelihoodBegin != null) {
+ if (firstLikelihoodBegin > sentEnd) {
+ sentHasLikelihood = false;
+ } else {
+ sentHasLikelihood = true;
+ }
+ }
+ if (sentHasLikelihood == true) {
+ lastLikelihoodBegin = likelihoodMap.floorKey(sentEnd);
+ if (firstLikelihoodBegin == lastLikelihoodBegin) {
+ multipleLikelihood = false;
+ } else {
+ multipleLikelihood = true;
+ }
+ }
+
+ // determine which likelihood category to assign to concept mentions
+ // in the sentence and create the corresponding likelihood indicator
+ LikelihoodIndicator assignedLikelihood = null;
+ if (sentHasLikelihood == true) {
+ if (multipleLikelihood == true) {
+ // determine the lowest likelihood category in the sentence
+ NavigableMap likelihoodSubMap = likelihoodMap
+ .subMap(firstLikelihoodBegin, true,
+ lastLikelihoodBegin, true);
+ int currentLikelihoodValue = 100;
+ for (int i : likelihoodSubMap.keySet()) {
+ LikelihoodIndicator likelihood = likelihoodSubMap
+ .get(i);
+ String likelihoodCat = likelihood.getLikelihood();
+ int likelihoodValue = likelihoodValueMap
+ .get(likelihoodCat);
+ if (likelihoodValue < currentLikelihoodValue) {
+ assignedLikelihood = likelihood;
+ currentLikelihoodValue = likelihoodValue;
+ }
+ }
+ } else {
+ LikelihoodIndicator likelihood = likelihoodMap
+ .get(firstLikelihoodBegin);
+ assignedLikelihood = likelihood;
+ }
+ } else {
+ assignedLikelihood = assertionIndicator;
+ }
+
+ // get all events in the sentence and assign the corresponding
+ // likelihood indicator
+ if (conceptMap.ceilingKey(sentBegin) != null) {
+ int firstConceptBegin = conceptMap.ceilingKey(sentBegin);
+ if (firstConceptBegin > sentEnd) {
+ continue;
+ } else {
+ int lastConceptBegin = conceptMap.floorKey(sentEnd);
+ NavigableMap> conceptSubMap = conceptMap
+ .subMap(firstConceptBegin, true, lastConceptBegin,
+ true);
+ for (int i : conceptSubMap.keySet()) {
+ ArrayList conceptList = conceptSubMap
+ .get(i);
+ for (ConceptMention concept : conceptList) {
+ concept.setLikelihood(assignedLikelihood);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ @SuppressWarnings("rawtypes")
+ public void buildTreeMaps(JCas aJCas) {
+ FSIterator sentIt = aJCas.getAnnotationIndex(Sentence.type).iterator();
+ FSIterator conceptIt = aJCas.getAnnotationIndex(conceptTypeTemplate.type)
+ .iterator();
+ FSIterator likelihoodIt = aJCas.getAnnotationIndex(
+ LikelihoodIndicator.type).iterator();
+
+ sentMap = new TreeMap<>();
+ while (sentIt.hasNext()) {
+ Sentence sent = (Sentence) sentIt.next();
+ int sentBegin = sent.getBegin();
+ int sentEnd = sent.getEnd();
+ sentMap.put(sentBegin, sentEnd);
+ }
+
+ conceptMap = new TreeMap<>();
+ while (conceptIt.hasNext()) {
+ ConceptMention concept = (ConceptMention) conceptIt.next();
+ int conceptBegin = concept.getBegin();
+ if (conceptMap.containsKey(conceptBegin)) {
+ ArrayList conceptList = conceptMap
+ .get(conceptBegin);
+ conceptList.add(concept);
+ conceptMap.put(conceptBegin, conceptList);
+ } else {
+ ArrayList conceptList = new ArrayList<>();
+ conceptList.add(concept);
+ conceptMap.put(conceptBegin, conceptList);
+ }
+ }
+
+ likelihoodMap = new TreeMap<>();
+ while (likelihoodIt.hasNext()) {
+ LikelihoodIndicator likelihood = (LikelihoodIndicator) likelihoodIt
+ .next();
+ int likelihoodBegin = likelihood.getBegin();
+ likelihoodMap.put(likelihoodBegin, likelihood);
+ }
+ }
}
diff --git a/jcore-likelihood-assignment-ae/src/main/resources/de/julielab/jcore/ae/likelihoodassignment/desc/jcore-likelihood-assignment-ae.xml b/jcore-likelihood-assignment-ae/src/main/resources/de/julielab/jcore/ae/likelihoodassignment/desc/jcore-likelihood-assignment-ae.xml
index 14bc6f60a..2db5339a6 100644
--- a/jcore-likelihood-assignment-ae/src/main/resources/de/julielab/jcore/ae/likelihoodassignment/desc/jcore-likelihood-assignment-ae.xml
+++ b/jcore-likelihood-assignment-ae/src/main/resources/de/julielab/jcore/ae/likelihoodassignment/desc/jcore-likelihood-assignment-ae.xml
@@ -6,8 +6,23 @@
JCoRe Likelihood Assignment AEAnalysis Engine to assign likelihood indicators to their corresponding entities and events.
- 2.5.1-SNAPSHOT
-
+ 2.6.0
+
+
+ AssignmentStrategy
+ There are two available assignment strategies for likelihood indicators to ConceptMentions, 'all' and 'next-concept'. The first, 'all', assigns the lowest likelihood indicator in a sentence to all ConceptMention in this sentence. The second assigns a likelihood indicator only to the directly following ConceptMention in the same sentence. The latter strategy fares a bit better in evaluations carried out for the publication of this approach. Defaults to 'next-concept'."
+ String
+ false
+ false
+
+
+ ConceptTypeName
+ The qualified UIMA type name for the concept annotation for which likelihood assignment should be performed. Must be a subclass of de.julielab.jcore.types.ConceptMention. Defaults to de.julielab.jcore.types.ConceptMention.
+ String
+ false
+ false
+
+
diff --git a/jcore-likelihood-assignment-ae/src/test/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotatorTest.java b/jcore-likelihood-assignment-ae/src/test/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotatorTest.java
index 5caf84f55..34861be6c 100644
--- a/jcore-likelihood-assignment-ae/src/test/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotatorTest.java
+++ b/jcore-likelihood-assignment-ae/src/test/java/de/julielab/jcore/ae/likelihoodassignment/LikelihoodAssignmentAnnotatorTest.java
@@ -1,25 +1,21 @@
package de.julielab.jcore.ae.likelihoodassignment;
-import de.julielab.jcore.types.ConceptMention;
-import de.julielab.jcore.types.LikelihoodIndicator;
-import de.julielab.jcore.types.Sentence;
+import de.julielab.jcore.types.*;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.JFSIndexRepository;
import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.resource.ResourceSpecifier;
import org.apache.uima.util.InvalidXMLException;
-import org.apache.uima.util.XMLInputSource;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Iterator;
-import static org.junit.Assert.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
/**
@@ -71,10 +67,7 @@ public void initCas(JCas aJCas) {
@Test
@SuppressWarnings({ "rawtypes"})
public void testProcess() throws ResourceInitializationException, IOException, InvalidXMLException {
-
- XMLInputSource assignmentXML = null;
- ResourceSpecifier assignmentSpec = null;
- AnalysisEngine assignmentAnnotator = AnalysisEngineFactory.createEngine(DESCRIPTOR);
+ AnalysisEngine assignmentAnnotator = AnalysisEngineFactory.createEngine(DESCRIPTOR, LikelihoodAssignmentAnnotator.PARAM_ASSIGNMENT_STRATEGY, LikelihoodAssignmentAnnotator.STRATEGY_ALL);
JCas aJCas = null;
try {
@@ -119,4 +112,76 @@ public String getPredictedAssignments(Iterator conceptIter) {
return conceptLikelihood;
}
+
+ @Test
+ public void testAssignNextStrategy() throws Exception {
+ AnalysisEngine assignmentAnnotator = AnalysisEngineFactory.createEngine(DESCRIPTOR, LikelihoodAssignmentAnnotator.PARAM_ASSIGNMENT_STRATEGY, LikelihoodAssignmentAnnotator.STRATEGY_NEXT_CONCEPT);
+ final JCas jCas = assignmentAnnotator.newJCas();
+ jCas.setDocumentText("Our data suggest that it is highly probable that the interaction occurred, however not the other one.");
+ new Sentence(jCas, 0, jCas.getDocumentText().length()).addToIndexes();
+
+ LikelihoodIndicator suggest = new LikelihoodIndicator(jCas, 9, 16);
+ suggest.setLikelihood("moderate");
+ suggest.addToIndexes();
+
+ LikelihoodIndicator highly = new LikelihoodIndicator(jCas, 28, 43);
+ highly.setLikelihood("high");
+ highly.addToIndexes();
+
+ ConceptMention interaction = new ConceptMention(jCas, 53, 64);
+ interaction.addToIndexes();
+
+ LikelihoodIndicator not = new LikelihoodIndicator(jCas, 83, 86);
+ not.setLikelihood("negation");
+ not.addToIndexes();
+
+ ConceptMention theOtherOne = new ConceptMention(jCas, 87, 100);
+ theOtherOne.addToIndexes();
+
+ assignmentAnnotator.process(jCas);
+
+ assertEquals(highly, interaction.getLikelihood());
+ assertEquals( not, theOtherOne.getLikelihood());
+ }
+
+ @Test
+ public void testAssignNextStrategySpecificConceptType() throws Exception {
+ // Here we test that the interaction type EventMention gets the likelihood assignment and not
+ // the entity argument because that is also a ConceptMention which gets assigned by default.
+ AnalysisEngine assignmentAnnotator = AnalysisEngineFactory.createEngine(DESCRIPTOR,
+ LikelihoodAssignmentAnnotator.PARAM_ASSIGNMENT_STRATEGY, LikelihoodAssignmentAnnotator.STRATEGY_NEXT_CONCEPT,
+ LikelihoodAssignmentAnnotator.PARAM_CONCEPT_TYPE_NAME, EventMention.class.getCanonicalName());
+ final JCas jCas = assignmentAnnotator.newJCas();
+ jCas.setDocumentText("Our data suggest one entity interacts with another but there is phosphorylation.");
+ new Sentence(jCas, 0, jCas.getDocumentText().length()).addToIndexes();
+
+ LikelihoodIndicator suggest = new LikelihoodIndicator(jCas, 9, 16);
+ suggest.setLikelihood("moderate");
+ suggest.addToIndexes();
+
+ EntityMention oneEntity = new EntityMention(jCas, 17, 27);
+ oneEntity.addToIndexes();
+
+ EventMention interacts = new EventMention(jCas, 28, 37);
+ interacts.addToIndexes();
+ // this is here to test that the assignment to same-offset annotations works
+ EventMention interacts2 = new EventMention(jCas, 28, 37);
+ interacts2.addToIndexes();
+
+ EntityMention another = new EntityMention(jCas, 43, 50);
+ another.addToIndexes();
+
+ EventMention phosphorylation = new EventMention(jCas, 64, 79);
+ phosphorylation.addToIndexes();
+
+ assignmentAnnotator.process(jCas);
+
+ // only the EventMentions should be assigned likelihoods.
+ assertEquals(null, oneEntity.getLikelihood());
+ assertEquals( suggest, interacts.getLikelihood());
+ assertEquals( suggest, interacts2.getLikelihood());
+ assertEquals(null, another.getLikelihood());
+ // due to the next-concept strategy, this mention should receive the default assertion likelihood
+ assertEquals("assertion", phosphorylation.getLikelihood().getLikelihood());
+ }
}
diff --git a/jcore-likelihood-detection-ae/component.meta b/jcore-likelihood-detection-ae/component.meta
index e58826719..068a3ab10 100644
--- a/jcore-likelihood-detection-ae/component.meta
+++ b/jcore-likelihood-detection-ae/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-likelihood-detection-ae",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe Likelihood Detection AE"
}
diff --git a/jcore-likelihood-detection-ae/pom.xml b/jcore-likelihood-detection-ae/pom.xml
index c68a79a73..0fc7e7fff 100644
--- a/jcore-likelihood-detection-ae/pom.xml
+++ b/jcore-likelihood-detection-ae/pom.xml
@@ -10,7 +10,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0
@@ -42,8 +42,8 @@
julielab-java-utilities
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-engineJCoRe Likelihood Detection AE
diff --git a/jcore-likelihood-detection-ae/src/main/resources/de/julielab/jcore/ae/likelihooddetection/desc/jcore-likelihood-detection-ae.xml b/jcore-likelihood-detection-ae/src/main/resources/de/julielab/jcore/ae/likelihooddetection/desc/jcore-likelihood-detection-ae.xml
index 81e9c76f1..9e3a492f4 100644
--- a/jcore-likelihood-detection-ae/src/main/resources/de/julielab/jcore/ae/likelihooddetection/desc/jcore-likelihood-detection-ae.xml
+++ b/jcore-likelihood-detection-ae/src/main/resources/de/julielab/jcore/ae/likelihooddetection/desc/jcore-likelihood-detection-ae.xml
@@ -6,7 +6,7 @@
JCoRe Likelihood Detection AEAnalysis Engine to detect epistemic modal expressions and assign the appropriate likelihood category.
- 2.5.1-SNAPSHOT
+ 2.6.0LikelihoodDict
diff --git a/jcore-likelihood-detection-ae/src/test/java/de/julielab/jcore/ae/likelihooddetection/LikelihoodDetectionAnnotatorTest.java b/jcore-likelihood-detection-ae/src/test/java/de/julielab/jcore/ae/likelihooddetection/LikelihoodDetectionAnnotatorTest.java
index 864b0c431..eee8b0d8e 100644
--- a/jcore-likelihood-detection-ae/src/test/java/de/julielab/jcore/ae/likelihooddetection/LikelihoodDetectionAnnotatorTest.java
+++ b/jcore-likelihood-detection-ae/src/test/java/de/julielab/jcore/ae/likelihooddetection/LikelihoodDetectionAnnotatorTest.java
@@ -5,21 +5,21 @@
import de.julielab.jcore.types.Token;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.JFSIndexRepository;
import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.resource.ResourceSpecifier;
import org.apache.uima.util.InvalidXMLException;
-import org.apache.uima.util.XMLInputSource;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.Iterator;
-import static org.junit.Assert.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
/**
@@ -68,9 +68,6 @@ public void initCas(JCas aJCas) {
@Test
@SuppressWarnings("rawtypes")
public void testProcess() throws ResourceInitializationException, IOException, InvalidXMLException {
-
- XMLInputSource likelihoodXML = null;
- ResourceSpecifier likelihoodSpec = null;
AnalysisEngine likelihoodAnnotator = AnalysisEngineFactory.createEngine(DESCRIPTOR);
JCas aJCas = null;
try {
@@ -127,4 +124,22 @@ private ArrayList getPredictedIndicators(Iterator likelihoodIter) {
prediction.add(predictedCategories);
return prediction;
}
+
+ @Test
+ public void test() throws Exception {
+ String text = "Genome-wide expression analyses indicate that TAZ/YAP, TEADs, and TGFβ-induced signals coordinate a specific pro-tumorigenic transcriptional program";
+ AnalysisEngine likelihoodAnnotator = AnalysisEngineFactory.createEngine(DESCRIPTOR);
+ JCas aJCas = null;
+ try {
+ aJCas = likelihoodAnnotator.newJCas();
+ } catch (ResourceInitializationException e) {
+ LOGGER.error("testProcess()", e);
+ }
+ likelihoodAnnotator.process(aJCas);
+
+ final Collection select = JCasUtil.select(aJCas, LikelihoodIndicator.class);
+ for (var s : select) {
+ System.out.println(s.getCoveredText());
+ }
+ }
}
diff --git a/jcore-line-multiplier/component.meta b/jcore-line-multiplier/component.meta
index 432aa6b6a..38394f9cd 100644
--- a/jcore-line-multiplier/component.meta
+++ b/jcore-line-multiplier/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-line-multiplier",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe Line Multiplier"
}
diff --git a/jcore-line-multiplier/pom.xml b/jcore-line-multiplier/pom.xml
index 12aa067d8..650c68038 100644
--- a/jcore-line-multiplier/pom.xml
+++ b/jcore-line-multiplier/pom.xml
@@ -10,7 +10,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0
@@ -29,8 +29,8 @@
${jcore-types-version}
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-engineorg.assertj
diff --git a/jcore-line-multiplier/src/main/resources/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml b/jcore-line-multiplier/src/main/resources/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml
index 69ff063cd..f58d9d2ed 100644
--- a/jcore-line-multiplier/src/main/resources/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml
+++ b/jcore-line-multiplier/src/main/resources/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml
@@ -6,7 +6,7 @@
JCoRe Line MultiplierSplits incoming CAS document texts on line breaks and returns one CAS for each non-blank line.
- 2.5.1-SNAPSHOT
+ 2.6.0NumberLinesPerCAS
diff --git a/jcore-line-multiplier/src/test/java/de/julielab/jcore/multiplier/line/LineMultiplierTest.java b/jcore-line-multiplier/src/test/java/de/julielab/jcore/multiplier/line/LineMultiplierTest.java
index 23b7e9ea3..5ecd2c19a 100644
--- a/jcore-line-multiplier/src/test/java/de/julielab/jcore/multiplier/line/LineMultiplierTest.java
+++ b/jcore-line-multiplier/src/test/java/de/julielab/jcore/multiplier/line/LineMultiplierTest.java
@@ -5,13 +5,13 @@
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import java.util.ArrayList;
import java.util.List;
import static org.assertj.core.api.Assertions.assertThat;
-import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertTrue;
/**
* Unit tests for jcore-line-multiplier.
*/
diff --git a/jcore-line-multiplier/target/classes/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml b/jcore-line-multiplier/target/classes/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml
index 69ff063cd..f58d9d2ed 100644
--- a/jcore-line-multiplier/target/classes/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml
+++ b/jcore-line-multiplier/target/classes/de/julielab/jcore/multiplier/line/desc/jcore-line-multiplier-ae.xml
@@ -6,7 +6,7 @@
JCoRe Line MultiplierSplits incoming CAS document texts on line breaks and returns one CAS for each non-blank line.
- 2.5.1-SNAPSHOT
+ 2.6.0NumberLinesPerCAS
diff --git a/jcore-line-multiplier/target/test-classes/de/julielab/jcore/multiplier/line/LineMultiplierTest.class b/jcore-line-multiplier/target/test-classes/de/julielab/jcore/multiplier/line/LineMultiplierTest.class
index e654ed056..f32ad510b 100644
Binary files a/jcore-line-multiplier/target/test-classes/de/julielab/jcore/multiplier/line/LineMultiplierTest.class and b/jcore-line-multiplier/target/test-classes/de/julielab/jcore/multiplier/line/LineMultiplierTest.class differ
diff --git a/jcore-lingpipe-porterstemmer-ae/component.meta b/jcore-lingpipe-porterstemmer-ae/component.meta
index f0adaa9a1..843a38e95 100644
--- a/jcore-lingpipe-porterstemmer-ae/component.meta
+++ b/jcore-lingpipe-porterstemmer-ae/component.meta
@@ -14,7 +14,7 @@
"maven-artifact": {
"artifactId": "jcore-lingpipe-porterstemmer-ae",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe Lingpipe Porter Stemmer AE"
}
diff --git a/jcore-lingpipe-porterstemmer-ae/pom.xml b/jcore-lingpipe-porterstemmer-ae/pom.xml
index 6a10f10c5..6cd1f56ca 100644
--- a/jcore-lingpipe-porterstemmer-ae/pom.xml
+++ b/jcore-lingpipe-porterstemmer-ae/pom.xml
@@ -5,7 +5,7 @@
de.julielabjcore-base
- 2.5.1-SNAPSHOT
+ 2.6.0jcore-lingpipe-porterstemmer-aeJCoRe Lingpipe Porter Stemmer AE
@@ -22,8 +22,8 @@
4.1.2-JL1.0
- junit
- junit
+ org.junit.jupiter
+ junit-jupiter-enginehttps://github.com/JULIELab/jcore-base/tree/master/jcore-lingpipe-porterstemmer-ae
diff --git a/jcore-lingpipe-porterstemmer-ae/src/main/resources/de/julielab/jcore/ae/lingpipe/porterstemmer/desc/jcore-lingpipe-porterstemmer-ae.xml b/jcore-lingpipe-porterstemmer-ae/src/main/resources/de/julielab/jcore/ae/lingpipe/porterstemmer/desc/jcore-lingpipe-porterstemmer-ae.xml
index b959cf460..c432b936e 100644
--- a/jcore-lingpipe-porterstemmer-ae/src/main/resources/de/julielab/jcore/ae/lingpipe/porterstemmer/desc/jcore-lingpipe-porterstemmer-ae.xml
+++ b/jcore-lingpipe-porterstemmer-ae/src/main/resources/de/julielab/jcore/ae/lingpipe/porterstemmer/desc/jcore-lingpipe-porterstemmer-ae.xml
@@ -5,7 +5,7 @@
JCoRe Lingpipe Porterstemmer AEAdds a StemmedForm to each token in the CAS. The offsets and the value feature of each StemmedForm are set to the stem as returned by the Porter stemmer algorithm as implemented by Lingpipe.
- 2.5.1-SNAPSHOT
+ 2.6.0JULIE Lab, Germany
diff --git a/jcore-lingpipe-porterstemmer-ae/src/test/java/de/julielab/jcore/ae/lingpipe/porterstemmer/LingpipePorterstemmerAnnotatorTest.java b/jcore-lingpipe-porterstemmer-ae/src/test/java/de/julielab/jcore/ae/lingpipe/porterstemmer/LingpipePorterstemmerAnnotatorTest.java
index 58eb08a15..5bc2d85dd 100644
--- a/jcore-lingpipe-porterstemmer-ae/src/test/java/de/julielab/jcore/ae/lingpipe/porterstemmer/LingpipePorterstemmerAnnotatorTest.java
+++ b/jcore-lingpipe-porterstemmer-ae/src/test/java/de/julielab/jcore/ae/lingpipe/porterstemmer/LingpipePorterstemmerAnnotatorTest.java
@@ -16,10 +16,10 @@
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
public class LingpipePorterstemmerAnnotatorTest {
@Test
diff --git a/jcore-lingpipegazetteer-ae/LICENSE b/jcore-lingpipegazetteer-ae/LICENSE
index be3f7b28e..f57182ac3 100644
--- a/jcore-lingpipegazetteer-ae/LICENSE
+++ b/jcore-lingpipegazetteer-ae/LICENSE
@@ -1,661 +1,73 @@
- GNU AFFERO GENERAL PUBLIC LICENSE
- Version 3, 19 November 2007
-
- Copyright (C) 2007 Free Software Foundation, Inc.
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
- Preamble
-
- The GNU Affero General Public License is a free, copyleft license for
-software and other kinds of works, specifically designed to ensure
-cooperation with the community in the case of network server software.
-
- The licenses for most software and other practical works are designed
-to take away your freedom to share and change the works. By contrast,
-our General Public Licenses are intended to guarantee your freedom to
-share and change all versions of a program--to make sure it remains free
-software for all its users.
-
- When we speak of free software, we are referring to freedom, not
-price. Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-them if you wish), that you receive source code or can get it if you
-want it, that you can change the software or use pieces of it in new
-free programs, and that you know you can do these things.
-
- Developers that use our General Public Licenses protect your rights
-with two steps: (1) assert copyright on the software, and (2) offer
-you this License which gives you legal permission to copy, distribute
-and/or modify the software.
-
- A secondary benefit of defending all users' freedom is that
-improvements made in alternate versions of the program, if they
-receive widespread use, become available for other developers to
-incorporate. Many developers of free software are heartened and
-encouraged by the resulting cooperation. However, in the case of
-software used on network servers, this result may fail to come about.
-The GNU General Public License permits making a modified version and
-letting the public access it on a server without ever releasing its
-source code to the public.
-
- The GNU Affero General Public License is designed specifically to
-ensure that, in such cases, the modified source code becomes available
-to the community. It requires the operator of a network server to
-provide the source code of the modified version running there to the
-users of that server. Therefore, public use of a modified version, on
-a publicly accessible server, gives the public access to the source
-code of the modified version.
-
- An older license, called the Affero General Public License and
-published by Affero, was designed to accomplish similar goals. This is
-a different license, not a version of the Affero GPL, but Affero has
-released a new version of the Affero GPL which permits relicensing under
-this license.
-
- The precise terms and conditions for copying, distribution and
-modification follow.
-
- TERMS AND CONDITIONS
-
- 0. Definitions.
-
- "This License" refers to version 3 of the GNU Affero General Public License.
-
- "Copyright" also means copyright-like laws that apply to other kinds of
-works, such as semiconductor masks.
-
- "The Program" refers to any copyrightable work licensed under this
-License. Each licensee is addressed as "you". "Licensees" and
-"recipients" may be individuals or organizations.
-
- To "modify" a work means to copy from or adapt all or part of the work
-in a fashion requiring copyright permission, other than the making of an
-exact copy. The resulting work is called a "modified version" of the
-earlier work or a work "based on" the earlier work.
-
- A "covered work" means either the unmodified Program or a work based
-on the Program.
-
- To "propagate" a work means to do anything with it that, without
-permission, would make you directly or secondarily liable for
-infringement under applicable copyright law, except executing it on a
-computer or modifying a private copy. Propagation includes copying,
-distribution (with or without modification), making available to the
-public, and in some countries other activities as well.
-
- To "convey" a work means any kind of propagation that enables other
-parties to make or receive copies. Mere interaction with a user through
-a computer network, with no transfer of a copy, is not conveying.
-
- An interactive user interface displays "Appropriate Legal Notices"
-to the extent that it includes a convenient and prominently visible
-feature that (1) displays an appropriate copyright notice, and (2)
-tells the user that there is no warranty for the work (except to the
-extent that warranties are provided), that licensees may convey the
-work under this License, and how to view a copy of this License. If
-the interface presents a list of user commands or options, such as a
-menu, a prominent item in the list meets this criterion.
-
- 1. Source Code.
-
- The "source code" for a work means the preferred form of the work
-for making modifications to it. "Object code" means any non-source
-form of a work.
-
- A "Standard Interface" means an interface that either is an official
-standard defined by a recognized standards body, or, in the case of
-interfaces specified for a particular programming language, one that
-is widely used among developers working in that language.
-
- The "System Libraries" of an executable work include anything, other
-than the work as a whole, that (a) is included in the normal form of
-packaging a Major Component, but which is not part of that Major
-Component, and (b) serves only to enable use of the work with that
-Major Component, or to implement a Standard Interface for which an
-implementation is available to the public in source code form. A
-"Major Component", in this context, means a major essential component
-(kernel, window system, and so on) of the specific operating system
-(if any) on which the executable work runs, or a compiler used to
-produce the work, or an object code interpreter used to run it.
-
- The "Corresponding Source" for a work in object code form means all
-the source code needed to generate, install, and (for an executable
-work) run the object code and to modify the work, including scripts to
-control those activities. However, it does not include the work's
-System Libraries, or general-purpose tools or generally available free
-programs which are used unmodified in performing those activities but
-which are not part of the work. For example, Corresponding Source
-includes interface definition files associated with source files for
-the work, and the source code for shared libraries and dynamically
-linked subprograms that the work is specifically designed to require,
-such as by intimate data communication or control flow between those
-subprograms and other parts of the work.
-
- The Corresponding Source need not include anything that users
-can regenerate automatically from other parts of the Corresponding
-Source.
-
- The Corresponding Source for a work in source code form is that
-same work.
-
- 2. Basic Permissions.
-
- All rights granted under this License are granted for the term of
-copyright on the Program, and are irrevocable provided the stated
-conditions are met. This License explicitly affirms your unlimited
-permission to run the unmodified Program. The output from running a
-covered work is covered by this License only if the output, given its
-content, constitutes a covered work. This License acknowledges your
-rights of fair use or other equivalent, as provided by copyright law.
-
- You may make, run and propagate covered works that you do not
-convey, without conditions so long as your license otherwise remains
-in force. You may convey covered works to others for the sole purpose
-of having them make modifications exclusively for you, or provide you
-with facilities for running those works, provided that you comply with
-the terms of this License in conveying all material for which you do
-not control copyright. Those thus making or running the covered works
-for you must do so exclusively on your behalf, under your direction
-and control, on terms that prohibit them from making any copies of
-your copyrighted material outside their relationship with you.
-
- Conveying under any other circumstances is permitted solely under
-the conditions stated below. Sublicensing is not allowed; section 10
-makes it unnecessary.
-
- 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
-
- No covered work shall be deemed part of an effective technological
-measure under any applicable law fulfilling obligations under article
-11 of the WIPO copyright treaty adopted on 20 December 1996, or
-similar laws prohibiting or restricting circumvention of such
-measures.
-
- When you convey a covered work, you waive any legal power to forbid
-circumvention of technological measures to the extent such circumvention
-is effected by exercising rights under this License with respect to
-the covered work, and you disclaim any intention to limit operation or
-modification of the work as a means of enforcing, against the work's
-users, your or third parties' legal rights to forbid circumvention of
-technological measures.
-
- 4. Conveying Verbatim Copies.
-
- You may convey verbatim copies of the Program's source code as you
-receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy an appropriate copyright notice;
-keep intact all notices stating that this License and any
-non-permissive terms added in accord with section 7 apply to the code;
-keep intact all notices of the absence of any warranty; and give all
-recipients a copy of this License along with the Program.
-
- You may charge any price or no price for each copy that you convey,
-and you may offer support or warranty protection for a fee.
-
- 5. Conveying Modified Source Versions.
-
- You may convey a work based on the Program, or the modifications to
-produce it from the Program, in the form of source code under the
-terms of section 4, provided that you also meet all of these conditions:
-
- a) The work must carry prominent notices stating that you modified
- it, and giving a relevant date.
-
- b) The work must carry prominent notices stating that it is
- released under this License and any conditions added under section
- 7. This requirement modifies the requirement in section 4 to
- "keep intact all notices".
-
- c) You must license the entire work, as a whole, under this
- License to anyone who comes into possession of a copy. This
- License will therefore apply, along with any applicable section 7
- additional terms, to the whole of the work, and all its parts,
- regardless of how they are packaged. This License gives no
- permission to license the work in any other way, but it does not
- invalidate such permission if you have separately received it.
-
- d) If the work has interactive user interfaces, each must display
- Appropriate Legal Notices; however, if the Program has interactive
- interfaces that do not display Appropriate Legal Notices, your
- work need not make them do so.
-
- A compilation of a covered work with other separate and independent
-works, which are not by their nature extensions of the covered work,
-and which are not combined with it such as to form a larger program,
-in or on a volume of a storage or distribution medium, is called an
-"aggregate" if the compilation and its resulting copyright are not
-used to limit the access or legal rights of the compilation's users
-beyond what the individual works permit. Inclusion of a covered work
-in an aggregate does not cause this License to apply to the other
-parts of the aggregate.
-
- 6. Conveying Non-Source Forms.
-
- You may convey a covered work in object code form under the terms
-of sections 4 and 5, provided that you also convey the
-machine-readable Corresponding Source under the terms of this License,
-in one of these ways:
-
- a) Convey the object code in, or embodied in, a physical product
- (including a physical distribution medium), accompanied by the
- Corresponding Source fixed on a durable physical medium
- customarily used for software interchange.
-
- b) Convey the object code in, or embodied in, a physical product
- (including a physical distribution medium), accompanied by a
- written offer, valid for at least three years and valid for as
- long as you offer spare parts or customer support for that product
- model, to give anyone who possesses the object code either (1) a
- copy of the Corresponding Source for all the software in the
- product that is covered by this License, on a durable physical
- medium customarily used for software interchange, for a price no
- more than your reasonable cost of physically performing this
- conveying of source, or (2) access to copy the
- Corresponding Source from a network server at no charge.
-
- c) Convey individual copies of the object code with a copy of the
- written offer to provide the Corresponding Source. This
- alternative is allowed only occasionally and noncommercially, and
- only if you received the object code with such an offer, in accord
- with subsection 6b.
-
- d) Convey the object code by offering access from a designated
- place (gratis or for a charge), and offer equivalent access to the
- Corresponding Source in the same way through the same place at no
- further charge. You need not require recipients to copy the
- Corresponding Source along with the object code. If the place to
- copy the object code is a network server, the Corresponding Source
- may be on a different server (operated by you or a third party)
- that supports equivalent copying facilities, provided you maintain
- clear directions next to the object code saying where to find the
- Corresponding Source. Regardless of what server hosts the
- Corresponding Source, you remain obligated to ensure that it is
- available for as long as needed to satisfy these requirements.
-
- e) Convey the object code using peer-to-peer transmission, provided
- you inform other peers where the object code and Corresponding
- Source of the work are being offered to the general public at no
- charge under subsection 6d.
-
- A separable portion of the object code, whose source code is excluded
-from the Corresponding Source as a System Library, need not be
-included in conveying the object code work.
-
- A "User Product" is either (1) a "consumer product", which means any
-tangible personal property which is normally used for personal, family,
-or household purposes, or (2) anything designed or sold for incorporation
-into a dwelling. In determining whether a product is a consumer product,
-doubtful cases shall be resolved in favor of coverage. For a particular
-product received by a particular user, "normally used" refers to a
-typical or common use of that class of product, regardless of the status
-of the particular user or of the way in which the particular user
-actually uses, or expects or is expected to use, the product. A product
-is a consumer product regardless of whether the product has substantial
-commercial, industrial or non-consumer uses, unless such uses represent
-the only significant mode of use of the product.
-
- "Installation Information" for a User Product means any methods,
-procedures, authorization keys, or other information required to install
-and execute modified versions of a covered work in that User Product from
-a modified version of its Corresponding Source. The information must
-suffice to ensure that the continued functioning of the modified object
-code is in no case prevented or interfered with solely because
-modification has been made.
-
- If you convey an object code work under this section in, or with, or
-specifically for use in, a User Product, and the conveying occurs as
-part of a transaction in which the right of possession and use of the
-User Product is transferred to the recipient in perpetuity or for a
-fixed term (regardless of how the transaction is characterized), the
-Corresponding Source conveyed under this section must be accompanied
-by the Installation Information. But this requirement does not apply
-if neither you nor any third party retains the ability to install
-modified object code on the User Product (for example, the work has
-been installed in ROM).
-
- The requirement to provide Installation Information does not include a
-requirement to continue to provide support service, warranty, or updates
-for a work that has been modified or installed by the recipient, or for
-the User Product in which it has been modified or installed. Access to a
-network may be denied when the modification itself materially and
-adversely affects the operation of the network or violates the rules and
-protocols for communication across the network.
-
- Corresponding Source conveyed, and Installation Information provided,
-in accord with this section must be in a format that is publicly
-documented (and with an implementation available to the public in
-source code form), and must require no special password or key for
-unpacking, reading or copying.
-
- 7. Additional Terms.
-
- "Additional permissions" are terms that supplement the terms of this
-License by making exceptions from one or more of its conditions.
-Additional permissions that are applicable to the entire Program shall
-be treated as though they were included in this License, to the extent
-that they are valid under applicable law. If additional permissions
-apply only to part of the Program, that part may be used separately
-under those permissions, but the entire Program remains governed by
-this License without regard to the additional permissions.
-
- When you convey a copy of a covered work, you may at your option
-remove any additional permissions from that copy, or from any part of
-it. (Additional permissions may be written to require their own
-removal in certain cases when you modify the work.) You may place
-additional permissions on material, added by you to a covered work,
-for which you have or can give appropriate copyright permission.
-
- Notwithstanding any other provision of this License, for material you
-add to a covered work, you may (if authorized by the copyright holders of
-that material) supplement the terms of this License with terms:
-
- a) Disclaiming warranty or limiting liability differently from the
- terms of sections 15 and 16 of this License; or
-
- b) Requiring preservation of specified reasonable legal notices or
- author attributions in that material or in the Appropriate Legal
- Notices displayed by works containing it; or
-
- c) Prohibiting misrepresentation of the origin of that material, or
- requiring that modified versions of such material be marked in
- reasonable ways as different from the original version; or
-
- d) Limiting the use for publicity purposes of names of licensors or
- authors of the material; or
-
- e) Declining to grant rights under trademark law for use of some
- trade names, trademarks, or service marks; or
-
- f) Requiring indemnification of licensors and authors of that
- material by anyone who conveys the material (or modified versions of
- it) with contractual assumptions of liability to the recipient, for
- any liability that these contractual assumptions directly impose on
- those licensors and authors.
-
- All other non-permissive additional terms are considered "further
-restrictions" within the meaning of section 10. If the Program as you
-received it, or any part of it, contains a notice stating that it is
-governed by this License along with a term that is a further
-restriction, you may remove that term. If a license document contains
-a further restriction but permits relicensing or conveying under this
-License, you may add to a covered work material governed by the terms
-of that license document, provided that the further restriction does
-not survive such relicensing or conveying.
-
- If you add terms to a covered work in accord with this section, you
-must place, in the relevant source files, a statement of the
-additional terms that apply to those files, or a notice indicating
-where to find the applicable terms.
-
- Additional terms, permissive or non-permissive, may be stated in the
-form of a separately written license, or stated as exceptions;
-the above requirements apply either way.
-
- 8. Termination.
-
- You may not propagate or modify a covered work except as expressly
-provided under this License. Any attempt otherwise to propagate or
-modify it is void, and will automatically terminate your rights under
-this License (including any patent licenses granted under the third
-paragraph of section 11).
-
- However, if you cease all violation of this License, then your
-license from a particular copyright holder is reinstated (a)
-provisionally, unless and until the copyright holder explicitly and
-finally terminates your license, and (b) permanently, if the copyright
-holder fails to notify you of the violation by some reasonable means
-prior to 60 days after the cessation.
-
- Moreover, your license from a particular copyright holder is
-reinstated permanently if the copyright holder notifies you of the
-violation by some reasonable means, this is the first time you have
-received notice of violation of this License (for any work) from that
-copyright holder, and you cure the violation prior to 30 days after
-your receipt of the notice.
-
- Termination of your rights under this section does not terminate the
-licenses of parties who have received copies or rights from you under
-this License. If your rights have been terminated and not permanently
-reinstated, you do not qualify to receive new licenses for the same
-material under section 10.
-
- 9. Acceptance Not Required for Having Copies.
-
- You are not required to accept this License in order to receive or
-run a copy of the Program. Ancillary propagation of a covered work
-occurring solely as a consequence of using peer-to-peer transmission
-to receive a copy likewise does not require acceptance. However,
-nothing other than this License grants you permission to propagate or
-modify any covered work. These actions infringe copyright if you do
-not accept this License. Therefore, by modifying or propagating a
-covered work, you indicate your acceptance of this License to do so.
-
- 10. Automatic Licensing of Downstream Recipients.
-
- Each time you convey a covered work, the recipient automatically
-receives a license from the original licensors, to run, modify and
-propagate that work, subject to this License. You are not responsible
-for enforcing compliance by third parties with this License.
-
- An "entity transaction" is a transaction transferring control of an
-organization, or substantially all assets of one, or subdividing an
-organization, or merging organizations. If propagation of a covered
-work results from an entity transaction, each party to that
-transaction who receives a copy of the work also receives whatever
-licenses to the work the party's predecessor in interest had or could
-give under the previous paragraph, plus a right to possession of the
-Corresponding Source of the work from the predecessor in interest, if
-the predecessor has it or can get it with reasonable efforts.
-
- You may not impose any further restrictions on the exercise of the
-rights granted or affirmed under this License. For example, you may
-not impose a license fee, royalty, or other charge for exercise of
-rights granted under this License, and you may not initiate litigation
-(including a cross-claim or counterclaim in a lawsuit) alleging that
-any patent claim is infringed by making, using, selling, offering for
-sale, or importing the Program or any portion of it.
-
- 11. Patents.
-
- A "contributor" is a copyright holder who authorizes use under this
-License of the Program or a work on which the Program is based. The
-work thus licensed is called the contributor's "contributor version".
-
- A contributor's "essential patent claims" are all patent claims
-owned or controlled by the contributor, whether already acquired or
-hereafter acquired, that would be infringed by some manner, permitted
-by this License, of making, using, or selling its contributor version,
-but do not include claims that would be infringed only as a
-consequence of further modification of the contributor version. For
-purposes of this definition, "control" includes the right to grant
-patent sublicenses in a manner consistent with the requirements of
-this License.
-
- Each contributor grants you a non-exclusive, worldwide, royalty-free
-patent license under the contributor's essential patent claims, to
-make, use, sell, offer for sale, import and otherwise run, modify and
-propagate the contents of its contributor version.
-
- In the following three paragraphs, a "patent license" is any express
-agreement or commitment, however denominated, not to enforce a patent
-(such as an express permission to practice a patent or covenant not to
-sue for patent infringement). To "grant" such a patent license to a
-party means to make such an agreement or commitment not to enforce a
-patent against the party.
-
- If you convey a covered work, knowingly relying on a patent license,
-and the Corresponding Source of the work is not available for anyone
-to copy, free of charge and under the terms of this License, through a
-publicly available network server or other readily accessible means,
-then you must either (1) cause the Corresponding Source to be so
-available, or (2) arrange to deprive yourself of the benefit of the
-patent license for this particular work, or (3) arrange, in a manner
-consistent with the requirements of this License, to extend the patent
-license to downstream recipients. "Knowingly relying" means you have
-actual knowledge that, but for the patent license, your conveying the
-covered work in a country, or your recipient's use of the covered work
-in a country, would infringe one or more identifiable patents in that
-country that you have reason to believe are valid.
-
- If, pursuant to or in connection with a single transaction or
-arrangement, you convey, or propagate by procuring conveyance of, a
-covered work, and grant a patent license to some of the parties
-receiving the covered work authorizing them to use, propagate, modify
-or convey a specific copy of the covered work, then the patent license
-you grant is automatically extended to all recipients of the covered
-work and works based on it.
-
- A patent license is "discriminatory" if it does not include within
-the scope of its coverage, prohibits the exercise of, or is
-conditioned on the non-exercise of one or more of the rights that are
-specifically granted under this License. You may not convey a covered
-work if you are a party to an arrangement with a third party that is
-in the business of distributing software, under which you make payment
-to the third party based on the extent of your activity of conveying
-the work, and under which the third party grants, to any of the
-parties who would receive the covered work from you, a discriminatory
-patent license (a) in connection with copies of the covered work
-conveyed by you (or copies made from those copies), or (b) primarily
-for and in connection with specific products or compilations that
-contain the covered work, unless you entered into that arrangement,
-or that patent license was granted, prior to 28 March 2007.
-
- Nothing in this License shall be construed as excluding or limiting
-any implied license or other defenses to infringement that may
-otherwise be available to you under applicable patent law.
-
- 12. No Surrender of Others' Freedom.
-
- If conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License. If you cannot convey a
-covered work so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you may
-not convey it at all. For example, if you agree to terms that obligate you
-to collect a royalty for further conveying from those to whom you convey
-the Program, the only way you could satisfy both those terms and this
-License would be to refrain entirely from conveying the Program.
-
- 13. Remote Network Interaction; Use with the GNU General Public License.
-
- Notwithstanding any other provision of this License, if you modify the
-Program, your modified version must prominently offer all users
-interacting with it remotely through a computer network (if your version
-supports such interaction) an opportunity to receive the Corresponding
-Source of your version by providing access to the Corresponding Source
-from a network server at no charge, through some standard or customary
-means of facilitating copying of software. This Corresponding Source
-shall include the Corresponding Source for any work covered by version 3
-of the GNU General Public License that is incorporated pursuant to the
-following paragraph.
-
- Notwithstanding any other provision of this License, you have
-permission to link or combine any covered work with a work licensed
-under version 3 of the GNU General Public License into a single
-combined work, and to convey the resulting work. The terms of this
-License will continue to apply to the part which is the covered work,
-but the work with which it is combined will remain governed by version
-3 of the GNU General Public License.
-
- 14. Revised Versions of this License.
-
- The Free Software Foundation may publish revised and/or new versions of
-the GNU Affero General Public License from time to time. Such new versions
-will be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-
- Each version is given a distinguishing version number. If the
-Program specifies that a certain numbered version of the GNU Affero General
-Public License "or any later version" applies to it, you have the
-option of following the terms and conditions either of that numbered
-version or of any later version published by the Free Software
-Foundation. If the Program does not specify a version number of the
-GNU Affero General Public License, you may choose any version ever published
-by the Free Software Foundation.
-
- If the Program specifies that a proxy can decide which future
-versions of the GNU Affero General Public License can be used, that proxy's
-public statement of acceptance of a version permanently authorizes you
-to choose that version for the Program.
-
- Later license versions may give you additional or different
-permissions. However, no additional obligations are imposed on any
-author or copyright holder as a result of your choosing to follow a
-later version.
-
- 15. Disclaimer of Warranty.
-
- THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
-APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
-IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
-ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
- 16. Limitation of Liability.
-
- IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGES.
-
- 17. Interpretation of Sections 15 and 16.
-
- If the disclaimer of warranty and limitation of liability provided
-above cannot be given local legal effect according to their terms,
-reviewing courts shall apply local law that most closely approximates
-an absolute waiver of all civil liability in connection with the
-Program, unless a warranty or assumption of liability accompanies a
-copy of the Program in return for a fee.
-
- END OF TERMS AND CONDITIONS
-
- How to Apply These Terms to Your New Programs
-
- If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
- To do so, attach the following notices to the program. It is safest
-to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
-
- Copyright (C)
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see .
-
-Also add information on how to contact you by electronic and paper mail.
-
- If your software can interact with users remotely through a computer
-network, you should also make sure that it provides a way for users to
-get its source. For example, if your program is a web application, its
-interface could display a "Source" link that leads users to an archive
-of the code. There are many ways you could offer source, and different
-solutions will be better for different programs; see section 13 for the
-specific requirements.
-
- You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU AGPL, see
-.
+Alias-i ROYALTY FREE LICENSE VERSION 1
+
+Copyright c 2003-2007 Alias-i, Inc
+All Rights Reserved
+
+1. This Alias-i Royalty Free License Version 1 ("License") governs
+ the copying, modifying, and distributing of the computer program or
+ work containing a notice stating that it is subject to the terms of
+ this License and any derivative works of that computer program or
+ work. The computer program or work and any derivative works thereof
+ are the "Software." Your copying, modifying, or distributing of the
+ Software constitutes acceptance of this License. Although you are not
+ required to accept this License, since you have not signed it, nothing
+ else grants you permission to copy, modify, or distribute the
+ Software. If you wish to receive a license from Alias-i under
+ different terms than those contained in this License, please contact
+ Alias-i. Otherwise, if you do not accept this License, any copying,
+ modifying, or distributing of the Software is strictly prohibited by
+ law.
+
+2. You may copy or modify the Software or use any output of the
+ Software (i) for internal non-production trial, testing and evaluation
+ of the Software, or (ii) in connection with any product or service you
+ provide to third parties for free. Copying or modifying the Software
+ includes the acts of "installing", "running", "using", "accessing" or
+ "deploying" the Software as those terms are understood in the software
+ industry. Therefore, those activities are only permitted under this
+ License in the ways that copying or modifying are permitted.
+
+3. You may distribute the Software, provided that you: (i) distribute
+ the Software only under the terms of this License, no more, no less;
+ (ii) include a copy of this License along with any such distribution;
+ (iii) include the complete corresponding machine-readable source code
+ of the Software you are distributing; (iv) do not remove any copyright
+ or other notices from the Software; and, (v) cause any files of the
+ Software that you modified to carry prominent notices stating that you
+ changed the Software and the date of any change so that recipients
+ know that they are not receiving the original Software.
+
+4. Whether you distribute the Software or not, if you distribute any
+ computer program that is not the Software, but that (a) is distributed
+ in connection with the Software or contains any part of the Software,
+ (b) causes the Software to be copied or modified (i.e., ran, used, or
+ executed), such as through an API call, or (c) uses any output of the
+ Software, then you must distribute that other computer program under a
+ license defined as a Free Software License by the Free Software
+ Foundation or an Approved Open Source License by the Open Source
+ Initiative.
+
+5. You may not copy, modify, or distribute the Software except as
+ expressly provided under this License, unless you receive a different
+ written license from Alias-i to do so. Any attempt otherwise to copy,
+ modify, or distribute the Software is without Alias-i's permission, is
+ void, and will automatically terminate your rights under this License.
+ Your rights under this License may only be reinstated by a signed
+ writing from Alias-i.
+
+THE SOFTWARE IS PROVIDED "AS IS." TO THE MAXIMUM EXTENT PERMITTED BY
+APPLICABLE LAW, ALIAS-i DOES NOT MAKE, AND HEREBY EXPRESSLY DISCLAIMS,
+ANY WARRANTIES, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, CONCERNING
+THE SOFTWARE OR ANY SUBJECT MATTER OF THIS LICENSE. SPECIFICALLY, BUT
+WITHOUT LIMITING THE FOREGOING, LICENSOR MAKES NO EXPRESS OR IMPLIED
+WARRANTY OF MERCHANTABILITY, FITNESS (FOR A PARTICULAR PURPOSE OR
+OTHERWISE), QUALITY, USEFULNESS, TITLE, OR NON-INFRINGEMENT. TO THE
+MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL LICENSOR
+BE LIABLE TO YOU OR ANY THIRD PARTY FOR ANY DAMAGES OR IN RESPECT OF
+ANY CLAIM UNDER ANY TORT, CONTRACT, STRICT LIABILITY, NEGLIGENCE OR
+OTHER THEORY FOR ANY DIRECT, INDIRECT, INCIDENTAL, CONSEQUENTIAL,
+PUNITIVE, SPECIAL OR EXEMPLARY DAMAGES, EVEN IF IT HAS BEEN ADVISED OF
+THE POSSIBILITY OF SUCH DAMAGES, OR FOR ANY AMOUNTS IN EXCESS OF THE
+AMOUNT YOU PAID ALIAS-i FOR THIS LICENSE. YOU MUST PASS THIS ENTIRE
+LICENSE, INCLUDING SPECIFICALLY THIS DISCLAIMER AND LIMITATION OF
+LIABILITY, ON WHENEVER YOU DISTRIBUTE THE SOFTWARE.
diff --git a/jcore-lingpipegazetteer-ae/component.meta b/jcore-lingpipegazetteer-ae/component.meta
index 0a77648a3..6b1d1c0bf 100644
--- a/jcore-lingpipegazetteer-ae/component.meta
+++ b/jcore-lingpipegazetteer-ae/component.meta
@@ -18,7 +18,7 @@
"maven-artifact": {
"artifactId": "jcore-lingpipe-gazetteer-ae",
"groupId": "de.julielab",
- "version": "2.5.1-SNAPSHOT"
+ "version": "2.6.0"
},
"name": "JCoRe Lingpipe Gazetteer AE"
}
diff --git a/jcore-lingpipegazetteer-ae/pom.xml b/jcore-lingpipegazetteer-ae/pom.xml
index 1d39efcf8..3046249fb 100644
--- a/jcore-lingpipegazetteer-ae/pom.xml
+++ b/jcore-lingpipegazetteer-ae/pom.xml
@@ -1,68 +1,76 @@
-
+
- 4.0.0
- jcore-lingpipe-gazetteer-ae
- jar
- JCoRe Lingpipe Gazetteer AE
- Basically used as NE tagger based on Lingpipe's dictionary-lookup tagger.
+ 4.0.0
+ jcore-lingpipe-gazetteer-ae
+ jar
+ JCoRe Lingpipe Gazetteer AE
+ Basically used as NE tagger based on Lingpipe's dictionary-lookup tagger.
-
- de.julielab
- jcore-base
- 2.5.1-SNAPSHOT
-
+
+ de.julielab
+ jcore-base
+ 2.6.0
+
-
-
- de.julielab
- jcore-descriptor-creator
-
+
+
+ de.julielab
+ jcore-descriptor-creator
+ de.julielabjcore-types${jcore-types-version}
-
- org.slf4j
- slf4j-api
-
-
- de.julielab
- jcore-utilities
- ${jcore-utilities-version}
-
-
- ch.qos.logback
- logback-classic
- provided
-
-
- com.ibm.icu
- icu4j
- 4.8.1.1
-
-
- de.julielab
- aliasi-lingpipe
- 4.1.2-JL1.0
-
-
- org.apache.commons
- commons-lang3
- 3.4
-
- junitjunit
-
- JULIE Lab, Germany
- http://www.julielab.de
-
-
+
+ org.slf4j
+ slf4j-api
+
+
+ de.julielab
+ jcore-utilities
+ ${jcore-utilities-version}
+
+
+ ch.qos.logback
+ logback-classic
+ provided
+
+
+ com.ibm.icu
+ icu4j
+ 4.8.1.1
+
+
+ de.julielab
+ aliasi-lingpipe
+ 4.1.2-JL1.0
+
+
+ org.apache.commons
+ commons-lang3
+
+
+ org.assertj
+ assertj-core
+
+
+ org.junit.jupiter
+ junit-jupiter-engine
+
+
+
+ JULIE Lab, Germany
+ http://www.julielab.de
+
+ GNU Affero General Public License, Version 3.0http://www.gnu.org/licenses/agpl-3.0.en.html
- https://github.com/JULIELab/jcore-base/tree/master/jcore-lingpipe-gazetteer-ae
-
+ https://github.com/JULIELab/jcore-base/tree/master/jcore-lingpipe-gazetteer-ae
+
diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProvider.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProvider.java
index 0395da7c8..0e43d4cd4 100644
--- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProvider.java
+++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProvider.java
@@ -13,6 +13,8 @@ public interface ChunkerProvider {
public boolean getUseApproximateMatching();
public boolean getNormalize();
+
+ public boolean getNormalizePlural();
public boolean getTransliterate();
diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImpl.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImpl.java
index dc5613755..06171ed03 100644
--- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImpl.java
+++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImpl.java
@@ -428,6 +428,11 @@ public boolean getNormalize() {
return false;
}
+ @Override
+ public boolean getNormalizePlural() {
+ return false;
+ }
+
@Override
public boolean getTransliterate() {
return false;
diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImplAlt.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImplAlt.java
index 7e3daa924..175653bf5 100644
--- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImplAlt.java
+++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ChunkerProviderImplAlt.java
@@ -42,6 +42,12 @@ public class ChunkerProviderImplAlt implements ChunkerProvider, SharedResourceOb
* switched on in the descriptor for the annotator itself!
*/
public final static String PARAM_NORMALIZE_TEXT = "NormalizeText";
+ /**
+ * Only in effect when {@link #PARAM_NORMALIZE_TEXT} is set to true. If so, will normalize plurals
+ * found in the text by removing the training 's'. Requires annotations of the type {@link de.julielab.jcore.types.PennBioIEPOSTag}
+ * to be present in the CAS.
+ */
+ public static final String PARAM_NORMALIZE_PLURAL = "NormalizePlural";
/**
* Parameter to indicate whether text - dictionary entries for this class - should be transliterated, i.e. whether
* accents and other character variations should be stripped. If this is switched on here, it must also be switched
@@ -54,6 +60,7 @@ public class ChunkerProviderImplAlt implements ChunkerProvider, SharedResourceOb
private boolean useApproximateMatching;
private boolean transliterate;
private boolean normalize;
+ private boolean normalizePlural;
private InputStream dictFile;
private InputStream stopFile;
@@ -71,6 +78,10 @@ public Chunker getChunker() {
return dictChunker;
}
+ public boolean getNormalizePlural() {
+ return normalizePlural;
+ }
+
public void load(DataResource resource) throws ResourceInitializationException {
LOGGER.info("Loading configuration file from URI \"{}\" (URL: \"{}\").", resource.getUri(), resource.getUrl());
Properties properties = new Properties();
@@ -118,7 +129,11 @@ public void load(DataResource resource) throws ResourceInitializationException {
normalize = false;
if (normalizeString != null)
normalize = new Boolean(normalizeString);
- LOGGER.info("Normalize dictionary entries (i.e. completely strip dashes, parenthesis etc): {}", normalize);
+ LOGGER.info("Normalize dictionary entries and text (i.e. completely strip dashes, parenthesis etc): {}", normalize);
+
+ normalizePlural = Boolean.parseBoolean(properties.getProperty(PARAM_NORMALIZE_PLURAL, "false")) && normalize;
+ if (normalize)
+ LOGGER.info("Also normalize plural forms to singular: {}", normalizePlural);
String transliterateString = properties.getProperty(PARAM_TRANSLITERATE_TEXT);
transliterate = false;
@@ -256,14 +271,14 @@ private void readDictionary(InputStream dictFileStream) throws IOException, Anal
bf = new BufferedReader(new InputStreamReader(dictFileStream));
String line = "";
- Transliterator transliterator = null;
- if (transliterate)
- transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower");
+ Transliterator transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC");
TokenizerFactory tokenizerFactory = null;
if (normalize)
tokenizerFactory = new IndoEuropeanTokenizerFactory();
while ((line = bf.readLine()) != null) {
+ if (line.startsWith("#"))
+ continue;
String[] values = line.split("\t");
if (values.length != 2) {
LOGGER.error("readDictionary() - wrong format of line: " + line);
@@ -276,11 +291,11 @@ private void readDictionary(InputStream dictFileStream) throws IOException, Anal
continue;
if (normalize) {
- term = StringNormalizerForChunking.normalizeString(term, tokenizerFactory).string;
+ term = StringNormalizerForChunking.normalizeString(term, tokenizerFactory, transliterator).string;
}
if (transliterate)
term = transliterator.transform(term);
- if (useApproximateMatching && !caseSensitive && !transliterate)
+ if (useApproximateMatching && !caseSensitive)
term = term.toLowerCase();
String label = values[1].trim();
diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.java
index f0ae88711..f319562bd 100644
--- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.java
+++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.java
@@ -1,4 +1,3 @@
-
package de.julielab.jcore.ae.lingpipegazetteer.chunking;
import com.aliasi.chunk.Chunker;
@@ -21,6 +20,7 @@
import java.io.*;
import java.net.URI;
import java.util.HashSet;
+import java.util.Optional;
import java.util.Set;
import java.util.zip.GZIPInputStream;
@@ -29,317 +29,333 @@
* Also, this implementation expects a configurableDataResourceSpecifier for the external resource,
* specifying the dictionary directly and providing the parameters via the normal UIMA resource meta data
* mechanism.
- *
+ *
* @author faessler
- *
*/
public class ConfigurableChunkerProviderImplAlt implements ChunkerProvider, SharedResourceObject {
- private static final Logger LOGGER = LoggerFactory.getLogger(ConfigurableChunkerProviderImplAlt.class);
- public final static String PARAM_USE_APPROXIMATE_MATCHING = "UseApproximateMatching";
- public final static String PARAM_CASE_SENSITIVE = "CaseSensitive";
- public final static String PARAM_MAKE_VARIANTS = "MakeVariants";
- public final static String PARAM_STOPWORD_FILE = "StopWordFile";
- /**
- * Parameter to indicate whether text - dictionary entries for this class - should be normalized by completely
- * removing dashes, parenthesis, genitive 's and perhaps more. This is meant to replace the generation of term
- * variants and cannot be used together with variation generation. If this is switched on here, it must also be
- * switched on in the descriptor for the annotator itself!
- */
- public final static String PARAM_NORMALIZE_TEXT = "NormalizeText";
- /**
- * Parameter to indicate whether text - dictionary entries for this class - should be transliterated, i.e. whether
- * accents and other character variations should be stripped. If this is switched on here, it must also be switched
- * on in the descriptor of the annotator itself!
- */
- public final static String PARAM_TRANSLITERATE_TEXT = "TransliterateText";
-
- private Boolean generateVariants;
- private Boolean caseSensitive;
- private Boolean useApproximateMatching;
- private Boolean transliterate;
- private Boolean normalize;
- private InputStream dictFile;
- private InputStream stopFile;
-
- private AbstractDictionary dict;
- private Chunker dictChunker = null;
- private final double CHUNK_SCORE = 1.0;
-
- private final int MIN_TERM_LENGTH = 3;
- private final double APPROX_MATCH_THRESHOLD_SCORE = 100;
- private Set stopWords = new HashSet();
- private String stopwordFilePath;
+ public final static String PARAM_USE_APPROXIMATE_MATCHING = "UseApproximateMatching";
+ public final static String PARAM_CASE_SENSITIVE = "CaseSensitive";
+ public final static String PARAM_MAKE_VARIANTS = "MakeVariants";
+ public final static String PARAM_STOPWORD_FILE = "StopWordFile";
+ /**
+ * Parameter to indicate whether text - dictionary entries for this class - should be normalized by completely
+ * removing dashes, parenthesis, genitive 's and perhaps more. This is meant to replace the generation of term
+ * variants and cannot be used together with variation generation. If this is switched on here, it must also be
+ * switched on in the descriptor for the annotator itself!
+ */
+ public final static String PARAM_NORMALIZE_TEXT = "NormalizeText";
+ /**
+ * Only in effect when {@link #PARAM_NORMALIZE_TEXT} is set to true. If so, will normalize plurals
+ * found in the text by removing the training 's'. Requires annotations of the type {@link de.julielab.jcore.types.PennBioIEPOSTag}
+ * to be present in the CAS.
+ */
+ public static final String PARAM_NORMALIZE_PLURAL = "NormalizePlural";
+ /**
+ * Parameter to indicate whether text - dictionary entries for this class - should be transliterated, i.e. whether
+ * accents and other character variations should be stripped. If this is switched on here, it must also be switched
+ * on in the descriptor of the annotator itself!
+ */
+ public final static String PARAM_TRANSLITERATE_TEXT = "TransliterateText";
+ private static final Logger LOGGER = LoggerFactory.getLogger(ConfigurableChunkerProviderImplAlt.class);
+ private final double CHUNK_SCORE = 1.0;
+ private final int MIN_TERM_LENGTH = 3;
+ private final double APPROX_MATCH_THRESHOLD_SCORE = 100;
+ private Boolean generateVariants;
+ private Boolean caseSensitive;
+ private Boolean useApproximateMatching;
+ private Boolean transliterate;
+ private Boolean normalize;
+ private Boolean normalizePlural;
+ private InputStream dictFile;
+ private InputStream stopFile;
+ private AbstractDictionary dict;
+ private Chunker dictChunker = null;
+ private Set stopWords = new HashSet();
+ private String stopwordFilePath;
private URI resourceUri;
public Chunker getChunker() {
- return dictChunker;
- }
+ return dictChunker;
+ }
- public void load(DataResource resource) throws ResourceInitializationException {
+ public void load(DataResource resource) throws ResourceInitializationException {
resourceUri = resource.getUri();
LOGGER.info("Creating dictionary chunker with dictionary loaded from " + resourceUri);
- ConfigurationParameterSettings settings = resource.getMetaData().getConfigurationParameterSettings();
- stopwordFilePath = (String) settings.getParameterValue(PARAM_STOPWORD_FILE);
- if (stopwordFilePath == null)
- throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT,
- new Object[] { PARAM_STOPWORD_FILE });
+ ConfigurationParameterSettings settings = resource.getMetaData().getConfigurationParameterSettings();
+ stopwordFilePath = (String) settings.getParameterValue(PARAM_STOPWORD_FILE);
+ if (stopwordFilePath == null)
+ throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT,
+ new Object[]{PARAM_STOPWORD_FILE});
- generateVariants = (Boolean) settings.getParameterValue(PARAM_MAKE_VARIANTS);
- LOGGER.info("Generate variants: {}", generateVariants);
+ generateVariants = (Boolean) settings.getParameterValue(PARAM_MAKE_VARIANTS);
+ LOGGER.info("Generate variants: {}", generateVariants);
- normalize = (Boolean) settings.getParameterValue(PARAM_NORMALIZE_TEXT);
- LOGGER.info("Normalize dictionary entries (i.e. completely strip dashes, parenthesis etc): {}", normalize);
+ normalize = (Boolean) settings.getParameterValue(PARAM_NORMALIZE_TEXT);
+ LOGGER.info("Normalize dictionary entries (i.e. completely strip dashes, parenthesis etc): {}", normalize);
+ normalizePlural = Optional.ofNullable((Boolean) settings.getParameterValue(PARAM_NORMALIZE_PLURAL)).orElse(false) && normalize;
+ if (normalize)
+ LOGGER.info("Also normalize plural forms to singular: {}", normalizePlural);
- transliterate = (Boolean) settings.getParameterValue(PARAM_TRANSLITERATE_TEXT);
- LOGGER.info("Transliterate dictionary entries (i.e. transform accented characters to their base forms): {}",
- transliterate);
+ transliterate = (Boolean) settings.getParameterValue(PARAM_TRANSLITERATE_TEXT);
+ LOGGER.info("Transliterate dictionary entries (i.e. transform accented characters to their base forms): {}",
+ transliterate);
- caseSensitive = (Boolean) settings.getParameterValue(PARAM_CASE_SENSITIVE);
- LOGGER.info("Case sensitive: {}", caseSensitive);
+ caseSensitive = (Boolean) settings.getParameterValue(PARAM_CASE_SENSITIVE);
+ LOGGER.info("Case sensitive: {}", caseSensitive);
- useApproximateMatching = (Boolean) settings.getParameterValue(PARAM_USE_APPROXIMATE_MATCHING);
- LOGGER.info("Use approximate matching: {}", useApproximateMatching);
+ useApproximateMatching = (Boolean) settings.getParameterValue(PARAM_USE_APPROXIMATE_MATCHING);
+ LOGGER.info("Use approximate matching: {}", useApproximateMatching);
- if (normalize && generateVariants)
- throw new ResourceInitializationException(
- new IllegalStateException(
- "MakeVariants and NormalizeText are both activated which is invalid. The two options work towards the same goal in two different ways, i.e. to recognize dictionary entry variants not given explicitly. However, the approaches are not compatible and you have to choose a single one."));
+ if (normalize && generateVariants)
+ throw new ResourceInitializationException(
+ new IllegalStateException(
+ "MakeVariants and NormalizeText are both activated which is invalid. The two options work towards the same goal in two different ways, i.e. to recognize dictionary entry variants not given explicitly. However, the approaches are not compatible and you have to choose a single one."));
- try {
+ try {
try {
dictFile = UriUtilities.getInputStreamFromUri(resource.getUri());
} catch (Exception e) {
LOGGER.error("Could not load the dictionary from {}, see the following exception for details.", resource.getUri());
throw e;
}
- stopFile = readStreamFromFileSystemOrClassPath(stopwordFilePath);
- initStopWords(stopFile);
- readDictionary(dictFile);
-
- LOGGER.info("Now creating chunker.");
- long time = System.currentTimeMillis();
- if (useApproximateMatching) {
- final Set charsToDelete = new HashSet<>();
- charsToDelete.add('-');
- // charsToDelete.add('+');
- // charsToDelete.add(',');
- // charsToDelete.add('.');
- // charsToDelete.add(':');
- // charsToDelete.add(';');
- // charsToDelete.add('?');
- // charsToDelete.add('!');
- // charsToDelete.add('*');
- // charsToDelete.add('§');
- // charsToDelete.add('$');
- // charsToDelete.add('%');
- // charsToDelete.add('&');
- // charsToDelete.add('/');
- // charsToDelete.add('\\');
- // charsToDelete.add('(');
- // charsToDelete.add(')');
- // charsToDelete.add('<');
- // charsToDelete.add('>');
- // charsToDelete.add('[');
- // charsToDelete.add(']');
- // charsToDelete.add('=');
- // charsToDelete.add('\'');
- // charsToDelete.add('`');
- // charsToDelete.add('´');
- // charsToDelete.add('"');
- // charsToDelete.add('#');
-
- WeightedEditDistance editDistance = ApproxDictionaryChunker.TT_DISTANCE;
- editDistance = new WeightedEditDistance() {
-
- @Override
- public double deleteWeight(char cDeleted) {
- double ret;
- if (cDeleted == '-')
- ret = -5.0;
- else if (cDeleted == ' ' || charsToDelete.contains(cDeleted))
- ret = -10.0;
- else
- ret = -110.0;
- return ret;
- }
-
- @Override
- public double insertWeight(char cInserted) {
- return deleteWeight(cInserted);
- }
-
- @Override
- public double matchWeight(char cMatched) {
- return 0.0;
- }
-
- @Override
- public double substituteWeight(char cDeleted, char cInserted) {
- if (cDeleted == ' ' && cInserted == '-')
- return -2.0;
- if (cDeleted == '-' && cInserted == ' ')
- return -2.0;
- if (cDeleted == ' ' && charsToDelete.contains(cInserted))
- return -10.0;
- if (charsToDelete.contains(cDeleted) && cInserted == ' ')
- return -10.0;
- return -110.0;
- }
-
- @Override
- public double transposeWeight(char c1, char c2) {
- return Double.NEGATIVE_INFINITY;
- }
- };
-
- dictChunker =
- new ApproxDictionaryChunker((TrieDictionary) dict,
- IndoEuropeanTokenizerFactory.INSTANCE, editDistance, APPROX_MATCH_THRESHOLD_SCORE);
- } else {
- dictChunker =
- new ExactDictionaryChunker(dict, IndoEuropeanTokenizerFactory.INSTANCE, false, caseSensitive);
- }
- time = System.currentTimeMillis() - time;
- LOGGER.info("Building the actual chunker from the dictionary took {}ms ({}s).", time, time / 1000);
-
- } catch (Exception e) {
- LOGGER.error("Exception while creating chunker instance from dictionary file {} with stopwords from {}", resource.getUri(), stopwordFilePath, e);
- }
- }
-
- private void readDictionary(InputStream dictFileStream) throws IOException, AnalysisEngineProcessException {
- long time = System.currentTimeMillis();
- if (useApproximateMatching) {
- dict = new TrieDictionary();
- } else {
- dict = new MapDictionary();
- }
- // now read from file and add entries
- LOGGER.info("readDictionary() - adding entries from " + resourceUri.toString() + " to dictionary...");
- BufferedReader bf = null;
- try {
- bf = new BufferedReader(new InputStreamReader(dictFileStream));
- String line = "";
-
- Transliterator transliterator = null;
- if (transliterate)
- transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower");
-
- TokenizerFactory tokenizerFactory = null;
- if (normalize)
- tokenizerFactory = new IndoEuropeanTokenizerFactory();
- while ((line = bf.readLine()) != null) {
- String[] values = line.split("\t");
- if (values.length != 2) {
- LOGGER.error("readDictionary() - wrong format of line: " + line);
- throw new AnalysisEngineProcessException(AnalysisEngineProcessException.ANNOTATOR_EXCEPTION, null);
- }
-
- String term = values[0].trim();
-
- if (stopWords.contains(term.toLowerCase()))
- continue;
-
- if (normalize) {
- term = StringNormalizerForChunking.normalizeString(term, tokenizerFactory).string;
- }
- if (transliterate)
- term = transliterator.transform(term);
- if (useApproximateMatching && !caseSensitive && !transliterate)
- term = term.toLowerCase();
-
- String label = values[1].trim();
- if (term.length() < MIN_TERM_LENGTH)
- continue;
-
- if (generateVariants) {
- if (true)
- throw new NotImplementedException(
- "In this alternative ChunkerProvider, generating variants will currently fail to adequately filter out stop words due to the transliteration and/or normalization algorithms. If you don't need those algorithms, just stick to the original ChunkerProviderImpl. Otherwise, this issue must be fixed (shouldnt be too difficult). Variants are also currently not treated with normalization/transliteration (but this is deemed to be two alternative ways to achieve a similar thing anyway)");
- } else {
- // This is a second stop-word-check but here the term has been transliterated and/or normalized. If
- // somehow the result of this was a stop word, ignore it.
- if (!stopWords.contains(term.toLowerCase()))
- dict.addEntry(new DictionaryEntry(term, label, CHUNK_SCORE));
- }
- }
-
- time = System.currentTimeMillis() - time;
- LOGGER.info("Reading dictionary took {}ms ({}s)", time, time / 1000);
- } finally {
- if (null != bf)
- bf.close();
- }
- }
-
- private void initStopWords(InputStream stopFileStream) throws IOException {
- stopWords = new HashSet();
-
- LOGGER.info("readDictionary() - adding entries from " + stopwordFilePath + " to dictionary...");
- BufferedReader bf = new BufferedReader(new InputStreamReader(stopFileStream));
- String line = "";
-
- try {
- while ((line = bf.readLine()) != null) {
- if (line.startsWith("#")) {
- continue;
- }
- stopWords.add(line.trim().toLowerCase());
- }
- bf.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- @Override
- public Set getStopWords() {
- return stopWords;
- }
-
- @Override
- public boolean getUseApproximateMatching() {
- return useApproximateMatching;
- }
-
- @Override
- public boolean getNormalize() {
- return normalize;
- }
-
- @Override
- public boolean getTransliterate() {
- return transliterate;
- }
-
- @Override
- public boolean getCaseSensitive() {
- return caseSensitive;
-
- }
-
- private InputStream readStreamFromFileSystemOrClassPath(String filePath) {
- InputStream is = null;
- File file = new File(filePath);
- if (file.exists()) {
- try {
- is = new FileInputStream(file);
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- }
- } else {
- is = getClass().getResourceAsStream(filePath.startsWith("/") ? filePath : "/" + filePath);
- }
- if (filePath.endsWith(".gz") || filePath.endsWith(".gzip"))
- try {
- is = new GZIPInputStream(is);
- } catch (IOException e) {
- e.printStackTrace();
- }
- return is;
- }
+ stopFile = readStreamFromFileSystemOrClassPath(stopwordFilePath);
+ initStopWords(stopFile);
+ readDictionary(dictFile);
+
+ LOGGER.info("Now creating chunker.");
+ long time = System.currentTimeMillis();
+ if (useApproximateMatching) {
+ final Set charsToDelete = new HashSet<>();
+ charsToDelete.add('-');
+ // charsToDelete.add('+');
+ // charsToDelete.add(',');
+ // charsToDelete.add('.');
+ // charsToDelete.add(':');
+ // charsToDelete.add(';');
+ // charsToDelete.add('?');
+ // charsToDelete.add('!');
+ // charsToDelete.add('*');
+ // charsToDelete.add('§');
+ // charsToDelete.add('$');
+ // charsToDelete.add('%');
+ // charsToDelete.add('&');
+ // charsToDelete.add('/');
+ // charsToDelete.add('\\');
+ // charsToDelete.add('(');
+ // charsToDelete.add(')');
+ // charsToDelete.add('<');
+ // charsToDelete.add('>');
+ // charsToDelete.add('[');
+ // charsToDelete.add(']');
+ // charsToDelete.add('=');
+ // charsToDelete.add('\'');
+ // charsToDelete.add('`');
+ // charsToDelete.add('´');
+ // charsToDelete.add('"');
+ // charsToDelete.add('#');
+
+ WeightedEditDistance editDistance = ApproxDictionaryChunker.TT_DISTANCE;
+ editDistance = new WeightedEditDistance() {
+
+ @Override
+ public double deleteWeight(char cDeleted) {
+ double ret;
+ if (cDeleted == '-')
+ ret = -5.0;
+ else if (cDeleted == ' ' || charsToDelete.contains(cDeleted))
+ ret = -10.0;
+ else
+ ret = -110.0;
+ return ret;
+ }
+
+ @Override
+ public double insertWeight(char cInserted) {
+ return deleteWeight(cInserted);
+ }
+
+ @Override
+ public double matchWeight(char cMatched) {
+ return 0.0;
+ }
+
+ @Override
+ public double substituteWeight(char cDeleted, char cInserted) {
+ if (cDeleted == ' ' && cInserted == '-')
+ return -2.0;
+ if (cDeleted == '-' && cInserted == ' ')
+ return -2.0;
+ if (cDeleted == ' ' && charsToDelete.contains(cInserted))
+ return -10.0;
+ if (charsToDelete.contains(cDeleted) && cInserted == ' ')
+ return -10.0;
+ return -110.0;
+ }
+
+ @Override
+ public double transposeWeight(char c1, char c2) {
+ return Double.NEGATIVE_INFINITY;
+ }
+ };
+
+ dictChunker =
+ new ApproxDictionaryChunker((TrieDictionary) dict,
+ IndoEuropeanTokenizerFactory.INSTANCE, editDistance, APPROX_MATCH_THRESHOLD_SCORE);
+ } else {
+ dictChunker =
+ new ExactDictionaryChunker(dict, IndoEuropeanTokenizerFactory.INSTANCE, false, caseSensitive);
+ }
+ time = System.currentTimeMillis() - time;
+ LOGGER.info("Building the actual chunker from the dictionary took {}ms ({}s).", time, time / 1000);
+
+ } catch (Exception e) {
+ LOGGER.error("Exception while creating chunker instance from dictionary file {} with stopwords from {}", resource.getUri(), stopwordFilePath, e);
+ }
+ }
+
+ private void readDictionary(InputStream dictFileStream) throws IOException, AnalysisEngineProcessException {
+ long time = System.currentTimeMillis();
+ if (useApproximateMatching) {
+ dict = new TrieDictionary();
+ } else {
+ dict = new MapDictionary();
+ }
+ // now read from file and add entries
+ LOGGER.info("readDictionary() - adding entries from " + resourceUri.toString() + " to dictionary...");
+ BufferedReader bf = null;
+ try {
+ bf = new BufferedReader(new InputStreamReader(dictFileStream));
+ String line = "";
+
+ Transliterator transliterator = null;
+// transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower");
+ transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC");
+
+ TokenizerFactory tokenizerFactory = null;
+ if (normalize)
+ tokenizerFactory = new IndoEuropeanTokenizerFactory();
+ while ((line = bf.readLine()) != null) {
+ if (line.startsWith("#"))
+ continue;
+ String[] values = line.split("\t");
+ if (values.length != 2) {
+ LOGGER.error("readDictionary() - wrong format of line: " + line);
+ throw new AnalysisEngineProcessException(AnalysisEngineProcessException.ANNOTATOR_EXCEPTION, null);
+ }
+
+ String term = values[0].trim();
+
+ if (stopWords.contains(term.toLowerCase()))
+ continue;
+
+ if (normalize) {
+ term = StringNormalizerForChunking.normalizeString(term, tokenizerFactory, transliterator).string;
+ }
+ if (transliterate)
+ term = transliterator.transform(term);
+ // the exact matcher takes the caseSensitive switch as a parameter, we don't need to do it ourselves
+ if (useApproximateMatching && !caseSensitive)
+ term = term.toLowerCase();
+
+ String label = values[1].trim();
+ if (term.length() < MIN_TERM_LENGTH)
+ continue;
+
+ if (generateVariants) {
+ if (true)
+ throw new NotImplementedException(
+ "In this alternative ChunkerProvider, generating variants will currently fail to adequately filter out stop words due to the transliteration and/or normalization algorithms. If you don't need those algorithms, just stick to the original ChunkerProviderImpl. Otherwise, this issue must be fixed (shouldnt be too difficult). Variants are also currently not treated with normalization/transliteration (but this is deemed to be two alternative ways to achieve a similar thing anyway)");
+ } else {
+ // This is a second stop-word-check but here the term has been transliterated and/or normalized. If
+ // somehow the result of this was a stop word, ignore it.
+ if (!stopWords.contains(term.toLowerCase()))
+ dict.addEntry(new DictionaryEntry(term, label, CHUNK_SCORE));
+ }
+ }
+
+ time = System.currentTimeMillis() - time;
+ LOGGER.info("Reading dictionary took {}ms ({}s)", time, time / 1000);
+ } finally {
+ if (null != bf)
+ bf.close();
+ }
+ }
+
+ private void initStopWords(InputStream stopFileStream) throws IOException {
+ stopWords = new HashSet();
+
+ LOGGER.info("readDictionary() - adding entries from " + stopwordFilePath + " to dictionary...");
+ BufferedReader bf = new BufferedReader(new InputStreamReader(stopFileStream));
+ String line = "";
+
+ try {
+ while ((line = bf.readLine()) != null) {
+ if (line.startsWith("#")) {
+ continue;
+ }
+ stopWords.add(line.trim().toLowerCase());
+ }
+ bf.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ @Override
+ public Set getStopWords() {
+ return stopWords;
+ }
+
+ @Override
+ public boolean getUseApproximateMatching() {
+ return useApproximateMatching;
+ }
+
+ @Override
+ public boolean getNormalize() {
+ return normalize;
+ }
+
+ @Override
+ public boolean getNormalizePlural() {
+ return normalizePlural;
+ }
+
+ @Override
+ public boolean getTransliterate() {
+ return transliterate;
+ }
+
+ @Override
+ public boolean getCaseSensitive() {
+ return caseSensitive;
+
+ }
+
+ private InputStream readStreamFromFileSystemOrClassPath(String filePath) throws FileNotFoundException {
+ InputStream is = null;
+ File file = new File(filePath);
+ if (file.exists()) {
+ try {
+ is = new FileInputStream(file);
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ }
+ } else {
+ is = getClass().getResourceAsStream(filePath.startsWith("/") ? filePath : "/" + filePath);
+ }
+ if (filePath.endsWith(".gz") || filePath.endsWith(".gzip"))
+ try {
+ is = new GZIPInputStream(is);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ if (is == null)
+ throw new FileNotFoundException("Could not read contents from " + filePath);
+ return is;
+ }
}
diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java
index 6ddd3b58a..b2a534d9f 100644
--- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java
+++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.java
@@ -1,21 +1,20 @@
-/**
- *
+/**
* Copyright (c) 2015, JULIE Lab.
- *
+ *
* A entity tagger based on a dictionary lookup. Lingpipe's gazetteer is used.
- *
- * There are two modes: exact matching (only terms which map exactly to
- * those specified in dictionary are found). Approximate matching (by means of
- * weighted levenstein distance, approximate matches are found.)
- *
- * As approximate matching results in concurring matches on overlapping spans, I
+ *
+ * There are two modes: exact matching (only terms which map exactly to
+ * those specified in dictionary are found). Approximate matching (by means of
+ * weighted levenstein distance, approximate matches are found.)
+ *
+ * As approximate matching results in concurring matches on overlapping spans, I
* added a mechanism to resolve this according to this rules: in overlapping matches
- * the one with the best (here: lowest) score is taken, if more than one chunk has the
+ * the one with the best (here: lowest) score is taken, if more than one chunk has the
* same score, the one with the longest span is chosen.
**/
package de.julielab.jcore.ae.lingpipegazetteer.uima;
@@ -26,6 +25,7 @@
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;
import com.ibm.icu.text.Transliterator;
+import de.julielab.java.utilities.spanutils.OffsetSet;
import de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider;
import de.julielab.jcore.ae.lingpipegazetteer.chunking.OverlappingChunk;
import de.julielab.jcore.ae.lingpipegazetteer.utils.StringNormalizerForChunking;
@@ -33,12 +33,14 @@
import de.julielab.jcore.types.Abbreviation;
import de.julielab.jcore.types.AbbreviationLongform;
import de.julielab.jcore.types.ConceptMention;
+import de.julielab.jcore.types.PennBioIEPOSTag;
import de.julielab.jcore.types.mantra.Entity;
import de.julielab.jcore.utility.JCoReAnnotationTools;
import de.julielab.jcore.utility.index.IndexTermGenerator;
import de.julielab.jcore.utility.index.JCoReHashMapAnnotationIndex;
import de.julielab.jcore.utility.index.TermGenerators;
import de.julielab.jcore.utility.index.TermGenerators.LongOffsetIndexTermGenerator;
+import org.apache.commons.lang3.Range;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
@@ -54,705 +56,677 @@
import org.slf4j.LoggerFactory;
import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
public class GazetteerAnnotator extends JCasAnnotator_ImplBase {
- private static final String COMPONENT_ID = GazetteerAnnotator.class.getCanonicalName();
- private static final Logger LOGGER = LoggerFactory.getLogger(GazetteerAnnotator.class);
- public static final String CHUNKER_RESOURCE_NAME = "DictionaryChunkerProvider";
- // public final static String PARAM_USE_APPROXIMATE_MATCHING =
- // "UseApproximateMatching";
- public final static String PARAM_CHECK_ACRONYMS = "CheckAcronyms";
- public final static String PARAM_OUTPUT_TYPE = "OutputType";
- /**
- * Only required to set to false as an annotator parameter when using
- * approximate matching and the ChunkerProvider is set to CaseSensitive false.
- * That is because the approximate chunker is always case sensitive.
- */
- // public final static String PARAM_CASE_SENSITIVE = "CaseSensitive";
- private static final String PARAM_USE_MANTRA_MODE = "MantraMode";
- /**
- * Parameter to indicate whether text - CAS document text for this class -
- * should be normalized by completely removing dashes, parenthesis, genitive 's
- * and perhaps more. This is meant to replace the generation of term variants
- * and cannot be used together with variation generation. If this is switched on
- * here, it must also be switched on in the external resource configuration for
- * the ChunkerProvider! Can only be used with alternative ChunkerProviderImplAlt
- * implementation.
- */
- // public final static String PARAM_NORMALIZE_TEXT = "NormalizeText";
- /**
- * Parameter to indicate whether text - CAS document text for this class -
- * should be transliterated, i.e. whether accents and other character variations
- * should be stripped. If this is switched on here, it must also be switched on
- * in the external resource configuration for the ChunkerProvider! Can only be
- * used with alternative ChunkerProviderImplAlt implementation.
- */
- // public final static String PARAM_TRANSLITERATE_TEXT =
- // "TransliterateText";
-
- @ConfigurationParameter(name = PARAM_USE_MANTRA_MODE, defaultValue = "false")
- private boolean mantraMode = false;
-
- // needs to be true because of chunker injection:
- @ConfigurationParameter(name = PARAM_CHECK_ACRONYMS, defaultValue = "true")
- private boolean checkAcronyms = true;
- @ConfigurationParameter(name = PARAM_OUTPUT_TYPE)
- private String outputType = null;
-
- @ExternalResource(key = CHUNKER_RESOURCE_NAME, mandatory = true)
- private ChunkerProvider provider;
- /**
- * Removes diacritics and does lower casing
- */
- private Transliterator transliterator;
- private Chunker gazetteer = null;
- private TokenizerFactory normalizationTokenFactory;
- private Set stopWords;
-
- // TODO for debug only
- private static int initializeCount = 0;
-
- public void initialize(UimaContext aContext) throws ResourceInitializationException {
- LOGGER.info("calls to initialize: " + initializeCount);
-
- super.initialize(aContext);
- LOGGER.info("initialize() - initializing GazetteerAnnotator...");
-
- try {
- provider = (ChunkerProvider) getContext().getResourceObject(CHUNKER_RESOURCE_NAME);
- gazetteer = provider.getChunker();
-// stopWords = provider.getStopWords();
- String[] stopwordArray = { "a", "about", "above", "across", "after", "afterwards", "again", "against",
- "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among",
- "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything",
- "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become",
- "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside",
- "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot",
- "cant", "co", "computer", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do",
- "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere",
- "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except",
- "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly",
- "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has",
- "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
- "herself", "high", "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in",
- "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
- "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill",
- "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name",
- "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
- "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only",
- "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own",
- "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed",
- "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere",
- "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes",
- "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them",
- "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein",
- "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though", "three",
- "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards",
- "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we",
- "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas",
- "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who",
- "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet",
- "you", "your", "yours", "yourself", "yourselves", };
- stopWords = new HashSet<>();
- for (String sw : stopwordArray)
- stopWords.add(sw);
- } catch (ResourceAccessException e) {
- LOGGER.error("Exception while initializing", e);
- }
-
- // check acronyms
- checkAcronyms = (Boolean) aContext.getConfigParameterValue(PARAM_CHECK_ACRONYMS);
- LOGGER.info(
- "Check for acronyms (found dictionary entries that are abbreviations are only accepted if their long form is an abbreviation of the same type, too): {}",
- checkAcronyms);
- // filter stop words
-
- Boolean normalizeBoolean = provider.getNormalize();// (Boolean)
- // aContext.getConfigParameterValue(PARAM_NORMALIZE_TEXT);
- if (normalizeBoolean) {
- normalizationTokenFactory = new IndoEuropeanTokenizerFactory();
- }
- LOGGER.info("Normalize CAS document text (i.e. do stemming and remove possessive 's): {}", provider.getNormalize());
-
- Boolean transliterateBoolean = provider.getTransliterate();// (Boolean)
- // aContext.getConfigParameterValue(PARAM_TRANSLITERATE_TEXT);
- if (transliterateBoolean) {
- transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower");
- }
- LOGGER.info("Transliterate CAS document text (i.e. transform accented characters to their base forms): {}",
- provider.getTransliterate());
-
- // define output level
- outputType = (String) aContext.getConfigParameterValue(PARAM_OUTPUT_TYPE);
- if (outputType == null) {
- LOGGER.error("initialize() - output type not specified.");
- throw new ResourceInitializationException();
- }
-
- mantraMode = aContext.getConfigParameterValue(PARAM_USE_MANTRA_MODE) != null
- ? (Boolean) aContext.getConfigParameterValue(PARAM_USE_MANTRA_MODE)
- : false;
- }
-
- /**
- * process the CAS, there are two subroutines: one for exact and one for
- * approximate matching.
- */
- public void process(JCas aJCas) throws AnalysisEngineProcessException {
- if (gazetteer == null)
- throw new IllegalStateException("The actual gazetteer object is null. Check previous log messages pointing to the error (most probably the dictionary file could not be found).");
- String docText = aJCas.getDocumentText();
- if (docText == null || docText.length() == 0)
- return;
- if (provider.getUseApproximateMatching() && !provider.getTransliterate() && !provider.getCaseSensitive())
- docText = docText.toLowerCase();
- NormalizedString normalizedDocText = null;
- if (provider.getNormalize()) {
- normalizedDocText = StringNormalizerForChunking.normalizeString(docText, normalizationTokenFactory,
- transliterator);
- }
-
- IndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator();
- JCoReHashMapAnnotationIndex conceptMentionIndex = new JCoReHashMapAnnotationIndex<>(
- longOffsetTermGenerator, longOffsetTermGenerator, aJCas, ConceptMention.type);
- JCoReHashMapAnnotationIndex abbreviationIndex = new JCoReHashMapAnnotationIndex<>(
- longOffsetTermGenerator, longOffsetTermGenerator, aJCas, Abbreviation.type);
-
- LOGGER.debug("Performing actual Gazetteer annotation...");
- Chunking chunking;
- if (provider.getNormalize())
- chunking = gazetteer.chunk(normalizedDocText.string);
- else
- chunking = gazetteer.chunk(docText);
- LOGGER.debug("Gazetteer annotation done.");
- if (provider.getUseApproximateMatching()) {
- /*
- * handle matches found by approx matching: this means especially overlapping
- * matches with different scores (doesn't happen with exact matches)
- */
- List chunkList = filterChunking(chunking);
- List overlappingChunks = groupOverlappingChunks(chunkList,
- chunking.charSequence().toString());
- // now add the best chunk of all overlappingChunks to the CAS
- LOGGER.debug("all overlapping chunks:\n");
- // Set bestChunksSet = new HashSet<>();
- for (OverlappingChunk overlappingChunk : overlappingChunks) {
- // show chunks
- LOGGER.debug(overlappingChunk.toStringAll());
- List bestChunks = overlappingChunk.getBestChunks();
- LOGGER.debug("Found {} best chunks.", bestChunks.size());
- for (int i = 0; i < bestChunks.size(); i++) {
- Chunk bestChunk = bestChunks.get(i);
- LOGGER.debug("Nr. " + i + " best chunk: " + bestChunk.start() + " - " + bestChunk.end() + ": "
- + bestChunk.score() + " ; type: " + bestChunk.type());
- // TODO this check and the corresponding set may be removed
- // when this exception hasn't been thrown
- // in a
- // while. Its currently just to be sure, this should not
- // happen any more since the chunks are sorted
- // by
- // offset in the grouping method.
- // if (bestChunksSet.contains(bestChunk)) {
- // throw new IllegalStateException("Duplicate best chunk: " + bestChunk);
- // }
- // bestChunksSet.add(bestChunk);
- // add 2 cas
- add2Cas(aJCas, bestChunk, normalizedDocText, conceptMentionIndex, abbreviationIndex);
- }
- }
- // for (Chunk chunk : chunking.chunkSet()) {
- // add2Cas(aJCas, chunk, normalizedDocText);
- // }
- } else {
- for (Chunk chunk : chunking.chunkSet()) {
- add2Cas(aJCas, chunk, normalizedDocText, conceptMentionIndex, abbreviationIndex);
- }
- }
- if (checkAcronyms && !mantraMode) {
- LOGGER.debug("process() - checking acronyms");
- annotateAcronymsWithFullFormEntity(aJCas, conceptMentionIndex);
- }
- }
-
- private List filterChunking(Chunking chunking) {
- // ChunkingImpl newChunking = new ChunkingImpl(chunking.charSequence());
- List newChunking = new ArrayList<>(chunking.chunkSet().size());
- for (Chunk chunk : chunking.chunkSet()) {
- String chunkText = chunking.charSequence().subSequence(chunk.start(), chunk.end()).toString();
- if (filterParenthesis(chunkText))
- continue;
- if (filterPunctuationArtifacts(chunkText))
- continue;
- if (filterStopwords(chunkText))
- continue;
- newChunking.add(chunk);
- }
- return newChunking;
- }
-
- private boolean filterPunctuationArtifacts(String chunkText) {
- if (chunkText.startsWith("-"))
- return true;
- if (chunkText.endsWith("-"))
- return true;
- return false;
- }
-
- private boolean filterStopwords(String chunkText) {
- if (stopWords.contains(chunkText.toLowerCase()))
- return true;
- if (chunkText.contains(" ")) {
- String[] words = chunkText.split(" ");
- int stopWordCounter = 0;
- for (String word : words) {
- if (stopWords.contains(word.toLowerCase()))
- stopWordCounter++;
- }
- if (Math.ceil(words.length / 2.0) <= stopWordCounter) {
- LOGGER.debug("Filtering due to high stop word occurrences: {}", chunkText);
- return true;
- }
- }
- return false;
- }
-
- static boolean filterParenthesis(String chunkText) {
- Stack parenthesisStack = new Stack<>();
- // Map pMap = new HashMap<>();
- for (int i = 0; i < chunkText.length(); i++) {
- char current = chunkText.charAt(i);
- if (isParentheses(current)) {
- if (isOpenedParentheses(current)) {
- parenthesisStack.add(current);
- } else {
- if (parenthesisStack.isEmpty())
- return true;
- if (!isParenthesisCounterpart(parenthesisStack.pop(), current))
- return true;
- }
- }
- }
- if (!parenthesisStack.isEmpty())
- return true;
- return false;
- }
-
- private static boolean isParenthesisCounterpart(Character char1, Character char2) {
- ParenthesisType char1ParenthesisType = getParenthesisType(char2);
- ParenthesisType char2ParenthesisType = getParenthesisType(char1);
- if (char1ParenthesisType == ParenthesisType.NONE || char2ParenthesisType == ParenthesisType.NONE)
- throw new IllegalArgumentException("The two characters '" + char1 + "' and '" + char2
- + "' were given in order to determine whether they are compatible parenthesis counterparts, but at least one of those characters is no parentheses.");
- return char1ParenthesisType.equals(char2ParenthesisType);
- }
-
- // enum ParenthesesType {
- // ROUND_CLOSED {
- // @Override
- // boolean isOpen() {
- // return false;
- // }
- //
- // },
- // BRACKET_CLOSED {
- // @Override
- // boolean isOpen() {
- // return false;
- // }
- // },
- // CURLY_CLOSED {
- // @Override
- // boolean isOpen() {
- // return false;
- // }
- //
- // },
- // ROUND_OPENED {
- // @Override
- // boolean isOpen() {
- // return true;
- // }
- // },
- // BRACKET_OPENED {
- // @Override
- // boolean isOpen() {
- // return true;
- // }
- // },
- // CURLY_OPENED {
- // @Override
- // boolean isOpen() {
- // return true;
- // }
- // };
- // abstract boolean isOpen();
- //
- // boolean isClose() {
- // return !isOpen();
- // };
- // }
-
- enum ParenthesisType {
- ROUND, BRACKET, CURLY, NONE
- }
-
- static ParenthesisType getParenthesisType(char current) {
- switch (current) {
- case '(':
- case ')':
- return ParenthesisType.ROUND;
- case '[':
- case ']':
- return ParenthesisType.BRACKET;
- case '{':
- case '}':
- return ParenthesisType.CURLY;
- default:
- return ParenthesisType.NONE;
- }
- }
-
- static boolean isParentheses(char current) {
- return isOpenedParentheses(current) || isClosedParentheses(current);
- }
-
- static boolean isOpenedParentheses(char current) {
- switch (current) {
- case '(':
- case '[':
- case '{':
- return true;
- default:
- return false;
- }
- }
-
- static boolean isClosedParentheses(char current) {
- switch (current) {
- case ')':
- case ']':
- case '}':
- return true;
- default:
- return false;
- }
- }
-
- static List groupOverlappingChunks(List chunkList, String chunkedText) {
- // sort chunkList so the grouping works as intended
- Collections.sort(chunkList, new Comparator() {
-
- @Override
- public int compare(Chunk o1, Chunk o2) {
- return o1.start() - o2.start();
- }
-
- });
- // group overlapping chunks
- List overlappingChunks = new ArrayList();
- for (Chunk chunk : chunkList) {
- // for debugging
- // System.out.println("chunking.add(ChunkFactory.createChunk(" +
- // chunk.start() + ", " + chunk.end() +
- // ", 0d));");
- boolean added = false;
- for (OverlappingChunk over : overlappingChunks) {
- if (over.isOverlappingSpan(chunk.start(), chunk.end())) {
- over.addChunk(chunk.start(), chunk.end(), chunk);
- added = true;
- }
- }
- if (!added) {
- overlappingChunks.add(new OverlappingChunk(chunk.start(), chunk.end(), chunk, chunkedText));
- added = true;
- }
- }
- return overlappingChunks;
- }
-
- // ------------ INFO ..........
- // String text = aJCas.getDocumentText();
- // int start = chunk.start();
- // int end = chunk.end();
- // String type = chunk.type();
- // double score = chunk.score();
- // String phrase = text.substring(start, end);
- // System.out.println(" found phrase=|" + phrase + "|"
- // + " start=" + start + " end=" + end + " type=" + type
- // + " score=" + score);
- // ------------ INFO ..........
- /**
- * checks whether a chunk (= dictionary match) is an acronym. If yes, checks
- * whether respective full form (obtained via abbr textReference) is
- * ConceptMention and has same specificType as chunk If these conditions are not
- * fulfilled, no entity annotation will be made.
- *
- * @param abbreviationIndex
- * @param conceptMentionIndex
- */
- private boolean isAcronymWithSameFullFormSpecificType(JCas aJCas, Chunk chunk, NormalizedString normalizedDocText,
- JCoReHashMapAnnotationIndex conceptMentionIndex,
- JCoReHashMapAnnotationIndex abbreviationIndex) {
- // Annotation anno;
- int start;
- int end;
- if (provider.getNormalize()) {
- try {
- start = normalizedDocText.getOriginalOffset(chunk.start());
- end = normalizedDocText.getOriginalOffset(chunk.end());
- } catch (Exception e) {
- System.out.println("Text: " + normalizedDocText);
- System.out.println("Chunk: " + chunk);
- System.out.println("Chunk end: " + chunk.end());
- System.out
- .println("Normalized Text: " + normalizedDocText.string.substring(chunk.start(), chunk.end()));
- throw e;
- }
- // anno = new Annotation(aJCas, start, end);
- } else {
- start = chunk.start();
- end = chunk.end();
- }
-
- LongOffsetIndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator();
- // Retrieves potential abbr annotation
- Abbreviation abbr = abbreviationIndex.getFirst(longOffsetTermGenerator.forOffsets(start, end));
- // check whether it's an abbr
- String chunktext = null;
- if (LOGGER.isDebugEnabled())
- chunktext = aJCas.getDocumentText().substring(start, end);
- if (abbr == null) {
- LOGGER.debug("{} chunk \"{}\" is not an abbreviation\n", chunk, chunktext);
- return true;
- }
- // checks whether respective full form is ConceptMention
- AbbreviationLongform textRef = abbr.getTextReference();
- ConceptMention em = conceptMentionIndex.getFirst(textRef);
- if (em == null) {
- LOGGER.debug(
- chunk + " chunk \"{}\" is an abbreviation but respective full \"{}\" form is no ConceptMention\n",
- chunktext, textRef.getCoveredText());
- return false;
- }
-
- // checks whether full form annotation matches the type to be annotated
- // here
- String emType = em.getClass().getCanonicalName();
- if (emType.equals(outputType)) {
- LOGGER.debug(chunk
- + " chunk \"{}\" is an abbreviation and respective full form \"{}\" is ConceptMention with same type as OutputType\n",
- chunktext, em.getCoveredText());
- return true;
- }
-
- LOGGER.debug(chunk
- + " chunk \"{}\" is an abbreviation but respective full form \"{}\" is ConceptMention without the correct OutputType (is: {}; OutputType: {})\n",
- new Object[] { chunktext, em.getCoveredText(), emType, outputType });
- return false;
- }
-
- /**
- * adds a chunk as an annotation to the CAS
- *
- * @param normalizedDocText
- * @param abbreviationIndex
- * @param conceptMentionIndex
- */
- private void add2Cas(JCas aJCas, Chunk chunk, NormalizedString normalizedDocText,
- JCoReHashMapAnnotationIndex conceptMentionIndex,
- JCoReHashMapAnnotationIndex abbreviationIndex) throws AnalysisEngineProcessException {
- // System.out.println("CHUNK: start=" + chunk.start() + " end=" +
- // chunk.end());
- // if checkAcronyms, then check acronyms for compliant full forms (=
- // with same specificType)
- if (checkAcronyms && !isAcronymWithSameFullFormSpecificType(aJCas, chunk, normalizedDocText,
- conceptMentionIndex, abbreviationIndex)) {
- return;
- }
-
- int start = provider.getNormalize() ? normalizedDocText.getOriginalOffset(chunk.start()) : chunk.start();
- int end = provider.getNormalize() ? normalizedDocText.getOriginalOffset(chunk.end()) : chunk.end();
-
- try {
- if (mantraMode) {
- // the "type" string is used to transport all data needed for
- // the MAN-XML format
- for (String term : chunk.type().split("@@TERM@@")) {
- // @@ is used to separate source, cui, type(s) and group (in
- // this order!)
- String[] info = term.split("@@");
- Entity newEntity = (Entity) JCoReAnnotationTools.getAnnotationByClassName(aJCas,
- "de.julielab.jcore.types.mantra.Entity");
- newEntity.setBegin(start);
- newEntity.setEnd(end);
- newEntity.setComponentId(COMPONENT_ID);
- newEntity.setConfidence(chunk.score() + "");
-
- // mantra specific
- newEntity.setSource(info[0]);
- newEntity.setCui(info[1]);
- newEntity.setSemanticType(info[2]);
- newEntity.setSemanticGroup(info[3]);
-
- newEntity.addToIndexes();
- }
- } else {
- ConceptMention newEntity = (ConceptMention) JCoReAnnotationTools.getAnnotationByClassName(aJCas,
- outputType);
- newEntity.setBegin(start);
- newEntity.setEnd(end);
-
- // String entityText = newEntity.getCoveredText();
- // if (stopWords.contains(entityText.toLowerCase()))
- // return;
- // if (entityText.contains(" ")) {
- // String[] words = entityText.split(" ");
- // int stopWordCounter = 0;
- // for (String word : words) {
- // if (stopWords.contains(word.toLowerCase()))
- // stopWordCounter++;
- // }
- // if (words.length == stopWordCounter)
- // return;
- // }
-
- newEntity.setSpecificType(chunk.type());
- newEntity.setComponentId(COMPONENT_ID);
- newEntity.setConfidence(chunk.score() + "");
- newEntity.addToIndexes();
-
- conceptMentionIndex.index(newEntity);
- }
- } catch (Exception e) {
- LOGGER.error("process() - could not generate output type: " + e.getMessage());
- e.printStackTrace();
- throw new AnalysisEngineProcessException(e);
- }
- }
-
- private void annotateAcronymsWithFullFormEntity(JCas aJCas,
- JCoReHashMapAnnotationIndex conceptMentionIndex)
- throws AnalysisEngineProcessException {
-
- JFSIndexRepository indexes = aJCas.getJFSIndexRepository();
- FSIterator abbrevIter = indexes.getAnnotationIndex(Abbreviation.type).iterator();
- IndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator();
-
- // loop over all abbreviations
- while (abbrevIter.hasNext()) {
- Abbreviation abbrev = (Abbreviation) abbrevIter.next();
- AbbreviationLongform fullFormAnnotation = abbrev.getTextReference();
- LOGGER.debug("annotateAcronymsWithFullFormEntity() - checking abbreviation: " + abbrev.getCoveredText());
- ConceptMention emFullform = null;// AnnotationRetrieval.getMatchingAnnotation(aJCas, fullFormAnnotation,
- // ConceptMention.class);
- emFullform = conceptMentionIndex.getFirst(fullFormAnnotation);
-
- // The following code was once introduced for gene tagging. There,
- // the acronym fullforms sometimes miss minor parts of an annotated
- // gene, leading to non-annotated acronyms that would have been
- // correct.
- // However, for general-purpose concept recognition this approach
- // can be quite harmful. Example: "Anaphase-promoting complex (APC)"
- // where only "anaphase" is recognized as concept. Now, "APC" would
- // be annotated as an acronym for "anaphase". Here, a better
- // recognition of the abbreviation span is required.
- // ConceptMention emFullform = null;
- // List conceptsInFullform =
- // JCoReAnnotationTools.getIncludedAnnotations(aJCas,
- // fullFormAnnotation,
- // ConceptMention.class);
- // if (conceptsInFullform.size() == 1) {
- // emFullform = conceptsInFullform.get(0);
- // LOGGER.debug("Found a single ConceptMention included in the full
- // form: {}", emFullform.getCoveredText());
- // } else if (conceptsInFullform.size() > 1) {
- // // If there are multiple ConceptMentions found in the full form,
- // take that largest right-most candidate.
- // int maxSize = -1;
- // for (ConceptMention em : conceptsInFullform) {
- // int emSize = em.getEnd() - em.getBegin();
- // if (emSize > maxSize) {
- // emFullform = em;
- // maxSize = emSize;
- // }
- // }
- // LOGGER.debug("Found multiple ConceptMentions included in the full
- // form \"{}\", returning the longest.",
- // fullFormAnnotation.getCoveredText());
- // if (LOGGER.isTraceEnabled()) {
- // LOGGER.trace("All found ConceptMentions:");
- // for (ConceptMention cm : conceptsInFullform) {
- // LOGGER.trace("Text: {}; offsets: {}-{}",
- // new Object[] { cm.getCoveredText(), cm.getBegin(), cm.getEnd()
- // });
- // }
- // }
- // } else {
- // LOGGER.debug("No ConceptMention in the span of acronym fullform
- // \"{}\" found.",
- // fullFormAnnotation.getCoveredText());
- // }
-
- String type = null;
- if (emFullform != null)
- type = emFullform.getClass().getCanonicalName();
-
- ConceptMention emAcronym = null;// AnnotationRetrieval.getMatchingAnnotation(aJCas, abbrev,
- // ConceptMention.class);
- emAcronym = conceptMentionIndex.getFirst(abbrev);
- // This is really slow, really a pain with full texts.
- // It was originally introduced to push recall for gene recognition.
- // So now we will lose (a bit) of recognition performance there.
- // ConceptMention emAcronym =
- // JCoReAnnotationTools.getPartiallyOverlappingAnnotation(aJCas,
- // abbrev,
- // ConceptMention.class);
-
- // if type of the entity is equal to the output type for this
- // annotator
- if (type != null && type.equals(outputType)) {
- if (emFullform == null) {
- LOGGER.debug(
- "annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no ConceptMention\n");
- continue;
- }
- if (emFullform.getComponentId() != null && emFullform.getComponentId().equals(COMPONENT_ID)
- && (emAcronym == null
- || !emAcronym.getClass().getName().equals(emFullform.getClass().getName()))) {
-
- try {
- LOGGER.debug("annotateAcronymsWithFullFormEntity() - fullform of abbreviation ("
- + abbrev.getCoveredText() + " [begin=" + abbrev.getBegin() + "; end=" + abbrev.getEnd()
- + "]) has ConceptMention: " + emFullform.toString());
- ConceptMention newEntityOnAcronym = (ConceptMention) JCoReAnnotationTools
- .getAnnotationByClassName(aJCas, outputType);
- newEntityOnAcronym.setBegin(abbrev.getBegin());
- newEntityOnAcronym.setEnd(abbrev.getEnd());
- newEntityOnAcronym.setTextualRepresentation(newEntityOnAcronym.getCoveredText());
- newEntityOnAcronym.setSpecificType(emFullform.getSpecificType());
- newEntityOnAcronym.setComponentId(COMPONENT_ID + "+acronym");
- newEntityOnAcronym.setConfidence(emFullform.getConfidence() + "");
- newEntityOnAcronym.addToIndexes();
-
- } catch (Exception e) {
- LOGGER.error("process() - could not generate output type: " + e.getMessage());
- e.printStackTrace();
- throw new AnalysisEngineProcessException(AnalysisEngineProcessException.ANNOTATOR_EXCEPTION,
- null);
- }
-
- } else {
- if (emAcronym == null)
- LOGGER.debug("annotateAcronymsWithFullFormEntity() - emAcronym != null");
- else if (emAcronym.getClass().getName().equals(emFullform.getClass().getName()))
- LOGGER.debug("annotateAcronymsWithFullFormEntity() - emAcroType="
- + emAcronym.getClass().getCanonicalName() + " == emFullformType="
- + emFullform.getClass().getCanonicalName());
- }
-
- }
- }
- }
+ public static final String CHUNKER_RESOURCE_NAME = "DictionaryChunkerProvider";
+ // public final static String PARAM_USE_APPROXIMATE_MATCHING =
+ // "UseApproximateMatching";
+ public final static String PARAM_CHECK_ACRONYMS = "CheckAcronyms";
+ public final static String PARAM_OUTPUT_TYPE = "OutputType";
+ private static final String COMPONENT_ID = GazetteerAnnotator.class.getCanonicalName();
+ private static final Logger LOGGER = LoggerFactory.getLogger(GazetteerAnnotator.class);
+ /**
+ * Only required to set to false as an annotator parameter when using
+ * approximate matching and the ChunkerProvider is set to CaseSensitive false.
+ * That is because the approximate chunker is always case sensitive.
+ */
+ // public final static String PARAM_CASE_SENSITIVE = "CaseSensitive";
+ private static final String PARAM_USE_MANTRA_MODE = "MantraMode";
+ /**
+ * Parameter to indicate whether text - CAS document text for this class -
+ * should be normalized by completely removing dashes, parenthesis, genitive 's
+ * and perhaps more. This is meant to replace the generation of term variants
+ * and cannot be used together with variation generation. If this is switched on
+ * here, it must also be switched on in the external resource configuration for
+ * the ChunkerProvider! Can only be used with alternative ChunkerProviderImplAlt
+ * implementation.
+ */
+ // public final static String PARAM_NORMALIZE_TEXT = "NormalizeText";
+ // TODO for debug only
+ private static int initializeCount = 0;
+ /**
+ * Parameter to indicate whether text - CAS document text for this class -
+ * should be transliterated, i.e. whether accents and other character variations
+ * should be stripped. If this is switched on here, it must also be switched on
+ * in the external resource configuration for the ChunkerProvider! Can only be
+ * used with alternative ChunkerProviderImplAlt implementation.
+ */
+ // public final static String PARAM_TRANSLITERATE_TEXT =
+ // "TransliterateText";
+
+ @ConfigurationParameter(name = PARAM_USE_MANTRA_MODE, defaultValue = "false")
+ private boolean mantraMode = false;
+ // needs to be true because of chunker injection:
+ @ConfigurationParameter(name = PARAM_CHECK_ACRONYMS, defaultValue = "true")
+ private boolean checkAcronyms = true;
+ @ConfigurationParameter(name = PARAM_OUTPUT_TYPE)
+ private String outputType = null;
+ @ExternalResource(key = CHUNKER_RESOURCE_NAME, mandatory = true)
+ private ChunkerProvider provider;
+ /**
+ * Removes diacritics and does lower casing
+ */
+ private Transliterator transliterator;
+ private Chunker gazetteer = null;
+ private TokenizerFactory normalizationTokenFactory;
+ private Set stopWords;
+
+ static boolean filterParenthesis(String chunkText) {
+ Stack parenthesisStack = new Stack<>();
+ // Map pMap = new HashMap<>();
+ for (int i = 0; i < chunkText.length(); i++) {
+ char current = chunkText.charAt(i);
+ if (isParentheses(current)) {
+ if (isOpenedParentheses(current)) {
+ parenthesisStack.add(current);
+ } else {
+ if (parenthesisStack.isEmpty())
+ return true;
+ if (!isParenthesisCounterpart(parenthesisStack.pop(), current))
+ return true;
+ }
+ }
+ }
+ if (!parenthesisStack.isEmpty())
+ return true;
+ return false;
+ }
+
+ private static boolean isParenthesisCounterpart(Character char1, Character char2) {
+ ParenthesisType char1ParenthesisType = getParenthesisType(char2);
+ ParenthesisType char2ParenthesisType = getParenthesisType(char1);
+ if (char1ParenthesisType == ParenthesisType.NONE || char2ParenthesisType == ParenthesisType.NONE)
+ throw new IllegalArgumentException("The two characters '" + char1 + "' and '" + char2
+ + "' were given in order to determine whether they are compatible parenthesis counterparts, but at least one of those characters is no parentheses.");
+ return char1ParenthesisType.equals(char2ParenthesisType);
+ }
+
+ static ParenthesisType getParenthesisType(char current) {
+ switch (current) {
+ case '(':
+ case ')':
+ return ParenthesisType.ROUND;
+ case '[':
+ case ']':
+ return ParenthesisType.BRACKET;
+ case '{':
+ case '}':
+ return ParenthesisType.CURLY;
+ default:
+ return ParenthesisType.NONE;
+ }
+ }
+
+ static boolean isParentheses(char current) {
+ return isOpenedParentheses(current) || isClosedParentheses(current);
+ }
+
+ static boolean isOpenedParentheses(char current) {
+ switch (current) {
+ case '(':
+ case '[':
+ case '{':
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ static boolean isClosedParentheses(char current) {
+ switch (current) {
+ case ')':
+ case ']':
+ case '}':
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ /**
+ * Groups chunks by span overlap. The chunk list is first sorted in place by
+ * start offset; then each chunk is added to every existing group whose span it
+ * overlaps, and a chunk overlapping no group opens a new one. Note that a
+ * single chunk may thus end up in more than one group.
+ *
+ * @param chunkList   the chunks to group; sorted in place by start offset
+ * @param chunkedText the text the chunk offsets refer to (stored with each group)
+ * @return the list of overlapping-chunk groups
+ */
+ static List groupOverlappingChunks(List chunkList, String chunkedText) {
+ // sort chunkList so the grouping works as intended
+ Collections.sort(chunkList, new Comparator() {
+
+ @Override
+ public int compare(Chunk o1, Chunk o2) {
+ return o1.start() - o2.start();
+ }
+
+ });
+ // group overlapping chunks
+ List overlappingChunks = new ArrayList();
+ for (Chunk chunk : chunkList) {
+ // for debugging
+ // System.out.println("chunking.add(ChunkFactory.createChunk(" +
+ // chunk.start() + ", " + chunk.end() +
+ // ", 0d));");
+ boolean added = false;
+ for (OverlappingChunk over : overlappingChunks) {
+ if (over.isOverlappingSpan(chunk.start(), chunk.end())) {
+ over.addChunk(chunk.start(), chunk.end(), chunk);
+ added = true;
+ }
+ }
+ if (!added) {
+ overlappingChunks.add(new OverlappingChunk(chunk.start(), chunk.end(), chunk, chunkedText));
+ // NOTE(review): this assignment is dead; 'added' is not read again in this iteration
+ added = true;
+ }
+ }
+ return overlappingChunks;
+ }
+
+ /**
+ * UIMA initialization: fetches the chunker resource (dictionary gazetteer,
+ * stop word list, normalization/transliteration settings) and reads the
+ * annotator configuration parameters (acronym check, output type, mantra mode).
+ * NOTE(review): a ResourceAccessException is only logged, which leaves
+ * 'gazetteer' null; process() later rejects that state with an
+ * IllegalStateException.
+ *
+ * @throws ResourceInitializationException if the output type parameter is not set
+ */
+ public void initialize(UimaContext aContext) throws ResourceInitializationException {
+ LOGGER.info("calls to initialize: " + initializeCount);
+
+ super.initialize(aContext);
+ LOGGER.info("initialize() - initializing GazetteerAnnotator...");
+
+ try {
+ provider = (ChunkerProvider) getContext().getResourceObject(CHUNKER_RESOURCE_NAME);
+ gazetteer = provider.getChunker();
+ stopWords = provider.getStopWords();
+// String[] stopwordArray = {"a", "about", "above", "across", "after", "afterwards", "again", "against",
+// "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among",
+// "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything",
+// "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become",
+// "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside",
+// "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot",
+// "cant", "co", "computer", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do",
+// "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere",
+// "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except",
+// "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly",
+// "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has",
+// "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
+// "herself", "high", "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in",
+// "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
+// "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill",
+// "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name",
+// "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
+// "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only",
+// "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own",
+// "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed",
+// "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere",
+// "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes",
+// "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them",
+// "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein",
+// "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though", "three",
+// "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards",
+// "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we",
+// "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas",
+// "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who",
+// "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet",
+// "you", "your", "yours", "yourself", "yourselves",};
+// stopWords = new HashSet<>();
+// for (String sw : stopwordArray)
+// stopWords.add(sw);
+ } catch (ResourceAccessException e) {
+ LOGGER.error("Exception while initializing", e);
+ }
+
+ // check acronyms
+ checkAcronyms = (Boolean) aContext.getConfigParameterValue(PARAM_CHECK_ACRONYMS);
+ LOGGER.info(
+ "Check for acronyms (found dictionary entries that are abbreviations are only accepted if their long form is an abbreviation of the same type, too): {}",
+ checkAcronyms);
+ // filter stop words
+
+ // the tokenizer factory is only needed when normalization is enabled
+ Boolean normalizeBoolean = provider.getNormalize();
+ if (normalizeBoolean) {
+ normalizationTokenFactory = new IndoEuropeanTokenizerFactory();
+ }
+ LOGGER.info("Normalize CAS document text (i.e. do stemming and remove possessive 's): {}", provider.getNormalize());
+
+ Boolean transliterateBoolean = provider.getTransliterate();// (Boolean)
+ // aContext.getConfigParameterValue(PARAM_TRANSLITERATE_TEXT);
+// if (transliterateBoolean) {
+// transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC; Lower");
+ // NOTE(review): the transliterator is created unconditionally (the guard above
+ // is commented out), regardless of the transliterate setting
+ transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC");
+// }
+ LOGGER.info("Transliterate CAS document text (i.e. transform accented characters to their base forms): {}",
+ provider.getTransliterate());
+
+ // define output level
+ outputType = (String) aContext.getConfigParameterValue(PARAM_OUTPUT_TYPE);
+ if (outputType == null) {
+ LOGGER.error("initialize() - output type not specified.");
+ throw new ResourceInitializationException();
+ }
+
+ mantraMode = aContext.getConfigParameterValue(PARAM_USE_MANTRA_MODE) != null
+ ? (Boolean) aContext.getConfigParameterValue(PARAM_USE_MANTRA_MODE)
+ : false;
+ }
+
+ /**
+ * Processes the CAS: optionally normalizes/transliterates the document text,
+ * runs the gazetteer chunker over it and adds the resulting matches as
+ * annotations. There are two subroutines: one for exact matching (every chunk
+ * is added) and one for approximate matching (chunks are filtered, grouped by
+ * overlap, and only the best chunk(s) of each group are added). Finally, if
+ * enabled and not in mantra mode, acronyms are annotated with their full-form
+ * entity type.
+ *
+ * @throws AnalysisEngineProcessException if annotation creation fails downstream
+ * @throws IllegalStateException if the gazetteer resource could not be loaded
+ */
+ public void process(JCas aJCas) throws AnalysisEngineProcessException {
+ if (gazetteer == null)
+ throw new IllegalStateException("The actual gazetteer object is null. Check previous log messages pointing to the error (most probably the dictionary file could not be found).");
+ String docText = aJCas.getDocumentText();
+ if (docText == null || docText.length() == 0)
+ return;
+ // normalization includes transliteration
+ if (provider.getTransliterate() && !provider.getNormalize())
+ docText = transliterator.transform(docText);
+ NormalizedString normalizedDocText = null;
+ if (provider.getNormalize()) {
+ if (provider.getNormalizePlural()) {
+ // collect offsets of plural nouns (NNS POS tags) so plural normalization only applies there
+ OffsetSet pluralOffsets = StreamSupport.stream(Spliterators.spliterator(aJCas.getAnnotationIndex(PennBioIEPOSTag.type).iterator(), 0, 0), false).filter(tag -> tag.getValue().equals("NNS")).map(tag -> Range.between(tag.getBegin(), tag.getEnd())).collect(Collectors.toCollection(OffsetSet::new));
+ normalizedDocText = StringNormalizerForChunking.normalizeString(docText, normalizationTokenFactory, true, pluralOffsets, transliterator);
+ } else {
+ normalizedDocText = StringNormalizerForChunking.normalizeString(docText, normalizationTokenFactory, transliterator);
+ }
+ }
+ // exact matching has a switch for case sensitivity, so we can save the work here
+ if (!provider.getCaseSensitive() && provider.getUseApproximateMatching()) {
+ if (provider.getNormalize())
+ normalizedDocText.string = normalizedDocText.string.toLowerCase();
+ else
+ docText = docText.toLowerCase();
+ }
+
+ // offset-keyed hash indexes for fast ConceptMention/Abbreviation lookup later on
+ IndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator();
+ JCoReHashMapAnnotationIndex conceptMentionIndex = new JCoReHashMapAnnotationIndex<>(
+ longOffsetTermGenerator, longOffsetTermGenerator, aJCas, ConceptMention.type);
+ JCoReHashMapAnnotationIndex abbreviationIndex = new JCoReHashMapAnnotationIndex<>(
+ longOffsetTermGenerator, longOffsetTermGenerator, aJCas, Abbreviation.type);
+
+ LOGGER.debug("Performing actual Gazetteer annotation...");
+ Chunking chunking;
+ if (provider.getNormalize())
+ chunking = gazetteer.chunk(normalizedDocText.string);
+ else
+ chunking = gazetteer.chunk(docText);
+ LOGGER.debug("Gazetteer annotation done.");
+ if (provider.getUseApproximateMatching()) {
+ /*
+ * handle matches found by approx matching: this means especially overlapping
+ * matches with different scores (doesn't happen with exact matches)
+ */
+ List chunkList = filterChunking(chunking);
+ List overlappingChunks = groupOverlappingChunks(chunkList,
+ chunking.charSequence().toString());
+ // now add the best chunk of all overlappingChunks to the CAS
+ LOGGER.debug("all overlapping chunks:\n");
+ // Set bestChunksSet = new HashSet<>();
+ for (OverlappingChunk overlappingChunk : overlappingChunks) {
+ // show chunks
+ LOGGER.debug(overlappingChunk.toStringAll());
+ List bestChunks = overlappingChunk.getBestChunks();
+ LOGGER.debug("Found {} best chunks.", bestChunks.size());
+ for (int i = 0; i < bestChunks.size(); i++) {
+ Chunk bestChunk = bestChunks.get(i);
+ if (LOGGER.isDebugEnabled()) {
+ String chunkText = provider.getNormalize() ? normalizedDocText.string.substring(bestChunk.start(), bestChunk.end()) : aJCas.getDocumentText().substring(bestChunk.start(), bestChunk.end());
+ LOGGER.debug("Nr. " + i + " best chunk: " + bestChunk.start() + " - " + bestChunk.end() + ": "
+ + bestChunk.score() + " ; type: " + bestChunk.type() + " ; text: " + chunkText);
+ }
+ // TODO this check and the corresponding set may be removed
+ // when this exception hasn't been thrown
+ // in a
+ // while. Its currently just to be sure, this should not
+ // happen any more since the chunks are sorted
+ // by
+ // offset in the grouping method.
+ // if (bestChunksSet.contains(bestChunk)) {
+ // throw new IllegalStateException("Duplicate best chunk: " + bestChunk);
+ // }
+ // bestChunksSet.add(bestChunk);
+ // add 2 cas
+ add2Cas(aJCas, bestChunk, normalizedDocText, conceptMentionIndex, abbreviationIndex);
+ }
+ }
+ // for (Chunk chunk : chunking.chunkSet()) {
+ // add2Cas(aJCas, chunk, normalizedDocText);
+ // }
+ } else {
+ // exact matching: every chunk is taken over as-is
+ for (Chunk chunk : chunking.chunkSet()) {
+ add2Cas(aJCas, chunk, normalizedDocText, conceptMentionIndex, abbreviationIndex);
+ }
+ }
+ if (checkAcronyms && !mantraMode) {
+ LOGGER.debug("process() - checking acronyms");
+ annotateAcronymsWithFullFormEntity(aJCas, conceptMentionIndex);
+ }
+ }
+
+ /**
+ * Filters the raw chunking result: chunks with unbalanced parentheses, chunks
+ * starting/ending with a dash, and chunks dominated by stop words are dropped.
+ *
+ * @return the surviving chunks as a list
+ */
+ private List filterChunking(Chunking chunking) {
+ // ChunkingImpl newChunking = new ChunkingImpl(chunking.charSequence());
+ List newChunking = new ArrayList<>(chunking.chunkSet().size());
+ for (Chunk chunk : chunking.chunkSet()) {
+ String chunkText = chunking.charSequence().subSequence(chunk.start(), chunk.end()).toString();
+ if (filterParenthesis(chunkText))
+ continue;
+ if (filterPunctuationArtifacts(chunkText))
+ continue;
+ if (filterStopwords(chunkText))
+ continue;
+ newChunking.add(chunk);
+ }
+ return newChunking;
+ }
+
+ /** Returns true (filter out) if the chunk text starts or ends with a dash. */
+ private boolean filterPunctuationArtifacts(String chunkText) {
+ if (chunkText.startsWith("-"))
+ return true;
+ if (chunkText.endsWith("-"))
+ return true;
+ return false;
+ }
+
+ /**
+ * Returns true (filter out) if the chunk text is itself a stop word or if,
+ * for a multi-word chunk, at least half of its space-separated words
+ * (rounded up) are stop words. Comparison is case-insensitive.
+ */
+ private boolean filterStopwords(String chunkText) {
+ if (stopWords.contains(chunkText.toLowerCase()))
+ return true;
+ if (chunkText.contains(" ")) {
+ String[] words = chunkText.split(" ");
+ int stopWordCounter = 0;
+ for (String word : words) {
+ if (stopWords.contains(word.toLowerCase()))
+ stopWordCounter++;
+ }
+ if (Math.ceil(words.length / 2.0) <= stopWordCounter) {
+ LOGGER.debug("Filtering due to high stop word occurrences: {}", chunkText);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Checks whether a chunk (= dictionary match) is an acronym. If yes, checks
+ * whether the respective full form (obtained via the abbreviation's
+ * textReference) is a ConceptMention of the same type as the configured
+ * output type. Returns true when the chunk is NOT an abbreviation, or when it
+ * is an abbreviation whose full form is a ConceptMention with the output
+ * type; returns false otherwise (in which case no entity annotation is made).
+ * When normalization is enabled, the chunk offsets are first mapped back to
+ * the original document text.
+ * NOTE(review): diagnostic output on offset-mapping failure goes to
+ * System.out instead of the logger.
+ *
+ * @param abbreviationIndex   offset index over Abbreviation annotations
+ * @param conceptMentionIndex offset index over ConceptMention annotations
+ */
+ private boolean isAcronymWithSameFullFormSpecificType(JCas aJCas, Chunk chunk, NormalizedString normalizedDocText,
+ JCoReHashMapAnnotationIndex conceptMentionIndex,
+ JCoReHashMapAnnotationIndex abbreviationIndex) {
+ // Annotation anno;
+ int start;
+ int end;
+ if (provider.getNormalize()) {
+ try {
+ start = normalizedDocText.getOriginalOffset(chunk.start());
+ end = normalizedDocText.getOriginalOffset(chunk.end());
+ } catch (Exception e) {
+ System.out.println("Text: " + normalizedDocText);
+ System.out.println("Chunk: " + chunk);
+ System.out.println("Chunk end: " + chunk.end());
+ System.out
+ .println("Normalized Text: " + normalizedDocText.string.substring(chunk.start(), chunk.end()));
+ throw e;
+ }
+ // anno = new Annotation(aJCas, start, end);
+ } else {
+ start = chunk.start();
+ end = chunk.end();
+ }
+
+ LongOffsetIndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator();
+ // Retrieves potential abbr annotation
+ Abbreviation abbr = abbreviationIndex.getFirst(longOffsetTermGenerator.forOffsets(start, end));
+ // check whether it's an abbr
+ String chunktext = null;
+ if (LOGGER.isDebugEnabled())
+ chunktext = aJCas.getDocumentText().substring(start, end);
+ if (abbr == null) {
+ LOGGER.debug("{} chunk \"{}\" is not an abbreviation\n", chunk, chunktext);
+ return true;
+ }
+ // checks whether respective full form is ConceptMention
+ AbbreviationLongform textRef = abbr.getTextReference();
+ ConceptMention em = conceptMentionIndex.getFirst(textRef);
+ if (em == null) {
+ LOGGER.debug(
+ chunk + " chunk \"{}\" is an abbreviation but respective full \"{}\" form is no ConceptMention\n",
+ chunktext, textRef.getCoveredText());
+ return false;
+ }
+
+ // checks whether full form annotation matches the type to be annotated
+ // here
+ String emType = em.getClass().getCanonicalName();
+ if (emType.equals(outputType)) {
+ LOGGER.debug(chunk
+ + " chunk \"{}\" is an abbreviation and respective full form \"{}\" is ConceptMention with same type as OutputType\n",
+ chunktext, em.getCoveredText());
+ return true;
+ }
+
+ LOGGER.debug(chunk
+ + " chunk \"{}\" is an abbreviation but respective full form \"{}\" is ConceptMention without the correct OutputType (is: {}; OutputType: {})\n",
+ new Object[]{chunktext, em.getCoveredText(), emType, outputType});
+ return false;
+ }
+
+ // ------------ INFO ..........
+ // String text = aJCas.getDocumentText();
+ // int start = chunk.start();
+ // int end = chunk.end();
+ // String type = chunk.type();
+ // double score = chunk.score();
+ // String phrase = text.substring(start, end);
+ // System.out.println(" found phrase=|" + phrase + "|"
+ // + " start=" + start + " end=" + end + " type=" + type
+ // + " score=" + score);
+ // ------------ INFO ..........
+
+ /**
+ * Adds a chunk as an annotation to the CAS. If acronym checking is enabled,
+ * abbreviation chunks whose full form does not carry the output type are
+ * skipped. In mantra mode the chunk type string encodes source, CUI, semantic
+ * type and group (separated by "@@", multiple terms by "@@TERM@@") and one
+ * mantra Entity is created per term; otherwise a single ConceptMention of the
+ * configured output type is created and added to the offset index.
+ *
+ * @param normalizedDocText   normalized text for mapping offsets back, or null if normalization is off
+ * @param conceptMentionIndex offset index that newly created ConceptMentions are added to
+ * @param abbreviationIndex   offset index over Abbreviation annotations (for the acronym check)
+ * @throws AnalysisEngineProcessException if the output annotation cannot be instantiated
+ */
+ private void add2Cas(JCas aJCas, Chunk chunk, NormalizedString normalizedDocText,
+ JCoReHashMapAnnotationIndex conceptMentionIndex,
+ JCoReHashMapAnnotationIndex abbreviationIndex) throws AnalysisEngineProcessException {
+ // System.out.println("CHUNK: start=" + chunk.start() + " end=" +
+ // chunk.end());
+ // if checkAcronyms, then check acronyms for compliant full forms (=
+ // with same specificType)
+ if (checkAcronyms && !isAcronymWithSameFullFormSpecificType(aJCas, chunk, normalizedDocText,
+ conceptMentionIndex, abbreviationIndex)) {
+ return;
+ }
+
+ // The Math.min(, Math.max(0, )) application is a security measure. In rare cases there are issues with multi
+ // byte character encodings. This security measure won't correct the underlying error but avoid errors
+ // due to invalid offsets.
+ int start = Math.min(aJCas.getDocumentText().length(), Math.max(0, provider.getNormalize() ? normalizedDocText.getOriginalOffset(chunk.start()) : chunk.start()));
+ int end = Math.min(aJCas.getDocumentText().length(), Math.max(0, provider.getNormalize() ? normalizedDocText.getOriginalOffset(chunk.end()) : chunk.end()));
+
+ try {
+ if (mantraMode) {
+ // the "type" string is used to transport all data needed for
+ // the MAN-XML format
+ for (String term : chunk.type().split("@@TERM@@")) {
+ // @@ is used to separate source, cui, type(s) and group (in
+ // this order!)
+ String[] info = term.split("@@");
+ Entity newEntity = (Entity) JCoReAnnotationTools.getAnnotationByClassName(aJCas,
+ "de.julielab.jcore.types.mantra.Entity");
+ newEntity.setBegin(start);
+ newEntity.setEnd(end);
+ newEntity.setComponentId(COMPONENT_ID);
+ newEntity.setConfidence(chunk.score() + "");
+
+ // mantra specific
+ newEntity.setSource(info[0]);
+ newEntity.setCui(info[1]);
+ newEntity.setSemanticType(info[2]);
+ newEntity.setSemanticGroup(info[3]);
+
+ newEntity.addToIndexes();
+ }
+ } else {
+ ConceptMention newEntity = (ConceptMention) JCoReAnnotationTools.getAnnotationByClassName(aJCas,
+ outputType);
+ newEntity.setBegin(start);
+ newEntity.setEnd(end);
+
+ // String entityText = newEntity.getCoveredText();
+ // if (stopWords.contains(entityText.toLowerCase()))
+ // return;
+ // if (entityText.contains(" ")) {
+ // String[] words = entityText.split(" ");
+ // int stopWordCounter = 0;
+ // for (String word : words) {
+ // if (stopWords.contains(word.toLowerCase()))
+ // stopWordCounter++;
+ // }
+ // if (words.length == stopWordCounter)
+ // return;
+ // }
+
+ newEntity.setSpecificType(chunk.type());
+ newEntity.setComponentId(COMPONENT_ID);
+ newEntity.setConfidence(chunk.score() + "");
+ newEntity.addToIndexes();
+
+ conceptMentionIndex.index(newEntity);
+ }
+ } catch (Exception e) {
+ LOGGER.error("process() - could not generate output type: " + e.getMessage());
+ e.printStackTrace();
+ throw new AnalysisEngineProcessException(e);
+ }
+ }
+
+ /**
+ * Iterates over all Abbreviation annotations and, for each abbreviation whose
+ * full form carries a ConceptMention of the configured output type (created by
+ * this component) while the acronym itself does not, creates a matching
+ * ConceptMention on the acronym span, copying specific type and confidence
+ * from the full-form mention.
+ *
+ * @param conceptMentionIndex offset index over ConceptMention annotations
+ * @throws AnalysisEngineProcessException if the output annotation cannot be instantiated
+ */
+ private void annotateAcronymsWithFullFormEntity(JCas aJCas,
+ JCoReHashMapAnnotationIndex conceptMentionIndex)
+ throws AnalysisEngineProcessException {
+
+ JFSIndexRepository indexes = aJCas.getJFSIndexRepository();
+ FSIterator abbrevIter = indexes.getAnnotationIndex(Abbreviation.type).iterator();
+ IndexTermGenerator longOffsetTermGenerator = TermGenerators.longOffsetTermGenerator();
+
+ // loop over all abbreviations
+ while (abbrevIter.hasNext()) {
+ Abbreviation abbrev = (Abbreviation) abbrevIter.next();
+ AbbreviationLongform fullFormAnnotation = abbrev.getTextReference();
+ LOGGER.debug("annotateAcronymsWithFullFormEntity() - checking abbreviation: " + abbrev.getCoveredText());
+ ConceptMention emFullform = null;// AnnotationRetrieval.getMatchingAnnotation(aJCas, fullFormAnnotation,
+ // ConceptMention.class);
+ emFullform = conceptMentionIndex.getFirst(fullFormAnnotation);
+
+ // The following code was once introduced for gene tagging. There,
+ // the acronym fullforms sometimes miss minor parts of an annotated
+ // gene, leading to non-annotated acronyms that would have been
+ // correct.
+ // However, for general-purpose concept recognition this approach
+ // can be quite harmful. Example: "Anaphase-promoting complex (APC)"
+ // where only "anaphase" is recognized as concept. Now, "APC" would
+ // be annotated as an acronym for "anaphase". Here, a better
+ // recognition of the abbreviation span is required.
+ // ConceptMention emFullform = null;
+ // List conceptsInFullform =
+ // JCoReAnnotationTools.getIncludedAnnotations(aJCas,
+ // fullFormAnnotation,
+ // ConceptMention.class);
+ // if (conceptsInFullform.size() == 1) {
+ // emFullform = conceptsInFullform.get(0);
+ // LOGGER.debug("Found a single ConceptMention included in the full
+ // form: {}", emFullform.getCoveredText());
+ // } else if (conceptsInFullform.size() > 1) {
+ // // If there are multiple ConceptMentions found in the full form,
+ // take that largest right-most candidate.
+ // int maxSize = -1;
+ // for (ConceptMention em : conceptsInFullform) {
+ // int emSize = em.getEnd() - em.getBegin();
+ // if (emSize > maxSize) {
+ // emFullform = em;
+ // maxSize = emSize;
+ // }
+ // }
+ // LOGGER.debug("Found multiple ConceptMentions included in the full
+ // form \"{}\", returning the longest.",
+ // fullFormAnnotation.getCoveredText());
+ // if (LOGGER.isTraceEnabled()) {
+ // LOGGER.trace("All found ConceptMentions:");
+ // for (ConceptMention cm : conceptsInFullform) {
+ // LOGGER.trace("Text: {}; offsets: {}-{}",
+ // new Object[] { cm.getCoveredText(), cm.getBegin(), cm.getEnd()
+ // });
+ // }
+ // }
+ // } else {
+ // LOGGER.debug("No ConceptMention in the span of acronym fullform
+ // \"{}\" found.",
+ // fullFormAnnotation.getCoveredText());
+ // }
+
+ String type = null;
+ if (emFullform != null)
+ type = emFullform.getClass().getCanonicalName();
+
+ ConceptMention emAcronym = null;// AnnotationRetrieval.getMatchingAnnotation(aJCas, abbrev,
+ // ConceptMention.class);
+ emAcronym = conceptMentionIndex.getFirst(abbrev);
+ // This is really slow, really a pain with full texts.
+ // It was originally introduced to push recall for gene recognition.
+ // So now we will lose (a bit) of recognition performance there.
+ // ConceptMention emAcronym =
+ // JCoReAnnotationTools.getPartiallyOverlappingAnnotation(aJCas,
+ // abbrev,
+ // ConceptMention.class);
+
+ // if type of the entity is equal to the output type for this
+ // annotator
+ if (type != null && type.equals(outputType)) {
+ // NOTE(review): this null check is unreachable — type != null above
+ // already implies emFullform != null
+ if (emFullform == null) {
+ LOGGER.debug(
+ "annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no ConceptMention\n");
+ continue;
+ }
+ if (emFullform.getComponentId() != null && emFullform.getComponentId().equals(COMPONENT_ID)
+ && (emAcronym == null
+ || !emAcronym.getClass().getName().equals(emFullform.getClass().getName()))) {
+
+ try {
+ LOGGER.debug("annotateAcronymsWithFullFormEntity() - fullform of abbreviation ("
+ + abbrev.getCoveredText() + " [begin=" + abbrev.getBegin() + "; end=" + abbrev.getEnd()
+ + "]) has ConceptMention: " + emFullform.toString());
+ ConceptMention newEntityOnAcronym = (ConceptMention) JCoReAnnotationTools
+ .getAnnotationByClassName(aJCas, outputType);
+ newEntityOnAcronym.setBegin(abbrev.getBegin());
+ newEntityOnAcronym.setEnd(abbrev.getEnd());
+ newEntityOnAcronym.setTextualRepresentation(newEntityOnAcronym.getCoveredText());
+ newEntityOnAcronym.setSpecificType(emFullform.getSpecificType());
+ newEntityOnAcronym.setComponentId(COMPONENT_ID + "+acronym");
+ newEntityOnAcronym.setConfidence(emFullform.getConfidence() + "");
+ newEntityOnAcronym.addToIndexes();
+
+ } catch (Exception e) {
+ LOGGER.error("process() - could not generate output type: " + e.getMessage());
+ e.printStackTrace();
+ throw new AnalysisEngineProcessException(AnalysisEngineProcessException.ANNOTATOR_EXCEPTION,
+ null);
+ }
+
+ } else {
+ // NOTE(review): this message is inverted — it logs "emAcronym != null"
+ // in the branch where emAcronym IS null
+ if (emAcronym == null)
+ LOGGER.debug("annotateAcronymsWithFullFormEntity() - emAcronym != null");
+ else if (emAcronym.getClass().getName().equals(emFullform.getClass().getName()))
+ LOGGER.debug("annotateAcronymsWithFullFormEntity() - emAcroType="
+ + emAcronym.getClass().getCanonicalName() + " == emFullformType="
+ + emFullform.getClass().getCanonicalName());
+ }
+
+ }
+ }
+ }
+
+ /** Kinds of parentheses recognized by the balance check; NONE marks a non-parenthesis character. */
+ enum ParenthesisType {
+ ROUND, BRACKET, CURLY, NONE
+ }
}
diff --git a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java
index 2cffe9bde..3172f5601 100644
--- a/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java
+++ b/jcore-lingpipegazetteer-ae/src/main/java/de/julielab/jcore/ae/lingpipegazetteer/utils/StringNormalizerForChunking.java
@@ -1,213 +1,252 @@
-
package de.julielab.jcore.ae.lingpipegazetteer.utils;
+import com.aliasi.tokenizer.PorterStemmerTokenizerFactory;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;
import com.ibm.icu.text.Transliterator;
+import de.julielab.java.utilities.spanutils.OffsetSet;
+import org.apache.commons.lang3.Range;
import java.util.*;
public class StringNormalizerForChunking {
- public enum Mode {
- /**
- * Punctuation characters are deleted completely, shrinking the string.
- */
- DELETE,
- /** Punctuation characters are replaced by white spaces. */
- REPLACE
- }
-
- private static Set charsToDelete = new HashSet<>();
- static {
- charsToDelete.add('-');
- charsToDelete.add('+');
- charsToDelete.add(',');
- charsToDelete.add('.');
- charsToDelete.add(':');
- charsToDelete.add(';');
- charsToDelete.add('?');
- charsToDelete.add('!');
- charsToDelete.add('*');
- charsToDelete.add('§');
- charsToDelete.add('$');
- charsToDelete.add('%');
- charsToDelete.add('&');
- charsToDelete.add('/');
- charsToDelete.add('\\');
- charsToDelete.add('(');
- charsToDelete.add(')');
- charsToDelete.add('<');
- charsToDelete.add('>');
- charsToDelete.add('[');
- charsToDelete.add(']');
- charsToDelete.add('=');
- charsToDelete.add('\'');
- charsToDelete.add('`');
- charsToDelete.add('´');
- charsToDelete.add('"');
- charsToDelete.add('#');
- }
-
- public static class NormalizedString {
- public String string;
- private Map offsetMap = new HashMap<>();
-
- public Map getOffsetMap() {
- return offsetMap;
- }
-
- private TreeSet normalizedOffsetSet;
-
- public Integer getOriginalOffset(int normalizedOffset) {
- Integer originalOffset = offsetMap.get(normalizedOffset);
- if (originalOffset == null) {
- originalOffset = deriveOriginalOffset(normalizedOffset);
- offsetMap.put(normalizedOffset, originalOffset);
- }
- return originalOffset;
- }
-
- private Integer deriveOriginalOffset(int normalizedOffset) {
- if (normalizedOffsetSet == null)
- normalizedOffsetSet = new TreeSet<>(offsetMap.keySet());
- Integer previousNormalizedOffset = normalizedOffsetSet.floor(normalizedOffset);
- Integer originalPreviousOffset = offsetMap.get(previousNormalizedOffset);
- int offsetShift = Math.abs(originalPreviousOffset - previousNormalizedOffset);
- // Typically, the normalized string will be shorter than the
- // original, thus the original offset would be larger.
- if (originalPreviousOffset > previousNormalizedOffset)
- return normalizedOffset + offsetShift;
- // But if, for some reason, the normalized string is longer than the
- // original, we would have to subtract the difference from the
- // normalized offset.
- return normalizedOffset - offsetShift;
- }
- }
-
- /**
- * This method was meant for text normalization by just deleting punctuation
- * characters. However, the approach turned out to be suboptimal in cases
- * where a dictionary entry would be "SHP-1" and the text form would be "SHP
- * 1". That is, when in the text there is just a whitespace where there is a
- * punctuation character in the dictionary, we won't recognize the
- * dictionary entry. Thus, a different normalization was developed, namely
- * in the other normalization method. It is supposed to be used together
- * with an approximate chunker.
- *
- * @param str
- * @return
- */
- public static NormalizedString normalizeString(String str) {
- NormalizedString ns = new NormalizedString();
- StringBuilder sb = new StringBuilder();
- int deletedChars = 0;
-
- for (int i = 0; i < str.length(); i++) {
- char c = str.charAt(i);
- if (charsToDelete.contains(c)) {
- deletedChars++;
- // switch (mode) {
- // case REPLACE: sb.append(" "); break;
- // case DELETE: deletedChars++; break;
- // }
- } else {
- sb.append(c);
- }
- int newOffset = Math.max(0, i - deletedChars);
- if (null == ns.offsetMap.get(newOffset))
- ns.offsetMap.put(newOffset, i);
- }
- ns.string = sb.toString();
- return ns;
- }
-
- /**
- * This normalization method uses a given TokenizerFactory (could also be a
- * PorterStemmerTokenizerFactory for stemming) and additionally removes
- * possessive 's constructions. Dashes and other punctuation is left
- * untouched. By using an approximate chunker, one can also handle
- * punctuation.
- *
- * @param str
- * @param tokenizerFactory
- * @return
- */
- public static NormalizedString normalizeString(String str, TokenizerFactory tokenizerFactory,
- Transliterator transliterator) {
- // boolean stemming = tokenizerFactory instanceof
- // PorterStemmerTokenizerFactory;
-
- NormalizedString ns = new NormalizedString();
-
- char[] strChars = str.toCharArray();
- Tokenizer tokenizer = tokenizerFactory.tokenizer(strChars, 0, strChars.length);
- StringBuilder sb = new StringBuilder();
- ArrayDeque tokenS = new ArrayDeque<>();
- Map